Total coverage: 15608 (2%)of 1195164
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 /* * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/in.h> #include <linux/in6.h> /* For in6_dev_get/in6_dev_put */ #include <net/addrconf.h> #include <net/bonding.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> static struct workqueue_struct *gid_cache_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct update_gid_event_work { struct work_struct work; union ib_gid gid; struct ib_gid_attr gid_attr; enum gid_op_type gid_op; }; #define ROCE_NETDEV_CALLBACK_SZ 3 struct netdev_event_work_cmd { roce_netdev_callback cb; roce_netdev_filter filter; struct net_device *ndev; struct net_device *filter_ndev; }; struct netdev_event_work { struct work_struct work; struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; static const struct { bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); for (i = 0; i < IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr->gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, gid_attr); break; } } } } enum bonding_slave_state { BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, /* No primary slave or the device isn't a slave in bonding */ BONDING_SLAVE_STATE_NA = 1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When rdma netdevice is used in bonding, bonding master netdevice * should be considered for default GIDs. Therefore, ignore slave rdma * netdevices when bonding is considered. * Additionally when event(cookie) netdevice is bond master device, * make sure that it the upper netdevice of rdma netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev * is bond master device and rdma_ndev is its lower netdevice. It might * not have been established as slave device yet. */ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs does not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); /** * rdma_roce_rescan_port - Rescan all of the network devices in the system * and add their gids if relevant to the port of the RoCE device. * * @ib_dev: IB device * @port: Port number */ void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) { struct net_device *ndev = NULL; if (rdma_protocol_roce(ib_dev, port)) { ndev = ib_device_get_netdev(ib_dev, port); if (!ndev) return; enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); dev_put(ndev); } } EXPORT_SYMBOL(rdma_roce_rescan_port); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } EXPORT_SYMBOL(roce_del_all_netdev_gids); static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. */ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. */ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, &work->work); return NOTIFY_DONE; } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in in; struct net_device *ndev; struct in_ifaddr *ifa = ptr; in.sin_family = AF_INET; in.sin_addr.s_addr = ifa->ifa_address; ndev = ifa->ifa_dev->dev; return addr_event(this, event, (struct sockaddr *)&in, ndev); } static int inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in6 in6; struct net_device *ndev; struct inet6_ifaddr *ifa6 = ptr; in6.sin6_family = AF_INET6; in6.sin6_addr = ifa6->addr; ndev = ifa6->idev->dev; return addr_event(this, event, (struct sockaddr *)&in6, ndev); } static struct notifier_block nb_netdevice = { .notifier_call = netdevice_event }; static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static struct notifier_block nb_inet6addr = { .notifier_call = inet6addr_event }; int __init roce_gid_mgmt_init(void) { gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); if (!gid_cache_wq) return -ENOMEM; register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); /* We relay on the netdevice notifier to enumerate all * existing devices in the system. Register to this notifier * last to make sure we will not miss any IP add/del * callbacks. */ register_netdevice_notifier(&nb_netdevice); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (IS_ENABLED(CONFIG_IPV6)) unregister_inet6addr_notifier(&nb_inet6addr); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_netdevice); /* Ensure all gid deletion tasks complete before we go down, * to avoid any reference to free'd memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ destroy_workqueue(gid_cache_wq); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_CLOCK_H #define _LINUX_SCHED_CLOCK_H #include <linux/smp.h> /* * Do not use outside of architecture code which knows its limitations. * * sched_clock() has no promise of monotonicity or bounded drift between * CPUs, use (which you should not) requires disabling IRQs. * * Please use one of the three interfaces below. */ extern u64 sched_clock(void); #if defined(CONFIG_ARCH_WANTS_NO_INSTR) || defined(CONFIG_GENERIC_SCHED_CLOCK) extern u64 sched_clock_noinstr(void); #else static __always_inline u64 sched_clock_noinstr(void) { return sched_clock(); } #endif /* * See the comment in kernel/sched/clock.c */ extern u64 running_clock(void); extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline void sched_clock_tick(void) { } static inline void clear_sched_clock_stable(void) { } static inline void sched_clock_idle_sleep_event(void) { } static inline void sched_clock_idle_wakeup_event(void) { } static inline u64 cpu_clock(int cpu) { return sched_clock(); } static __always_inline u64 local_clock_noinstr(void) { return sched_clock_noinstr(); } static __always_inline u64 local_clock(void) { return sched_clock(); } #else extern int sched_clock_stable(void); extern void clear_sched_clock_stable(void); /* * When sched_clock_stable(), __sched_clock_offset provides the offset * between local_clock() and sched_clock(). */ extern u64 __sched_clock_offset; extern void sched_clock_tick(void); extern void sched_clock_tick_stable(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(void); /* * As outlined in clock.c, provides a fast, high resolution, nanosecond * time source that is monotonic per cpu argument and has bounded drift * between cpus. * * ######################### BIG FAT WARNING ########################## * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # * # go backwards !! # * #################################################################### */ static inline u64 cpu_clock(int cpu) { return sched_clock_cpu(cpu); } extern u64 local_clock_noinstr(void); extern u64 local_clock(void); #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * An i/f to runtime opt-in for irq time accounting based off of sched_clock. * The reason for this explicit opt-in is not to have perf penalty with * slow sched_clocks. */ extern void enable_sched_clock_irqtime(void); extern void disable_sched_clock_irqtime(void); #else static inline void enable_sched_clock_irqtime(void) {} static inline void disable_sched_clock_irqtime(void) {} #endif #endif /* _LINUX_SCHED_CLOCK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _DELAYED_CALL_H #define _DELAYED_CALL_H /* * Poor man's closures; I wish we could've done them sanely polymorphic, * but... */ struct delayed_call { void (*fn)(void *); void *arg; }; #define DEFINE_DELAYED_CALL(name) struct delayed_call name = {NULL, NULL} /* I really wish we had closures with sane typechecking... */ static inline void set_delayed_call(struct delayed_call *call, void (*fn)(void *), void *arg) { call->fn = fn; call->arg = arg; } static inline void do_delayed_call(struct delayed_call *call) { if (call->fn) call->fn(call->arg); } static inline void clear_delayed_call(struct delayed_call *call) { call->fn = NULL; } #endif
54 54 151 1 150 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/pgalloc.h * * Copyright (C) 2000-2001 Russell King * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_PGALLOC_H #define __ASM_PGALLOC_H #include <asm/pgtable-hwdef.h> #include <asm/processor.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #define __HAVE_ARCH_PGD_FREE #define __HAVE_ARCH_PUD_FREE #include <asm-generic/pgalloc.h> #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #if CONFIG_PGTABLE_LEVELS > 2 static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot) { set_pud(pudp, __pud(__phys_to_pud_val(pmdp) | prot)); } static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp) { pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_AF; pudval |= (mm == &init_mm) ? PUD_TABLE_UXN : PUD_TABLE_PXN; __pud_populate(pudp, __pa(pmdp), pudval); } #else static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot) { BUILD_BUG(); } #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) { if (pgtable_l4_enabled()) set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot)); } static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) { p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_AF; p4dval |= (mm == &init_mm) ? P4D_TABLE_UXN : P4D_TABLE_PXN; __p4d_populate(p4dp, __pa(pudp), p4dval); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) { if (!pgtable_l4_enabled()) return; __pud_free(mm, pud); } #else static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) { BUILD_BUG(); } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #if CONFIG_PGTABLE_LEVELS > 4 static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot) { if (pgtable_l5_enabled()) set_pgd(pgdp, __pgd(__phys_to_pgd_val(p4dp) | prot)); } static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp) { pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_AF; pgdval |= (mm == &init_mm) ? PGD_TABLE_UXN : PGD_TABLE_PXN; __pgd_populate(pgdp, __pa(p4dp), pgdval); } #else static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot) { BUILD_BUG(); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgdp); static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t ptep, pmdval_t prot) { set_pmd(pmdp, __pmd(__phys_to_pmd_val(ptep) | prot)); } /* * Populate the pmdp entry with a pointer to the pte. This pmd is part * of the mm address space. */ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) { VM_BUG_ON(mm && mm != &init_mm); __pmd_populate(pmdp, __pa(ptep), PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_UXN); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) { VM_BUG_ON(mm == &init_mm); __pmd_populate(pmdp, page_to_phys(ptep), PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_PXN); } #endif
172 37 162 6 3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/spinlock.h> #include <linux/atomic.h> /* * This is an implementation of the notion of "decrement a * reference count, and return locked if it decremented to zero". * * NOTE NOTE NOTE! This is _not_ equivalent to * * if (atomic_dec_and_test(&atomic)) { * spin_lock(&lock); * return 1; * } * return 0; * * because the spin-lock and the decrement must be * "atomic". */ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; spin_unlock(lock); return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock); int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, unsigned long *flags) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ spin_lock_irqsave(lock, *flags); if (atomic_dec_and_test(atomic)) return 1; spin_unlock_irqrestore(lock, *flags); return 0; } EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave); int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ raw_spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; raw_spin_unlock(lock); return 0; } EXPORT_SYMBOL(_atomic_dec_and_raw_lock); int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, unsigned long *flags) { /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ if (atomic_add_unless(atomic, -1, 1)) return 0; /* Otherwise do it the slow way */ raw_spin_lock_irqsave(lock, *flags); if (atomic_dec_and_test(atomic)) return 1; raw_spin_unlock_irqrestore(lock, *flags); return 0; } EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave);
273 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 // SPDX-License-Identifier: GPL-2.0-only #include <linux/interval_tree.h> #include <linux/interval_tree_generic.h> #include <linux/compiler.h> #include <linux/export.h> #define START(node) ((node)->start) #define LAST(node) ((node)->last) INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, unsigned long, __subtree_last, START, LAST,, interval_tree) EXPORT_SYMBOL_GPL(interval_tree_insert); EXPORT_SYMBOL_GPL(interval_tree_remove); EXPORT_SYMBOL_GPL(interval_tree_iter_first); EXPORT_SYMBOL_GPL(interval_tree_iter_next); #ifdef CONFIG_INTERVAL_TREE_SPAN_ITER /* * Roll nodes[1] into nodes[0] by advancing nodes[1] to the end of a contiguous * span of nodes. This makes nodes[0]->last the end of that contiguous used span * of indexes that started at the original nodes[1]->start. * * If there is an interior hole, nodes[1] is now the first node starting the * next used span. A hole span is between nodes[0]->last and nodes[1]->start. * * If there is a tailing hole, nodes[1] is now NULL. A hole span is between * nodes[0]->last and last_index. * * If the contiguous used range span to last_index, nodes[1] is set to NULL. */ static void interval_tree_span_iter_next_gap(struct interval_tree_span_iter *state) { struct interval_tree_node *cur = state->nodes[1]; state->nodes[0] = cur; do { if (cur->last > state->nodes[0]->last) state->nodes[0] = cur; cur = interval_tree_iter_next(cur, state->first_index, state->last_index); } while (cur && (state->nodes[0]->last >= cur->start || state->nodes[0]->last + 1 == cur->start)); state->nodes[1] = cur; } void interval_tree_span_iter_first(struct interval_tree_span_iter *iter, struct rb_root_cached *itree, unsigned long first_index, unsigned long last_index) { iter->first_index = first_index; iter->last_index = last_index; iter->nodes[0] = NULL; iter->nodes[1] = interval_tree_iter_first(itree, first_index, last_index); if (!iter->nodes[1]) { /* No nodes intersect the span, whole span is hole */ iter->start_hole = first_index; iter->last_hole = last_index; iter->is_hole = 1; return; } if (iter->nodes[1]->start > first_index) { /* Leading hole on first iteration */ iter->start_hole = first_index; iter->last_hole = iter->nodes[1]->start - 1; iter->is_hole = 1; interval_tree_span_iter_next_gap(iter); return; } /* Starting inside a used */ iter->start_used = first_index; iter->is_hole = 0; interval_tree_span_iter_next_gap(iter); iter->last_used = iter->nodes[0]->last; if (iter->last_used >= last_index) { iter->last_used = last_index; iter->nodes[0] = NULL; iter->nodes[1] = NULL; } } EXPORT_SYMBOL_GPL(interval_tree_span_iter_first); void interval_tree_span_iter_next(struct interval_tree_span_iter *iter) { if (!iter->nodes[0] && !iter->nodes[1]) { iter->is_hole = -1; return; } if (iter->is_hole) { iter->start_used = iter->last_hole + 1; iter->last_used = iter->nodes[0]->last; if (iter->last_used >= iter->last_index) { iter->last_used = iter->last_index; iter->nodes[0] = NULL; iter->nodes[1] = NULL; } iter->is_hole = 0; return; } if (!iter->nodes[1]) { /* Trailing hole */ iter->start_hole = iter->nodes[0]->last + 1; iter->last_hole = iter->last_index; iter->nodes[0] = NULL; iter->is_hole = 1; return; } /* must have both nodes[0] and [1], interior hole */ iter->start_hole = iter->nodes[0]->last + 1; iter->last_hole = iter->nodes[1]->start - 1; iter->is_hole = 1; interval_tree_span_iter_next_gap(iter); } EXPORT_SYMBOL_GPL(interval_tree_span_iter_next); /* * Advance the iterator index to a specific position. The returned used/hole is * updated to start at new_index. This is faster than calling * interval_tree_span_iter_first() as it can avoid full searches in several * cases where the iterator is already set. */ void interval_tree_span_iter_advance(struct interval_tree_span_iter *iter, struct rb_root_cached *itree, unsigned long new_index) { if (iter->is_hole == -1) return; iter->first_index = new_index; if (new_index > iter->last_index) { iter->is_hole = -1; return; } /* Rely on the union aliasing hole/used */ if (iter->start_hole <= new_index && new_index <= iter->last_hole) { iter->start_hole = new_index; return; } if (new_index == iter->last_hole + 1) interval_tree_span_iter_next(iter); else interval_tree_span_iter_first(iter, itree, new_index, iter->last_index); } EXPORT_SYMBOL_GPL(interval_tree_span_iter_advance); #endif
40 5 35 35 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2023 ARM Ltd. */ #include <linux/mm.h> #include <linux/efi.h> #include <linux/export.h> #include <asm/tlbflush.h> static inline bool mm_is_user(struct mm_struct *mm) { /* * Don't attempt to apply the contig bit to kernel mappings, because * dynamically adding/removing the contig bit can cause page faults. * These racing faults are ok for user space, since they get serialized * on the PTL. But kernel mappings can't tolerate faults. */ if (unlikely(mm_is_efi(mm))) return false; return mm != &init_mm; } static inline pte_t *contpte_align_down(pte_t *ptep) { return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); } static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { /* * Unfold any partially covered contpte block at the beginning and end * of the range. */ if (ptep != contpte_align_down(ptep) || nr < CONT_PTES) contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); if (ptep + nr != contpte_align_down(ptep + nr)) { unsigned long last_addr = addr + PAGE_SIZE * (nr - 1); pte_t *last_ptep = ptep + nr - 1; contpte_try_unfold(mm, last_addr, last_ptep, __ptep_get(last_ptep)); } } static void contpte_convert(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); unsigned long start_addr; pte_t *start_ptep; int i; start_ptep = ptep = contpte_align_down(ptep); start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); pte = pfn_pte(ALIGN_DOWN(pte_pfn(pte), CONT_PTES), pte_pgprot(pte)); for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) { pte_t ptent = __ptep_get_and_clear(mm, addr, ptep); if (pte_dirty(ptent)) pte = pte_mkdirty(pte); if (pte_young(ptent)) pte = pte_mkyoung(pte); } __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3); __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); } void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { /* * We have already checked that the virtual and pysical addresses are * correctly aligned for a contpte mapping in contpte_try_fold() so the * remaining checks are to ensure that the contpte range is fully * covered by a single folio, and ensure that all the ptes are valid * with contiguous PFNs and matching prots. We ignore the state of the * access and dirty bits for the purpose of deciding if its a contiguous * range; the folding process will generate a single contpte entry which * has a single access and dirty bit. Those 2 bits are the logical OR of * their respective bits in the constituent pte entries. In order to * ensure the contpte range is covered by a single folio, we must * recover the folio from the pfn, but special mappings don't have a * folio backing them. Fortunately contpte_try_fold() already checked * that the pte is not special - we never try to fold special mappings. * Note we can't use vm_normal_page() for this since we don't have the * vma. */ unsigned long folio_start, folio_end; unsigned long cont_start, cont_end; pte_t expected_pte, subpte; struct folio *folio; struct page *page; unsigned long pfn; pte_t *orig_ptep; pgprot_t prot; int i; if (!mm_is_user(mm)) return; page = pte_page(pte); folio = page_folio(page); folio_start = addr - (page - &folio->page) * PAGE_SIZE; folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE; cont_start = ALIGN_DOWN(addr, CONT_PTE_SIZE); cont_end = cont_start + CONT_PTE_SIZE; if (folio_start > cont_start || folio_end < cont_end) return; pfn = ALIGN_DOWN(pte_pfn(pte), CONT_PTES); prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); expected_pte = pfn_pte(pfn, prot); orig_ptep = ptep; ptep = contpte_align_down(ptep); for (i = 0; i < CONT_PTES; i++) { subpte = pte_mkold(pte_mkclean(__ptep_get(ptep))); if (!pte_same(subpte, expected_pte)) return; expected_pte = pte_advance_pfn(expected_pte, 1); ptep++; } pte = pte_mkcont(pte); contpte_convert(mm, addr, orig_ptep, pte); } EXPORT_SYMBOL_GPL(__contpte_try_fold); void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { /* * We have already checked that the ptes are contiguous in * contpte_try_unfold(), so just check that the mm is user space. */ if (!mm_is_user(mm)) return; pte = pte_mknoncont(pte); contpte_convert(mm, addr, ptep, pte); } EXPORT_SYMBOL_GPL(__contpte_try_unfold); pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) { /* * Gather access/dirty bits, which may be populated in any of the ptes * of the contig range. We are guaranteed to be holding the PTL, so any * contiguous range cannot be unfolded or otherwise modified under our * feet. */ pte_t pte; int i; ptep = contpte_align_down(ptep); for (i = 0; i < CONT_PTES; i++, ptep++) { pte = __ptep_get(ptep); if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); if (pte_young(pte)) orig_pte = pte_mkyoung(orig_pte); } return orig_pte; } EXPORT_SYMBOL_GPL(contpte_ptep_get); pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) { /* * The ptep_get_lockless() API requires us to read and return *orig_ptep * so that it is self-consistent, without the PTL held, so we may be * racing with other threads modifying the pte. Usually a READ_ONCE() * would suffice, but for the contpte case, we also need to gather the * access and dirty bits from across all ptes in the contiguous block, * and we can't read all of those neighbouring ptes atomically, so any * contiguous range may be unfolded/modified/refolded under our feet. * Therefore we ensure we read a _consistent_ contpte range by checking * that all ptes in the range are valid and have CONT_PTE set, that all * pfns are contiguous and that all pgprots are the same (ignoring * access/dirty). If we find a pte that is not consistent, then we must * be racing with an update so start again. If the target pte does not * have CONT_PTE set then that is considered consistent on its own * because it is not part of a contpte range. */ pgprot_t orig_prot; unsigned long pfn; pte_t orig_pte; pgprot_t prot; pte_t *ptep; pte_t pte; int i; retry: orig_pte = __ptep_get(orig_ptep); if (!pte_valid_cont(orig_pte)) return orig_pte; orig_prot = pte_pgprot(pte_mkold(pte_mkclean(orig_pte))); ptep = contpte_align_down(orig_ptep); pfn = pte_pfn(orig_pte) - (orig_ptep - ptep); for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) { pte = __ptep_get(ptep); prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); if (!pte_valid_cont(pte) || pte_pfn(pte) != pfn || pgprot_val(prot) != pgprot_val(orig_prot)) goto retry; if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); if (pte_young(pte)) orig_pte = pte_mkyoung(orig_pte); } return orig_pte; } EXPORT_SYMBOL_GPL(contpte_ptep_get_lockless); void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { unsigned long next; unsigned long end; unsigned long pfn; pgprot_t prot; /* * The set_ptes() spec guarantees that when nr > 1, the initial state of * all ptes is not-present. Therefore we never need to unfold or * otherwise invalidate a range before we set the new ptes. * contpte_set_ptes() should never be called for nr < 2. */ VM_WARN_ON(nr == 1); if (!mm_is_user(mm)) return __set_ptes(mm, addr, ptep, pte, nr); end = addr + (nr << PAGE_SHIFT); pfn = pte_pfn(pte); prot = pte_pgprot(pte); do { next = pte_cont_addr_end(addr, end); nr = (next - addr) >> PAGE_SHIFT; pte = pfn_pte(pfn, prot); if (((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0) pte = pte_mkcont(pte); else pte = pte_mknoncont(pte); __set_ptes(mm, addr, ptep, pte, nr); addr = next; ptep += nr; pfn += nr; } while (addr != end); } EXPORT_SYMBOL_GPL(contpte_set_ptes); void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { contpte_try_unfold_partial(mm, addr, ptep, nr); __clear_full_ptes(mm, addr, ptep, nr, full); } EXPORT_SYMBOL_GPL(contpte_clear_full_ptes); pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { contpte_try_unfold_partial(mm, addr, ptep, nr); return __get_and_clear_full_ptes(mm, addr, ptep, nr, full); } EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes); int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* * ptep_clear_flush_young() technically requires us to clear the access * flag for a _single_ pte. However, the core-mm code actually tracks * access/dirty per folio, not per page. And since we only create a * contig range when the range is covered by a single folio, we can get * away with clearing young for the whole contig range here, so we avoid * having to unfold. */ int young = 0; int i; ptep = contpte_align_down(ptep); addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) young |= __ptep_test_and_clear_young(vma, addr, ptep); return young; } EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young); int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { int young; young = contpte_ptep_test_and_clear_young(vma, addr, ptep); if (young) { /* * See comment in __ptep_clear_flush_young(); same rationale for * eliding the trailing DSB applies here. */ addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); __flush_tlb_range_nosync(vma->vm_mm, addr, addr + CONT_PTE_SIZE, PAGE_SIZE, true, 3); } return young; } EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young); void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { /* * If wrprotecting an entire contig range, we can avoid unfolding. Just * set wrprotect and wait for the later mmu_gather flush to invalidate * the tlb. Until the flush, the page may or may not be wrprotected. * After the flush, it is guaranteed wrprotected. If it's a partial * range though, we must unfold, because we can't have a case where * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this * would cause it to continue to be unpredictable after the flush. */ contpte_try_unfold_partial(mm, addr, ptep, nr); __wrprotect_ptes(mm, addr, ptep, nr); } EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes); void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags) { /* * We can safely clear access/dirty without needing to unfold from * the architectures perspective, even when contpte is set. If the * range starts or ends midway through a contpte block, we can just * expand to include the full contpte block. While this is not * exactly what the core-mm asked for, it tracks access/dirty per * folio, not per page. And since we only create a contpte block * when it is covered by a single folio, we can get away with * clearing access/dirty for the whole block. */ unsigned long start = addr; unsigned long end = start + nr * PAGE_SIZE; if (pte_cont(__ptep_get(ptep + nr - 1))) end = ALIGN(end, CONT_PTE_SIZE); if (pte_cont(__ptep_get(ptep))) { start = ALIGN_DOWN(start, CONT_PTE_SIZE); ptep = contpte_align_down(ptep); } __clear_young_dirty_ptes(vma, start, ptep, (end - start) / PAGE_SIZE, flags); } EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes); int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty) { unsigned long start_addr; pte_t orig_pte; int i; /* * Gather the access/dirty bits for the contiguous range. If nothing has * changed, its a noop. */ orig_pte = pte_mknoncont(ptep_get(ptep)); if (pte_val(orig_pte) == pte_val(entry)) return 0; /* * We can fix up access/dirty bits without having to unfold the contig * range. But if the write bit is changing, we must unfold. */ if (pte_write(orig_pte) == pte_write(entry)) { /* * For HW access management, we technically only need to update * the flag on a single pte in the range. But for SW access * management, we need to update all the ptes to prevent extra * faults. Avoid per-page tlb flush in __ptep_set_access_flags() * and instead flush the whole range at the end. */ ptep = contpte_align_down(ptep); start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); /* * We are not advancing entry because __ptep_set_access_flags() * only consumes access flags from entry. And since we have checked * for the whole contpte block and returned early, pte_same() * within __ptep_set_access_flags() is likely false. */ for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) __ptep_set_access_flags(vma, addr, ptep, entry, 0); if (dirty) __flush_tlb_range(vma, start_addr, addr, PAGE_SIZE, true, 3); } else { __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte); __ptep_set_access_flags(vma, addr, ptep, entry, dirty); } return 1; } EXPORT_SYMBOL_GPL(contpte_ptep_set_access_flags);
133 134 134 425 23 187 570 569 72 567 569 569 567 570 570 569 72 73 72 1 71 27 556 556 554 556 47 45 46 1 3 45 46 421 1600 1595 1 1601 1603 1610 1602 68 413 1565 1507 1569 4 4 52 17 70 71 67 17 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes * * Manage the dynamic fd arrays in the process files_struct. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> #include <linux/file_ref.h> #include <net/sock.h> #include <linux/init_task.h> #include "internal.h" static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) { /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { atomic_long_set(&ref->refcnt, FILE_REF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > FILE_REF_MAXREF) atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); return false; } /** * __file_ref_put - Slowpath of file_ref_put() * @ref: Pointer to the reference count * @cnt: Current reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool __file_ref_put(file_ref_t *ref, unsigned long cnt) { /* Did this drop the last reference? */ if (likely(cnt == FILE_REF_NOREF)) { /* * Carefully try to set the reference count to FILE_REF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } return __file_ref_put_badval(ref, cnt); } EXPORT_SYMBOL_GPL(__file_ref_put); unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } static void free_fdtable_rcu(struct rcu_head *rcu) { __free_fdtable(container_of(rcu, struct fdtable, rcu)); } #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. */ static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, unsigned int copy_words) { unsigned int nwords = fdt_words(nfdt); bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, copy_words, nwords); } /* * Copy all file descriptors from the old table to the new, expanded table and * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* * Note how the fdtable bitmap allocations very much have to be a multiple of * BITS_PER_LONG. This is not only because we walk those things in chunks of * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". */ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; unsigned int nr; void *data; /* * Figure out how many fds we actually want to support in this fdtable. * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B * and growing in powers of two from there on. Since we called only * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab * already gives BITS_PER_LONG slots), the above boils down to * 1. use the smallest power of two large enough to give us that many * slots. * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is * 256 slots (i.e. 1Kb fd array). * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there * and we are never going to be asked for 64 or less. */ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) nr = 256; else nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ if (unlikely(nr > sysctl_nr_open)) { nr = round_down(sysctl_nr_open, BITS_PER_LONG); if (nr < slots_wanted) return ERR_PTR(-EMFILE); } fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; data = kvmalloc(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; return fdt; out_arr: kvfree(fdt->fd); out_fdt: kfree(fdt); out: return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) synchronize_rcu(); spin_lock(&files->file_lock); if (IS_ERR(new_fdt)) return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *fdt; int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return 0; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } /* Can we expand? */ if (unlikely(nr >= sysctl_nr_open)) return -EMFILE; /* All good, so we try */ files->resize_in_progress = true; error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); return error; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, bool set) { if (set) { __set_bit(fd, fdt->close_on_exec); } else { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); fd /= BITS_PER_LONG; if (test_bit(fd, fdt->full_fds_bits)) __clear_bit(fd, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->open_fds); } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * * punch_hole is optional - when close_range() is asked to unshare * and close, we don't need to copy descriptors in that range, so * a smaller cloned descriptor table might suffice if the last * currently opened descriptor falls into that range. */ static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) { unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); if (last == fdt->max_fds) return NR_OPEN_DEFAULT; if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { last = find_last_bit(fdt->open_fds, punch_hole->from); if (last == punch_hole->from) return NR_OPEN_DEFAULT; } return ALIGN(last + 1, BITS_PER_LONG); } /* * Allocate a new descriptor table and copy contents from the passed in * instance. Returns a pointer to cloned table on success, ERR_PTR() * on failure. For 'punch_hole' see sane_fdtable_size(). */ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files); if (IS_ERR(new_fdt)) { kmem_cache_free(files_cachep, newf); return ERR_CAST(new_fdt); } /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. We need * the latest pointer. */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); } copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; /* * We may be racing against fd allocation from other threads using this * files_struct, despite holding ->file_lock. * * alloc_fd() might have already claimed a slot, while fd_install() * did not populate it yet. Note the latter operates locklessly, so * the file can show up as we are walking the array below. * * At the same time we know no files will disappear as all other * operations take the lock. * * Instead of trying to placate userspace racing with itself, we * ref the file if we see it and mark the fd slot as unused otherwise. */ for (i = open_files; i != 0; i--) { struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; } static struct fdtable *close_files(struct files_struct * files) { /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned int i, j = 0; for (;;) { unsigned long set; i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); } } i++; set >>= 1; } } return fdt; } void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { struct fdtable *fdt = close_files(files); /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); kmem_cache_free(files_cachep, files); } } void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; task_unlock(tsk); put_files_struct(files); } } struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; unsigned int bit; /* * Try to avoid looking at the second level bitmap */ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, start & (BITS_PER_LONG - 1)); if (bit < BITS_PER_LONG) return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit >= maxfd) return maxfd; if (bitbit > start) start = bitbit; return find_next_zero_bit(fdt->open_fds, maxfd, start); } /* * allocate a file descriptor, mark it busy. */ static int alloc_fd(unsigned start, unsigned end, unsigned flags) { struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (unlikely(fd >= end)) goto out; if (unlikely(fd >= fdt->max_fds)) { error = expand_files(files, fd); if (error < 0) goto out; goto repeat; } if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); out: spin_unlock(&files->file_lock); return error; } int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(put_unused_fd); /** * fd_install - install a file pointer in the fd array * @fd: file descriptor to install the file in * @file: the file to install * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) return; rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } EXPORT_SYMBOL(fd_install); /** * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; lockdep_assert_held(&files->file_lock); if (fd >= fdt->max_fds) return NULL; fd = array_index_nospec(fd, fdt->max_fds); file = rcu_dereference_raw(fdt->fd[fd]); if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); } return file; } int close_fd(unsigned fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; return filp_close(file, files); } EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * * Returns: Last valid index into fdtable. */ static inline unsigned last_fd(struct fdtable *fdt) { return fdt->max_fds - 1; } static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); max_fd = min(last_fd(fdt), max_fd); if (fd <= max_fd) bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { struct file *file; unsigned n; spin_lock(&files->file_lock); n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } else if (need_resched()) { spin_unlock(&files->file_lock); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } /** * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. * Currently, errors to close a given file descriptor are ignored. */ SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) return -EINVAL; if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { struct fd_range range = {fd, max_fd}, *punch_hole = &range; /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ if (flags & CLOSE_RANGE_CLOEXEC) punch_hole = NULL; fds = dup_fd(cur_fds, punch_hole); if (IS_ERR(fds)) return PTR_ERR(fds); /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else __range_close(cur_fds, fd, max_fd); if (fds) { /* * We're done closing the files we were supposed to. Time to install * the new file descriptor table and drop the old one. */ task_lock(me); me->files = cur_fds; task_unlock(me); put_files_struct(fds); } return 0; } /** * file_close_fd - return file associated with fd * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; } void do_close_on_exec(struct files_struct *files) { unsigned i; struct fdtable *fdt; /* exec unshares first */ spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; unsigned fd = i * BITS_PER_LONG; fdt = files_fdtable(files); if (fd >= fdt->max_fds) break; set = fdt->close_on_exec[i]; if (!set) continue; fdt->close_on_exec[i] = 0; for ( ; set ; fd++, set >>= 1) { struct file *file; if (!(set & 1)) continue; file = fdt->fd[fd]; if (!file) continue; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } static struct file *__get_file_rcu(struct file __rcu **f) { struct file __rcu *file; struct file __rcu *file_reloaded; struct file __rcu *file_reloaded_cmp; file = rcu_dereference_raw(*f); if (!file) return NULL; if (unlikely(!file_ref_get(&file->f_ref))) return ERR_PTR(-EAGAIN); file_reloaded = rcu_dereference_raw(*f); /* * Ensure that all accesses have a dependency on the load from * rcu_dereference_raw() above so we get correct ordering * between reuse/allocation and the pointer check below. */ file_reloaded_cmp = file_reloaded; OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* * file_ref_get() above provided a full memory barrier when we * acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still * matches the current file, we know we have successfully * acquired a reference to the right file. * * If the pointers don't match the file has been reallocated by * SLAB_TYPESAFE_BY_RCU. */ if (file == file_reloaded_cmp) return file_reloaded; fput(file); return ERR_PTR(-EAGAIN); } /** * get_file_rcu - try go get a reference to a file under rcu * @f: the file to get a reference on * * This function tries to get a reference on @f carefully verifying that * @f hasn't been reused. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_rcu(struct file __rcu **f) { for (;;) { struct file __rcu *file; file = __get_file_rcu(f); if (!IS_ERR(file)) return file; } } EXPORT_SYMBOL_GPL(get_file_rcu); /** * get_file_active - try go get a reference to a file * @f: the file to get a reference on * * In contast to get_file_rcu() the pointer itself isn't part of the * reference counting. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_active(struct file **f) { struct file __rcu *file; rcu_read_lock(); file = __get_file_rcu(f); rcu_read_unlock(); if (IS_ERR(file)) file = NULL; return file; } EXPORT_SYMBOL_GPL(get_file_active); static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; unsigned long nospec_mask; /* Mask is a 0 for invalid fd's, ~0 for valid ones */ nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); /* * fdentry points to the 'fd' offset, or fdt->fd[0]. * Loading from fdt->fd[0] is always safe, because the * array always exists. */ fdentry = fdt->fd + (fd & nospec_mask); /* Do the load, then mask any invalid result */ file = rcu_dereference_raw(*fdentry); file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; /* * Ok, we have a file pointer that was valid at * some point, but it might have become stale since. * * We need to confirm it by incrementing the refcount * and then check the lookup again. * * file_ref_get() gives us a full memory barrier. We * only really need an 'acquire' one to protect the * loads below, but we don't have that. */ if (unlikely(!file_ref_get(&file->f_ref))) continue; /* * Such a race can take two forms: * * (a) the file ref already went down to zero and the * file hasn't been reused yet or the file count * isn't zero but the file has already been reused. * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * * If so, we need to put our ref and try again. */ if (unlikely(file != rcu_dereference_raw(*fdentry)) || unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } /* * This isn't the file we're looking for or we're not * allowed to get a reference to it. */ if (unlikely(file->f_mode & mask)) { fput(file); return NULL; } /* * Ok, we have a ref to the file, and checked that it * still exists. */ return file; } } static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask) { struct file *file; rcu_read_lock(); file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } static inline struct file *__fget(unsigned int fd, fmode_t mask) { return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); struct file *fget_task(struct task_struct *task, unsigned int fd) { struct file *file = NULL; task_lock(task); if (task->files) file = __fget_files(task->files, fd, 0); task_unlock(task); return file; } struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; unsigned int fd = *ret_fd; struct file *file = NULL; task_lock(task); files = task->files; if (files) { rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. * * (As an exception to rule 2, you can call filp_close between fget_light and * fput_light provided that you capture a real refcount with get_file before * the call to filp_close, and ensure that this real refcount is fput *after* * the fput_light call.) * * See also the documentation in rust/kernel/file.rs. */ static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; /* * If another thread is concurrently calling close_fd() followed * by put_files_struct(), we must not observe the old table * entry combined with the new refcount - otherwise we could * return a file that is concurrently being freed. * * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return EMPTY_FD; return BORROWED_FD(file); } else { file = __fget_files(files, fd, mask); if (!file) return EMPTY_FD; return CLONED_FD(file); } } struct fd fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } EXPORT_SYMBOL(fdget); struct fd fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } /* * Try to avoid f_pos locking. We only need it if the * file is marked for FMODE_ATOMIC_POS, and it can be * accessed multiple ways. * * Always do it for directories, because pidfd_getfd() * can make a file accessible even if it otherwise would * not be, and for directories this is a correctness * issue, not a "POSIX requirement". */ static inline bool file_needs_f_pos_lock(struct file *file) { if (!(file->f_mode & FMODE_ATOMIC_POS)) return false; if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF) return true; if (file->f_op->iterate_shared) return true; return false; } bool file_seek_cur_needs_f_lock(struct file *file) { if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) return false; /* * Note that we are not guaranteed to be called after fdget_pos() on * this file obj, in which case the caller is expected to provide the * appropriate locking. */ return true; } struct fd fdget_pos(unsigned int fd) { struct fd f = fdget(fd); struct file *file = fd_file(f); if (likely(file) && file_needs_f_pos_lock(file)) { f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } return f; } void __f_unlock_pos(struct file *f) { mutex_unlock(&f->f_pos_lock); } /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated * file count (done either by fdget() or by fork()). */ void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { bool res; rcu_read_lock(); res = close_on_exec(fd, current->files); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* * dup2() is expected to close the file installed in the target fd slot * (if any). However, userspace hand-picking a fd may be racing against * its own threads which happened to allocate it in open() et al but did * not populate it yet. * * Broadly speaking we may be racing against the following: * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL * file = hard_work_goes_here(); * fd_install(fd, file); // only now ->fd[fd] == file * * It is an invariant that a successfully allocated fd has a NULL entry * in the array until the matching fd_install(). * * If we fit the window, we have the fd to populate, yet no target file * to close. Trying to ignore it and install our new file would violate * the invariant and make fd_install() overwrite our file. * * Things can be done(tm) to handle this. However, the issue does not * concern legitimate programs and we only need to make sure the kernel * does not trip over it. * * The simplest way out is to return an error if we find ourselves here. * * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); return fd; Ebusy: spin_unlock(&files->file_lock); return -EBUSY; } int replace_fd(unsigned fd, struct file *file, unsigned flags) { int err; struct files_struct *files = current->files; if (!file) return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; return do_dup2(files, file, fd, flags); out_unlock: spin_unlock(&files->file_lock); return err; } /** * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate * checks and count updates. Optionally writes the fd number to userspace, if * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * * Returns newly install fd or -ve on error. */ int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; error = security_file_receive(file); if (error) return error; new_fd = get_unused_fd_flags(o_flags); if (new_fd < 0) return new_fd; if (ufd) { error = put_user(new_fd, ufd); if (error) { put_unused_fd(new_fd); return error; } } fd_install(new_fd, get_file(file)); __receive_sock(file); return new_fd; } EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; error = replace_fd(new_fd, file, o_flags); if (error) return error; __receive_sock(file); return new_fd; } static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; struct files_struct *files = current->files; if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; if (unlikely(oldfd == newfd)) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { if (err == -EMFILE) goto Ebadf; goto out_unlock; } return do_dup2(files, file, newfd, flags); Ebadf: err = -EBADF; out_unlock: spin_unlock(&files->file_lock); return err; } SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) { return ksys_dup3(oldfd, newfd, flags); } SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; struct file *f; int retval = oldfd; rcu_read_lock(); f = __fget_files_rcu(files, oldfd, 0); if (!f) retval = -EBADF; rcu_read_unlock(); if (f) fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; } int f_dupfd(unsigned int from, struct file *file, unsigned flags) { unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; if (from >= nofile) return -EINVAL; err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); } return err; } int iterate_fd(struct files_struct *files, unsigned n, int (*f)(const void *, struct file *, unsigned), const void *p) { struct fdtable *fdt; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; file = rcu_dereference_check_fdtable(files, fdt->fd[n]); if (!file) continue; res = f(p, file, n); if (res) break; } spin_unlock(&files->file_lock); return res; } EXPORT_SYMBOL(iterate_fd);
149 861 380 1 381 383 382 253 253 851 851 46 46 845 845 844 396 397 349 349 348 252 797 767 770 768 4 4 360 363 362 242 242 86 87 103 103 52 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 // SPDX-License-Identifier: GPL-2.0-only /* * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * * NUMA policy allows the user to give hints in which node(s) memory should * be allocated. * * Support six policies per VMA and per process: * * The VMA policy has priority over the process policy for a page fault. * * interleave Allocate memory interleaved over a set of nodes, * with normal fallback if it fails. * For VMA based allocations this interleaves based on the * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. * * weighted interleave * Allocate memory interleaved over a set of nodes based on * a set of weights (per-node), with normal fallback if it * fails. Otherwise operates the same as interleave. * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated * on node 0 for every 1 page allocated on node 1. * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node * to the last. It would be better if bind would truly restrict * the allocation to memory nodes instead * * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. * * preferred many Try a set of nodes first before normal fallback. This is * similar to preferred without the special case. * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. * * The process policy is applied for most non interrupt memory allocations * in that process' context. Interrupts ignore the policies and always * try to allocate on the local CPU. The VMA policy is only applied for memory * allocations for a VMA in the VM. * * Currently there are a few corner cases in swapping where the policy * is not applied, but the majority should be handled. When process policy * is used it is not remembered over swap outs/swap ins. * * Only the highest zone in the zone hierarchy gets policied. Allocations * requesting a lower zone just use default policy. This implies that * on systems with highmem kernel lowmem allocation don't get policied. * Same with GFP_DMA allocations. * * For shmem/tmpfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. */ /* Notebook: fix mmap readahead to honour policy and enable policy for any page cache object statistics for bigpages global policy for page cache? currently it uses process policy. Requires first item above. handle mremap for shared memory (currently ignored for the policy) grows down? make bind policy root only? It can trigger oom much faster and the kernel is not always grateful with that. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mempolicy.h> #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/nodemask.h> #include <linux/cpuset.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ptrace.h> #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> #include <linux/gcd.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <linux/uaccess.h> #include <linux/memory.h> #include "internal.h" /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ enum zone_type policy_zone = 0; /* * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_LOCAL, }; static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /* * weightiness balances the tradeoff between small weights (cycles through nodes * faster, more fair/even distribution) and large weights (smaller errors * between actual bandwidth ratios and weight ratios). 32 is a number that has * been found to perform at a reasonable compromise between the two goals. */ static const int weightiness = 32; /* * A null weighted_interleave_state is interpreted as having .mode="auto", * and .iw_table is interpreted as an array of 1s with length nr_node_ids. */ struct weighted_interleave_state { bool mode_auto; u8 iw_table[]; }; static struct weighted_interleave_state __rcu *wi_state; static unsigned int *node_bw_table; /* * wi_state_lock protects both wi_state and node_bw_table. * node_bw_table is only used by writers to update wi_state. */ static DEFINE_MUTEX(wi_state_lock); static u8 get_il_weight(int node) { struct weighted_interleave_state *state; u8 weight = 1; rcu_read_lock(); state = rcu_dereference(wi_state); if (state) weight = state->iw_table[node]; rcu_read_unlock(); return weight; } /* * Convert bandwidth values into weighted interleave weights. * Call with wi_state_lock. */ static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw) { u64 sum_bw = 0; unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0; int nid; for_each_node_state(nid, N_MEMORY) sum_bw += bw[nid]; /* Scale bandwidths to whole numbers in the range [1, weightiness] */ for_each_node_state(nid, N_MEMORY) { /* * Try not to perform 64-bit division. * If sum_bw < scaling_factor, then sum_bw < U32_MAX. * If sum_bw > scaling_factor, then round the weight up to 1. */ scaling_factor = weightiness * bw[nid]; if (bw[nid] && sum_bw < scaling_factor) { cast_sum_bw = (unsigned int)sum_bw; new_iw[nid] = scaling_factor / cast_sum_bw; } else { new_iw[nid] = 1; } if (!iw_gcd) iw_gcd = new_iw[nid]; iw_gcd = gcd(iw_gcd, new_iw[nid]); } /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */ for_each_node_state(nid, N_MEMORY) new_iw[nid] /= iw_gcd; } int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords) { struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; unsigned int *old_bw, *new_bw; unsigned int bw_val; int i; bw_val = min(coords->read_bandwidth, coords->write_bandwidth); new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL); if (!new_bw) return -ENOMEM; new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids), GFP_KERNEL); if (!new_wi_state) { kfree(new_bw); return -ENOMEM; } new_wi_state->mode_auto = true; for (i = 0; i < nr_node_ids; i++) new_wi_state->iw_table[i] = 1; /* * Update bandwidth info, even in manual mode. That way, when switching * to auto mode in the future, iw_table can be overwritten using * accurate bw data. */ mutex_lock(&wi_state_lock); old_bw = node_bw_table; if (old_bw) memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw)); new_bw[node] = bw_val; node_bw_table = new_bw; old_wi_state = rcu_dereference_protected(wi_state, lockdep_is_held(&wi_state_lock)); if (old_wi_state && !old_wi_state->mode_auto) { /* Manual mode; skip reducing weights and updating wi_state */ mutex_unlock(&wi_state_lock); kfree(new_wi_state); goto out; } /* NULL wi_state assumes auto=true; reduce weights and update wi_state*/ reduce_interleave_weights(new_bw, new_wi_state->iw_table); rcu_assign_pointer(wi_state, new_wi_state); mutex_unlock(&wi_state_lock); if (old_wi_state) { synchronize_rcu(); kfree(old_wi_state); } out: kfree(old_bw); return 0; } /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search * @state: State to filter the search * * Lookup the closest node by distance if @nid is not in state. * * Return: this @node if it is in state, otherwise the closest node by distance */ int numa_nearest_node(int node, unsigned int state) { int min_dist = INT_MAX, dist, n, min_node; if (state >= NR_NODE_STATES) return -EINVAL; if (node == NUMA_NO_NODE || node_state(node, state)) return node; min_node = node; for_each_node_state(n, state) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; min_node = n; } } return min_node; } EXPORT_SYMBOL_GPL(numa_nearest_node); /** * nearest_node_nodemask - Find the node in @mask at the nearest distance * from @node. * * @node: a valid node ID to start the search from. * @mask: a pointer to a nodemask representing the allowed nodes. * * This function iterates over all nodes in @mask and calculates the * distance from the starting @node, then it returns the node ID that is * the closest to @node, or MAX_NUMNODES if no node is found. * * Note that @node must be a valid node ID usable with node_distance(), * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes * or unexpected behavior. */ int nearest_node_nodemask(int node, nodemask_t *mask) { int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; for_each_node_mask(n, *mask) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; min_node = n; } } return min_node; } EXPORT_SYMBOL_GPL(nearest_node_nodemask); struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; int node; if (pol) return pol; node = numa_node_id(); if (node != NUMA_NO_NODE) { pol = &preferred_node_policy[node]; /* preferred_node_policy is not initialised early in boot */ if (pol->mode) return pol; } return &default_policy; } static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, const nodemask_t *rel) { nodemask_t tmp; nodes_fold(tmp, *orig, nodes_weight(*rel)); nodes_onto(*ret, tmp, *rel); } static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->nodes = *nodes; return 0; } static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; nodes_clear(pol->nodes); node_set(first_node(*nodes), pol->nodes); return 0; } /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes * parameter with respect to the policy mode and flags. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. */ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) { int ret; /* * Default (pol==NULL) resp. local memory policies are not a * subject of any remapping. They also do not need any special * constructor. */ if (!pol || pol->mode == MPOL_LOCAL) return 0; /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); else nodes_and(nsc->mask2, *nodes, nsc->mask1); if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; } /* * This function just creates a new policy, does some check and simple * initialization. You must invoke mpol_set_nodemask() to set nodes. */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; } VM_BUG_ON(!nodes); /* * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). * All other modes require a valid pointer to a non-empty nodemask. */ if (mode == MPOL_PREFERRED) { if (nodes_empty(*nodes)) { if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); mode = MPOL_LOCAL; } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; policy->home_node = NUMA_NO_NODE; return policy; } /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *pol) { if (!atomic_dec_and_test(&pol->refcnt)) return; kmem_cache_free(policy_cache, pol); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; if (pol->flags & MPOL_F_STATIC_NODES) nodes_and(tmp, pol->w.user_nodemask, *nodes); else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } if (nodes_empty(tmp)) tmp = *nodes; pol->nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { pol->w.cpuset_mems_allowed = *nodes; } /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * * Per-vma policies are protected by mmap_lock. Allocations using per-task * policies are protected by task->mems_allowed_seq to prevent a premature * OOM/allocation failure due to parallel nodemask modification. */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol || pol->mode == MPOL_LOCAL) return; if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; mpol_ops[pol->mode].rebind(pol, newmask); } /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. * * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); } /* * Rebind each vma in mm to new nodemask. * * Call holding a reference to mm. Takes mm->mmap_lock during call. */ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); for_each_vma(vmi, vma) { vma_start_write(vma); mpol_rebind_policy(vma->vm_policy, new); } mmap_write_unlock(mm); } static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_DEFAULT] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, [MPOL_PREFERRED_MANY] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, [MPOL_WEIGHTED_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, }; static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid); static bool strictly_unmovable(unsigned long flags) { /* * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO * if any misplaced page is found. */ return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == MPOL_MF_STRICT; } struct migration_mpol { /* for alloc_migration_target_by_mpol() */ struct mempolicy *pol; pgoff_t ilx; }; struct queue_pages { struct list_head *pagelist; unsigned long flags; nodemask_t *nmask; unsigned long start; unsigned long end; struct vm_area_struct *first; struct folio *large; /* note last large folio encountered */ long nr_failed; /* could not be isolated at this time */ }; /* * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) { struct folio *folio; struct queue_pages *qp = walk->private; if (unlikely(is_pmd_migration_entry(*pmd))) { qp->nr_failed++; return; } folio = pmd_folio(*pmd); if (is_huge_zero_folio(folio)) { walk->action = ACTION_CONTINUE; return; } if (!queue_folio_required(folio, qp)) return; if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma) || !migrate_folio_add(folio, qp->pagelist, qp->flags)) qp->nr_failed++; } /* * Scan through folios, checking if they satisfy the required conditions, * moving them from LRU to local pagelist for migration if they do (or not). * * queue_folios_pte_range() has two possible return values: * 0 - continue walking to scan for more, even if an existing folio on the * wrong node could not be isolated and queued for migration. * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, * and an existing folio was on a node that does not follow the policy. */ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; pte_t *pte, *mapped_pte; pte_t ptent; spinlock_t *ptl; int max_nr, nr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { queue_folios_pmd(pmd, walk); spin_unlock(ptl); goto out; } mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) { max_nr = (end - addr) >> PAGE_SHIFT; nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) continue; if (!pte_present(ptent)) { if (is_migration_entry(pte_to_swp_entry(ptent))) qp->nr_failed++; continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; if (folio_test_large(folio) && max_nr != 1) nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, NULL, NULL, NULL); /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. */ if (folio_test_reserved(folio)) continue; if (!queue_folio_required(folio, qp)) continue; if (folio_test_large(folio)) { /* * A large folio can only be isolated from LRU once, * but may be mapped by many PTEs (and Copy-On-Write may * intersperse PTEs of other, order 0, folios). This is * a common case, so don't mistake it for failure (but * there can be other cases of multi-mapped pages which * this quick check does not help to filter out - and a * search of the pagelist might grow to be prohibitive). * * migrate_pages(&pagelist) returns nr_failed folios, so * check "large" now so that queue_pages_range() returns * a comparable nr_failed folios. This does imply that * if folio could not be isolated for some racy reason * at its first PTE, later PTEs will not give it another * chance of isolation; but keeps the accounting simple. */ if (folio == qp->large) continue; qp->large = folio; } if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(vma) || !migrate_folio_add(folio, qp->pagelist, flags)) { qp->nr_failed += nr; if (strictly_unmovable(flags)) break; } } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); out: if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; return 0; } static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); entry = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(entry)) { if (unlikely(is_hugetlb_entry_migration(entry))) qp->nr_failed++; goto unlock; } folio = pfn_folio(pte_pfn(entry)); if (!queue_folio_required(folio, qp)) goto unlock; if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma)) { qp->nr_failed++; goto unlock; } /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * See folio_maybe_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) if (!folio_isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; #endif return 0; } #ifdef CONFIG_NUMA_BALANCING /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these * faults, pages may be migrated for better NUMA placement. * * This is assuming that NUMA faults are handled using PROT_NONE. If * an architecture makes a different choice, it will need further * changes to the core. */ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); if (nr_updated > 0) { count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated); } tlb_finish_mmu(&tlb); return nr_updated; } #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; /* range check first */ VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); if (!qp->first) { qp->first = vma; if (!(flags & MPOL_MF_DISCONTIG_OK) && (qp->start < vma->vm_start)) /* hole at head side of range */ return -EFAULT; } next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; /* * Need check MPOL_MF_STRICT to return -EIO if possible * regardless of vma_migratable */ if (!vma_migratable(vma) && !(flags & MPOL_MF_STRICT)) return 1; /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. */ if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_WRLOCK, }; /* * Walk through page tables and collect pages to be migrated. * * If pages found in a given range are not on the required set of @nodes, * and migration is allowed, they are isolated and queued to @pagelist. * * queue_pages_range() may return: * 0 - all pages already on the right node, or successfully queued for moving * (or neither strict checking nor moving requested: only range checking). * >0 - this number of misplaced folios could not be queued for moving * (a hugetlbfs page or a transparent huge page being counted as 1). * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. */ static long queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, struct list_head *pagelist) { int err; struct queue_pages qp = { .pagelist = pagelist, .flags = flags, .nmask = nodes, .start = start, .end = end, .first = NULL, }; const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; err = walk_page_range(mm, start, end, ops, &qp); if (!qp.first) /* whole range in hole */ err = -EFAULT; return err ? : qp.nr_failed; } /* * Apply policy to a single VMA * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct mempolicy *old; struct mempolicy *new; vma_assert_write_locked(vma); new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); if (err) goto err_out; } old = vma->vm_policy; vma->vm_policy = new; /* protected by mmap_lock */ mpol_put(old); return 0; err_out: mpol_put(new); return err; } /* Split or merge the VMA (if required) and apply the new policy */ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct mempolicy *new_pol) { unsigned long vmstart, vmend; vmend = min(end, vma->vm_end); if (start > vma->vm_start) { *prev = vma; vmstart = start; } else { vmstart = vma->vm_start; } if (mpol_equal(vma->vm_policy, new_pol)) { *prev = vma; return 0; } vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; NODEMASK_SCRATCH(scratch); int ret; if (!scratch) return -ENOMEM; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) { ret = PTR_ERR(new); goto out; } task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); mpol_put(new); goto out; } old = current->mempolicy; current->mempolicy = new; if (new && (new->mode == MPOL_INTERLEAVE || new->mode == MPOL_WEIGHTED_INTERLEAVE)) { current->il_prev = MAX_NUMNODES-1; current->il_weight = 0; } task_unlock(current); mpol_put(old); ret = 0; out: NODEMASK_SCRATCH_FREE(scratch); return ret; } /* * Return nodemask for policy for get_mempolicy() query * * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); if (pol == &default_policy) return; switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_WEIGHTED_INTERLEAVE: *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ break; default: BUG(); } } static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; int ret; ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); if (ret > 0) { ret = page_to_nid(p); put_page(p); } return ret; } /* Retrieve NUMA policy */ static long do_get_mempolicy(int *policy, nodemask_t *nmask, unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; if (flags & MPOL_F_MEMS_ALLOWED) { if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ task_lock(current); *nmask = cpuset_current_mems_allowed; task_unlock(current); return 0; } if (flags & MPOL_F_ADDR) { pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. */ mmap_read_lock(mm); vma = vma_lookup(mm, addr); if (!vma) { mmap_read_unlock(mm); return -EFAULT; } pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; if (!pol) pol = &default_policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, because we are about to * drop the mmap_lock, after which only "pol" remains * valid, "vma" is stale. */ pol_refcount = pol; vma = NULL; mpol_get(pol); mmap_read_unlock(mm); err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); } else if (pol == current->mempolicy && pol->mode == MPOL_WEIGHTED_INTERLEAVE) { if (current->il_weight) *policy = current->il_prev; else *policy = next_node_in(current->il_prev, pol->nodes); } else { err = -EINVAL; goto out; } } else { *policy = pol == &default_policy ? MPOL_DEFAULT : pol->mode; /* * Internal mempolicy flags must be masked off before exposing * the policy to userspace. */ *policy |= (pol->flags & MPOL_MODE_FLAGS); } err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { *nmask = pol->w.user_nodemask; } else { task_lock(current); get_policy_nodemask(pol, nmask); task_unlock(current); } } out: mpol_cond_put(pol); if (vma) mmap_read_unlock(mm); if (pol_refcount) mpol_put(pol_refcount); return err; } #ifdef CONFIG_MIGRATION static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * See folio_maybe_mapped_shared() on possible imprecision when we * cannot easily detect if a folio is shared. */ if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } else { /* * Non-movable folio may reach here. And, there may be * temporary off LRU folios or non-LRU movable folios. * Treat them as unmovable folios since they can't be * isolated, so they can't be moved at the moment. */ return false; } } return true; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ static long migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); long nr_failed; long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, .reason = MR_SYSCALL, }; nodes_clear(nmask); node_set(source, nmask); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); mmap_read_lock(mm); vma = find_vma(mm, 0); if (unlikely(!vma)) { mmap_read_unlock(mm); return 0; } /* * This does not migrate the range, but isolates all pages that * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, * but passes back the count of pages which could not be isolated. */ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } if (err >= 0) err += nr_failed; return err; } /* * Move pages between the two nodesets so as to preserve the physical * layout as much as possible. * * Returns the number of page that could not be moved. */ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { long nr_failed = 0; long err = 0; nodemask_t tmp; lru_cache_disable(); /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' * bit in 'tmp', and return that <source, dest> pair for migration. * The pair of nodemasks 'to' and 'from' define the map. * * If no pair of bits is found that way, fallback to picking some * pair of 'source' and 'dest' bits that are not the same. If the * 'source' and 'dest' bits are the same, this represents a node * that will be migrating to itself, so no pages need move. * * If no bits are left in 'tmp', or if all remaining bits left * in 'tmp' correspond to the same bit in 'to', return false * (nothing left to migrate). * * This lets us pick a pair of nodes to migrate between, such that * if possible the dest node is not already occupied by some other * source node, minimizing the risk of overloading the memory on a * node that would happen if we migrated incoming memory to a node * before migrating outgoing memory source that same node. * * A single scan of tmp is sufficient. As we go, we remember the * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. * Otherwise when we finish scanning from_tmp, we at least have the * most recent <s, d> pair that moved. If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. */ tmp = *from; while (!nodes_empty(tmp)) { int s, d; int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { /* * do_migrate_pages() tries to maintain the relative * node relationship of the pages established between * threads and memory areas. * * However if the number of source nodes is not equal to * the number of destination nodes we can not preserve * this node relative relationship. In that case, skip * copying memory from a node that is in the destination * mask. * * Example: [2,3,4] -> [3,4,5] moves everything. * [0-7] - > [3,4,5] moves only 0,1,2,6,7. */ if ((nodes_weight(*from) != nodes_weight(*to)) && (node_isset(s, *to))) continue; d = node_remap(s, *from, *to); if (s == d) continue; source = s; /* Node moved. Memorize */ dest = d; /* dest not in remaining from nodes? */ if (!node_isset(dest, tmp)) break; } if (source == NUMA_NO_NODE) break; node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) nr_failed += err; if (err < 0) break; } lru_cache_enable(); if (err < 0) return err; return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* * Allocate a new folio for page migration, according to NUMA mempolicy. */ static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { struct migration_mpol *mmpol = (struct migration_mpol *)private; struct mempolicy *pol = mmpol->pol; pgoff_t ilx = mmpol->ilx; unsigned int order; int nid = numa_node_id(); gfp_t gfp; order = folio_order(src); ilx += src->index >> order; if (folio_test_hugetlb(src)) { nodemask_t *nodemask; struct hstate *h; h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; else gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; return folio_alloc_mpol(gfp, order, pol, ilx, nid); } #else static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return false; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return -ENOSYS; } static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { return NULL; } #endif static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; long err; long nr_failed; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) return -EINVAL; if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all */ if (!new) flags |= MPOL_MF_DISCONTIG_OK; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); { NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); err = mpol_set_nodemask(new, nmask, scratch); if (err) mmap_write_unlock(mm); } else err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); } if (err) goto mpol_out; /* * Lock the VMAs before scanning for pages to migrate, * to ensure we don't miss a concurrently inserted page. */ nr_failed = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); if (nr_failed < 0) { err = nr_failed; nr_failed = 0; } else { vma_iter_init(&vmi, mm, start); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { err = mbind_range(&vmi, vma, &prev, start, end, new); if (err) break; } } if (!err && !list_empty(&pagelist)) { /* Convert MPOL_DEFAULT's NULL to task or default policy */ if (!new) { new = get_task_policy(current); mpol_get(new); } mmpol.pol = new; mmpol.ilx = 0; /* * In the interleaved case, attempt to allocate on exactly the * targeted nodes, for the first VMA to be migrated; for later * VMAs, the nodes will still be interleaved from the targeted * nodemask, but one by one may be selected differently. */ if (new->mode == MPOL_INTERLEAVE || new->mode == MPOL_WEIGHTED_INTERLEAVE) { struct folio *folio; unsigned int order; unsigned long addr = -EFAULT; list_for_each_entry(folio, &pagelist, lru) { if (!folio_test_ksm(folio)) break; } if (!list_entry_is_head(folio, &pagelist, lru)) { vma_iter_init(&vmi, mm, start); for_each_vma_range(vmi, vma, end) { addr = page_address_in_vma(folio, folio_page(folio, 0), vma); if (addr != -EFAULT) break; } } if (addr != -EFAULT) { order = folio_order(folio); /* We already know the pol, but not the ilx */ mpol_cond_put(get_vma_policy(vma, addr, order, &mmpol.ilx)); /* Set base from which to increment by index */ mmpol.ilx -= folio->index >> order; } } } mmap_write_unlock(mm); if (!err && !list_empty(&pagelist)) { nr_failed |= migrate_pages(&pagelist, alloc_migration_target_by_mpol, NULL, (unsigned long)&mmpol, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_enable(); return err; } /* * User space interface with variable sized bitmaps for nodelists. */ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long nlongs = BITS_TO_LONGS(maxnode); int ret; if (in_compat_syscall()) ret = compat_get_bitmap(mask, (const compat_ulong_t __user *)nmask, maxnode); else ret = copy_from_user(mask, nmask, nlongs * sizeof(unsigned long)); if (ret) return -EFAULT; if (maxnode % BITS_PER_LONG) mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; return 0; } /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; /* * When the user specified more nodes than supported just check * if the non supported part is all zero, one word at a time, * starting at the end. */ while (maxnode > MAX_NUMNODES) { unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); unsigned long t; if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) return -EFAULT; if (maxnode - bits >= MAX_NUMNODES) { maxnode -= bits; } else { maxnode = MAX_NUMNODES; t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); } if (t) return -EINVAL; } return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); bool compat = in_compat_syscall(); if (compat) nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) return -EINVAL; if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; maxnode = nr_node_ids; } if (compat) return compat_put_bitmap((compat_ulong_t __user *)mask, nodes_addr(*nodes), maxnode); return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) *flags |= (MPOL_F_MOF | MPOL_F_MORON); else return -EINVAL; } return 0; } static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; start = untagged_addr(start); err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) return -EINVAL; /* * flags is used for future extension if any. */ if (flags != 0) return -EINVAL; /* * Check home_node is online to avoid accessing uninitialized * NODE_DATA. */ if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; mmap_write_lock(mm); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any vma in the range got policy other than MPOL_BIND * or MPOL_PREFERRED_MANY we return error. We don't reset * the home node for vmas we already updated before. */ old = vma_policy(vma); if (!old) { prev = vma; continue; } if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; } new = mpol_dup(old); if (IS_ERR(new)) { err = PTR_ERR(new); break; } vma_start_write(vma); new->home_node = home_node; err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; } mmap_write_unlock(mm); return err; } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) { return kernel_mbind(start, len, mode, nmask, maxnode, flags); } /* Set the process memory policy */ static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_set_mempolicy(lmode, mode_flags, &nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { return kernel_set_mempolicy(mode, nmask, maxnode); } static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; int err; nodemask_t *old; nodemask_t *new; NODEMASK_SCRATCH(scratch); if (!scratch) return -ENOMEM; old = &scratch->mask1; new = &scratch->mask2; err = get_nodes(old, old_nodes, maxnode); if (err) goto out; err = get_nodes(new, new_nodes, maxnode); if (err) goto out; /* Find the mm_struct */ rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { rcu_read_unlock(); err = -ESRCH; goto out; } get_task_struct(task); err = -EINVAL; /* * Check if this process has the right to modify the specified process. * Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; } rcu_read_unlock(); task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out_put; } task_nodes = cpuset_mems_allowed(current); nodes_and(*new, *new, task_nodes); if (nodes_empty(*new)) goto out_put; err = security_task_movememory(task); if (err) goto out_put; mm = get_task_mm(task); put_task_struct(task); if (!mm) { err = -EINVAL; goto out; } err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); mmput(mm); out: NODEMASK_SCRATCH_FREE(scratch); return err; out_put: put_task_struct(task); goto out; } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, unsigned long maxnode, unsigned long addr, unsigned long flags) { int err; int pval; nodemask_t nodes; if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; addr = untagged_addr(addr); err = do_get_mempolicy(&pval, &nodes, addr, flags); if (err) return err; if (policy && put_user(pval, policy)) return -EFAULT; if (nmask) err = copy_nodes_to_user(nmask, maxnode, &nodes); return err; } SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long __user *, nmask, unsigned long, maxnode, unsigned long, addr, unsigned long, flags) { return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; /* * DAX device mappings require predictable access latency, so avoid * incurring periodic faults. */ if (vma_is_dax(vma)) return false; if (is_vm_hugetlb_page(vma) && !hugepage_migration_supported(hstate_vma(vma))) return false; /* * Migration allocates pages in the highest zone. If we cannot * do so then migration (at least from node to node) is not * possible. */ if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) return false; return true; } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { *ilx = 0; return (vma->vm_ops && vma->vm_ops->get_policy) ? vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * @order: 0, or appropriate huge_page_order for interleaving * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or * MPOL_WEIGHTED_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { struct mempolicy *pol; pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE || pol->mode == MPOL_WEIGHTED_INTERLEAVE) { *ilx += vma->vm_pgoff >> order; *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); } return pol; } bool vma_policy_mof(struct vm_area_struct *vma) { struct mempolicy *pol; if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; pgoff_t ilx; /* ignored here */ pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); return ret; } pol = vma->vm_policy; if (!pol) pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); /* * if policy->nodes has movable memory only, * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * * policy->nodes is intersect with node_states[N_MEMORY]. * so if the following test fails, it implies * policy->nodes has movable memory only. */ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) dynamic_policy_zone = ZONE_MOVABLE; return zone >= dynamic_policy_zone; } static unsigned int weighted_interleave_nodes(struct mempolicy *policy) { unsigned int node; unsigned int cpuset_mems_cookie; retry: /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ cpuset_mems_cookie = read_mems_allowed_begin(); node = current->il_prev; if (!current->il_weight || !node_isset(node, policy->nodes)) { node = next_node_in(node, policy->nodes); if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry; if (node == MAX_NUMNODES) return node; current->il_prev = node; current->il_weight = get_il_weight(node); } current->il_weight--; return node; } /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { unsigned int nid; unsigned int cpuset_mems_cookie; /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ do { cpuset_mems_cookie = read_mems_allowed_begin(); nid = next_node_in(current->il_prev, policy->nodes); } while (read_mems_allowed_retry(cpuset_mems_cookie)); if (nid < MAX_NUMNODES) current->il_prev = nid; return nid; } /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. */ unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; int node = numa_mem_id(); if (!in_task()) return node; policy = current->mempolicy; if (!policy) return node; switch (policy->mode) { case MPOL_PREFERRED: return first_node(policy->nodes); case MPOL_INTERLEAVE: return interleave_nodes(policy); case MPOL_WEIGHTED_INTERLEAVE: return weighted_interleave_nodes(policy); case MPOL_BIND: case MPOL_PREFERRED_MANY: { struct zoneref *z; /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->nodes); return zonelist_zone(z) ? zonelist_node_idx(z) : node; } case MPOL_LOCAL: return node; default: BUG(); } } static unsigned int read_once_policy_nodemask(struct mempolicy *pol, nodemask_t *mask) { /* * barrier stabilizes the nodemask locally so that it can be iterated * over safely without concern for changes. Allocators validate node * selection does not violate mems_allowed, so this is safe. */ barrier(); memcpy(mask, &pol->nodes, sizeof(nodemask_t)); barrier(); return nodes_weight(*mask); } static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) { struct weighted_interleave_state *state; nodemask_t nodemask; unsigned int target, nr_nodes; u8 *table = NULL; unsigned int weight_total = 0; u8 weight; int nid = 0; nr_nodes = read_once_policy_nodemask(pol, &nodemask); if (!nr_nodes) return numa_node_id(); rcu_read_lock(); state = rcu_dereference(wi_state); /* Uninitialized wi_state means we should assume all weights are 1 */ if (state) table = state->iw_table; /* calculate the total weight */ for_each_node_mask(nid, nodemask) weight_total += table ? table[nid] : 1; /* Calculate the node offset based on totals */ target = ilx % weight_total; nid = first_node(nodemask); while (target) { /* detect system default usage */ weight = table ? table[nid] : 1; if (target < weight) break; target -= weight; nid = next_node_in(nid, nodemask); } rcu_read_unlock(); return nid; } /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx * exceeds the number of present nodes. */ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; nnodes = read_once_policy_nodemask(pol, &nodemask); if (!nnodes) return numa_node_id(); target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } /* * Return a nodemask representing a mempolicy for filtering nodes for * page allocation, together with preferred node id (or the input node id). */ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid) { nodemask_t *nodemask = NULL; switch (pol->mode) { case MPOL_PREFERRED: /* Override input node id */ *nid = first_node(pol->nodes); break; case MPOL_PREFERRED_MANY: nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; break; case MPOL_BIND: /* Restrict to nodemask (but not on lower zones) */ if (apply_policy_zone(pol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&pol->nodes)) nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the * requested node and not break the policy. */ WARN_ON_ONCE(gfp & __GFP_THISNODE); break; case MPOL_INTERLEAVE: /* Override input node id */ *nid = (ilx == NO_INTERLEAVE_INDEX) ? interleave_nodes(pol) : interleave_nid(pol, ilx); break; case MPOL_WEIGHTED_INTERLEAVE: *nid = (ilx == NO_INTERLEAVE_INDEX) ? weighted_interleave_nodes(pol) : weighted_interleave_nid(pol, ilx); break; } return nodemask; } #ifdef CONFIG_HUGETLBFS /* * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { pgoff_t ilx; int nid; nid = numa_node_id(); *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } /* * init_nodemask_of_mempolicy * * If the current task's mempolicy is "default" [NULL], return 'false' * to indicate default policy. Otherwise, extract the policy nodemask * for 'bind' or 'interleave' policy into the argument nodemask, or * initialize the argument nodemask to contain the single node for * 'preferred' or 'local' policy and return 'true' to indicate presence * of non-default mempolicy. * * We don't bother with reference counting the mempolicy [mpol_get/put] * because the current task is examining it's own mempolicy and a task's * mempolicy is only ever changed by the task itself. * * N.B., it is the caller's responsibility to free a returned nodemask. */ bool init_nodemask_of_mempolicy(nodemask_t *mask) { struct mempolicy *mempolicy; if (!(mask && current->mempolicy)) return false; task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: *mask = mempolicy->nodes; break; case MPOL_LOCAL: init_nodemask_of_node(mask, numa_node_id()); break; default: BUG(); } task_unlock(current); return true; } #endif /* * mempolicy_in_oom_domain * * If tsk's mempolicy is "bind", check for intersection between mask and * the policy nodemask. Otherwise, return true for all other policies * including "interleave", as a tsk with "interleave" policy may have * memory allocated from all nodes in system. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. */ bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask) { struct mempolicy *mempolicy; bool ret = true; if (!mask) return ret; task_lock(tsk); mempolicy = tsk->mempolicy; if (mempolicy && mempolicy->mode == MPOL_BIND) ret = nodes_intersects(mempolicy->nodes, *mask); task_unlock(tsk); return ret; } static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; /* * This is a two pass approach. The first pass will only try the * preferred nodes but skip the direct reclaim and allow the * allocation to fail, while the second pass will try all the * nodes in system. */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); return page; } /** * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. * @order: Order of the page allocation. * @pol: Pointer to the NUMA mempolicy. * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * * Return: The page on success or NULL if allocation fails. */ static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; struct page *page; nodemask = policy_nodemask(gfp, pol, ilx, &nid); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_preferred_many(gfp, order, nid, nodemask); if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && /* filter "hugepage" allocation, unless from alloc_pages() */ order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred * node) we only try to allocate from the current/preferred * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode != MPOL_INTERLEAVE && pol->mode != MPOL_WEIGHTED_INTERLEAVE && (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ page = __alloc_frozen_pages_noprof( gfp | __GFP_THISNODE | __GFP_NORETRY, order, nid, NULL); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ } } page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE || pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ if (static_branch_likely(&vm_numa_stat_key) && page_to_nid(page) == nid) { preempt_disable(); __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); preempt_enable(); } } return page; } struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, nid); if (!page) return NULL; set_page_refcounted(page); return page_rmappable_folio(page); } /** * vma_alloc_folio - Allocate a folio for a VMA. * @gfp: GFP flags. * @order: Order of the folio. * @vma: Pointer to VMA. * @addr: Virtual address of the allocation. Must be inside @vma. * * Allocate a folio for a specific address in @vma, using the appropriate * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and * excepting where direct use of folio_alloc_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. */ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct folio *folio; if (vma->vm_flags & VM_DROPPABLE) gfp |= __GFP_NOWARN; pol = get_vma_policy(vma, addr, order, &ilx); folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); mpol_cond_put(pol); return folio; } EXPORT_SYMBOL(vma_alloc_folio_noprof); struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, numa_node_id()); } /** * alloc_pages - Allocate pages. * @gfp: GFP flags. * @order: Power of two of number of pages to allocate. * * Allocate 1 << @order contiguous pages. The physical address of the * first page is naturally aligned (eg an order-3 allocation will be aligned * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current * process is honoured when in process context. * * Context: Can be called from any context, providing the appropriate GFP * flags are used. * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { struct page *page = alloc_frozen_pages_noprof(gfp, order); if (page) set_page_refcounted(page); return page; } EXPORT_SYMBOL(alloc_pages_noprof); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); } EXPORT_SYMBOL(folio_alloc_noprof); static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { int nodes; unsigned long nr_pages_per_node; int delta; int i; unsigned long nr_allocated; unsigned long total_allocated = 0; nodes = nodes_weight(pol->nodes); nr_pages_per_node = nr_pages / nodes; delta = nr_pages - nodes * nr_pages_per_node; for (i = 0; i < nodes; i++) { if (delta) { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node + 1, page_array); delta--; } else { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node, page_array); } page_array += nr_allocated; total_allocated += nr_allocated; } return total_allocated; } static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { struct weighted_interleave_state *state; struct task_struct *me = current; unsigned int cpuset_mems_cookie; unsigned long total_allocated = 0; unsigned long nr_allocated = 0; unsigned long rounds; unsigned long node_pages, delta; u8 *weights, weight; unsigned int weight_total = 0; unsigned long rem_pages = nr_pages; nodemask_t nodes; int nnodes, node; int resume_node = MAX_NUMNODES - 1; u8 resume_weight = 0; int prev_node; int i; if (!nr_pages) return 0; /* read the nodes onto the stack, retry if done during rebind */ do { cpuset_mems_cookie = read_mems_allowed_begin(); nnodes = read_once_policy_nodemask(pol, &nodes); } while (read_mems_allowed_retry(cpuset_mems_cookie)); /* if the nodemask has become invalid, we cannot do anything */ if (!nnodes) return 0; /* Continue allocating from most recent node and adjust the nr_pages */ node = me->il_prev; weight = me->il_weight; if (weight && node_isset(node, nodes)) { node_pages = min(rem_pages, weight); nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, page_array); page_array += nr_allocated; total_allocated += nr_allocated; /* if that's all the pages, no need to interleave */ if (rem_pages <= weight) { me->il_weight -= rem_pages; return total_allocated; } /* Otherwise we adjust remaining pages, continue from there */ rem_pages -= weight; } /* clear active weight in case of an allocation failure */ me->il_weight = 0; prev_node = node; /* create a local copy of node weights to operate on outside rcu */ weights = kzalloc(nr_node_ids, GFP_KERNEL); if (!weights) return total_allocated; rcu_read_lock(); state = rcu_dereference(wi_state); if (state) { memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8)); rcu_read_unlock(); } else { rcu_read_unlock(); for (i = 0; i < nr_node_ids; i++) weights[i] = 1; } /* calculate total, detect system default usage */ for_each_node_mask(node, nodes) weight_total += weights[node]; /* * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. * Track which node weighted interleave should resume from. * * if (rounds > 0) and (delta == 0), resume_node will always be * the node following prev_node and its weight. */ rounds = rem_pages / weight_total; delta = rem_pages % weight_total; resume_node = next_node_in(prev_node, nodes); resume_weight = weights[resume_node]; for (i = 0; i < nnodes; i++) { node = next_node_in(prev_node, nodes); weight = weights[node]; node_pages = weight * rounds; /* If a delta exists, add this node's portion of the delta */ if (delta > weight) { node_pages += weight; delta -= weight; } else if (delta) { /* when delta is depleted, resume from that node */ node_pages += delta; resume_node = node; resume_weight = weight - delta; delta = 0; } /* node_pages can be 0 if an allocation fails and rounds == 0 */ if (!node_pages) break; nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, page_array); page_array += nr_allocated; total_allocated += nr_allocated; if (total_allocated == nr_pages) break; prev_node = node; } me->il_prev = resume_node; me->il_weight = resume_weight; kfree(weights); return total_allocated; } static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { gfp_t preferred_gfp; unsigned long nr_allocated = 0; preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, nr_pages, page_array); if (nr_allocated < nr_pages) nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, nr_pages - nr_allocated, page_array + nr_allocated); return nr_allocated; } /* alloc pages bulk and mempolicy should be considered at the * same time in some situation such as vmalloc. * * It can accelerate memory allocation especially interleaving * allocate memory. */ unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; nodemask_t *nodemask; int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) return alloc_pages_bulk_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) return alloc_pages_bulk_weighted_interleave( gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); return alloc_pages_bulk_noprof(gfp, nid, nodemask, nr_pages, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); dst->vm_policy = pol; return 0; } /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). * * current's mempolicy may be rebinded by the other task(the task that changes * cpuset's mems), so we needn't do rebind work for current task. */ /* Slow path of a mempolicy duplicate */ struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* task's mempolicy is protected by alloc_lock */ if (old == current->mempolicy) { task_lock(current); *new = *old; task_unlock(current); } else *new = *old; if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; } /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) return false; if (a->mode != b->mode) return false; if (a->flags != b->flags) return false; if (a->home_node != b->home_node) return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_WEIGHTED_INTERLEAVE: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; default: BUG(); return false; } } /* * Shared memory backing store policy support. * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ /* * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ static struct sp_node *sp_lookup(struct shared_policy *sp, pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; while (n) { struct sp_node *p = rb_entry(n, struct sp_node, nd); if (start >= p->end) n = n->rb_right; else if (end <= p->start) n = n->rb_left; else break; } if (!n) return NULL; for (;;) { struct sp_node *w = NULL; struct rb_node *prev = rb_prev(n); if (!prev) break; w = rb_entry(prev, struct sp_node, nd); if (w->end <= start) break; n = prev; } return rb_entry(n, struct sp_node, nd); } /* * Insert a new shared policy into the list. Caller holds sp->lock for * writing. */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; struct rb_node *parent = NULL; struct sp_node *nd; while (*p) { parent = *p; nd = rb_entry(parent, struct sp_node, nd); if (new->start < nd->start) p = &(*p)->rb_left; else if (new->end > nd->end) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); } /* Find shared policy intersecting idx */ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; if (!sp->root.rb_node) return NULL; read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } read_unlock(&sp->lock); return pol; } static void sp_free(struct sp_node *n) { mpol_put(n->policy); kmem_cache_free(sn_cache, n); } /** * mpol_misplaced - check whether current folio node is valid in policy * * @folio: folio to be checked * @vmf: structure describing the fault * @addr: virtual address in @vma for shared policy lookup and interleave policy * * Lookup current policy node id for vma,addr and "compare to" folio's * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. * * Return: NUMA_NO_NODE if the page is in a node that is valid for this * policy, or a suitable node ID to allocate a replacement folio from. */ int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); struct vm_area_struct *vma = vmf->vma; int thiscpu = raw_smp_processor_id(); int thisnid = numa_node_id(); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; /* * Make sure ptl is held so that we don't preempt and we * have a stable smp processor id */ lockdep_assert_held(vmf->ptl); pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: polnid = interleave_nid(pol, ilx); break; case MPOL_WEIGHTED_INTERLEAVE: polnid = weighted_interleave_nid(pol, ilx); break; case MPOL_PREFERRED: if (node_isset(curnid, pol->nodes)) goto out; polnid = first_node(pol->nodes); break; case MPOL_LOCAL: polnid = numa_node_id(); break; case MPOL_BIND: case MPOL_PREFERRED_MANY: /* * Even though MPOL_PREFERRED_MANY can allocate pages outside * policy nodemask we don't allow numa migration to nodes * outside policy nodemask for now. This is done so that if we * want demotion to slow memory to happen, before allocating * from some DRAM node say 'x', we will end up using a * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario * we should not promote to node 'x' from slow memory node. */ if (pol->flags & MPOL_F_MORON) { /* * Optimize placement among multiple nodes * via NUMA balancing */ if (node_isset(thisnid, pol->nodes)) break; goto out; } /* * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. */ if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); polnid = zonelist_node_idx(z); break; default: BUG(); } /* Migrate the folio towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; if (!should_numa_migrate_memory(current, folio, curnid, thiscpu)) goto out; } if (curnid != polnid) ret = polnid; out: mpol_cond_put(pol); return ret; } /* * Drop the (possibly final) reference to task->mempolicy. It needs to be * dropped after task->mempolicy is set to NULL so that any allocation done as * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed * policy. */ void mpol_put_task_policy(struct task_struct *task) { struct mempolicy *pol; task_lock(task); pol = task->mempolicy; task->mempolicy = NULL; task_unlock(task); mpol_put(pol); } static void sp_delete(struct shared_policy *sp, struct sp_node *n) { rb_erase(&n->nd, &sp->root); sp_free(n); } static void sp_node_init(struct sp_node *node, unsigned long start, unsigned long end, struct mempolicy *pol) { node->start = start; node->end = end; node->policy = pol; } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { struct sp_node *n; struct mempolicy *newpol; n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; newpol = mpol_dup(pol); if (IS_ERR(newpol)) { kmem_cache_free(sn_cache, n); return NULL; } newpol->flags |= MPOL_F_SHARED; sp_node_init(n, start, end, newpol); return n; } /* Replace a policy range. */ static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; struct mempolicy *mpol_new = NULL; int ret = 0; restart: write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { struct rb_node *next = rb_next(&n->nd); if (n->start >= start) { if (n->end <= end) sp_delete(sp, n); else n->start = end; } else { /* Old policy spanning whole new range. */ if (n->end > end) { if (!n_new) goto alloc_new; *mpol_new = *n->policy; atomic_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); n_new = NULL; mpol_new = NULL; break; } else n->end = start; } if (!next) break; n = rb_entry(next, struct sp_node, nd); } if (new) sp_insert(sp, new); write_unlock(&sp->lock); ret = 0; err_out: if (mpol_new) mpol_put(mpol_new); if (n_new) kmem_cache_free(sn_cache, n_new); return ret; alloc_new: write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) goto err_out; mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!mpol_new) goto err_out; atomic_set(&mpol_new->refcnt, 1); goto restart; } /** * mpol_shared_policy_init - initialize shared policy for inode * @sp: pointer to inode shared policy * @mpol: struct mempolicy to install * * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ rwlock_init(&sp->lock); if (mpol) { struct sp_node *sn; struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; /* contextualize the tmpfs mount point mempolicy to this file */ npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) goto put_npol; /* alloc node covering entire file; adds ref to file's npol */ sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); if (sn) sp_insert(sp, sn); put_npol: mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); if (pol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. */ void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; if (!sp->root.rb_node) return; write_lock(&sp->lock); next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(sp, n); } write_unlock(&sp->lock); } #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { bool numabalancing_default = false; if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ if (numabalancing_override) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } static int __init setup_numabalancing(char *str) { int ret = 0; if (!str) goto out; if (!strcmp(str, "enable")) { numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { numabalancing_override = -1; ret = 1; } out: if (!ret) pr_warn("Unable to parse numa_balancing=\n"); return ret; } __setup("numa_balancing=", setup_numabalancing); #else static inline void __init check_numabalancing_enable(void) { } #endif /* CONFIG_NUMA_BALANCING */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; unsigned long largest = 0; int nid, prefer = 0; policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), 0, SLAB_PANIC, NULL); for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .nodes = nodemask_of_node(nid), }; } /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or * fall back to the largest node if they're all smaller. */ nodes_clear(interleave_nodes); for_each_node_state(nid, N_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ if (largest < total_pages) { largest = total_pages; prefer = nid; } /* Interleave this node? */ if ((total_pages << PAGE_SHIFT) >= (16 << 20)) node_set(nid, interleave_nodes); } /* All too small, use the largest */ if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } /* Reset policy of current process to default */ void numa_default_policy(void) { do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } /* * Parse and format mempolicy from/to strings */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. * * Format of input: * <mode>[=<flags>][:<nodelist>] * * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); int err = 1, mode; if (flags) *flags++ = '\0'; /* terminate mode string */ if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; if (nodelist_parse(nodelist, nodes)) goto out; if (!nodes_subset(nodes, node_states[N_MEMORY])) goto out; } else nodes_clear(nodes); mode = match_string(policy_modes, MPOL_MAX, str); if (mode < 0) goto out; switch (mode) { case MPOL_PREFERRED: /* * Insist on a nodelist of one node only, although later * we use first_node(nodes) to grab a single node, so here * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; while (isdigit(*rest)) rest++; if (*rest) goto out; if (nodes_empty(nodes)) goto out; } break; case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ if (!nodelist) nodes = node_states[N_MEMORY]; break; case MPOL_LOCAL: /* * Don't allow a nodelist; mpol_new() checks flags */ if (nodelist) goto out; break; case MPOL_DEFAULT: /* * Insist on a empty nodelist */ if (!nodelist) err = 0; goto out; case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist */ if (!nodelist) goto out; } mode_flags = 0; if (flags) { /* * Currently, we only support two mutually exclusive * mode flags. */ if (!strcmp(flags, "static")) mode_flags |= MPOL_F_STATIC_NODES; else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) goto out; /* * Save nodes for mpol_to_str() to show the tmpfs mount options * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. */ if (mode != MPOL_PREFERRED) { new->nodes = nodes; } else if (nodelist) { nodes_clear(new->nodes); node_set(first_node(nodes), new->nodes); } else { new->mode = MPOL_LOCAL; } /* * Save nodes for contextualization: this will be used to "clone" * the mempolicy in a specific context [cpuset] at a later time. */ new->w.user_nodemask = nodes; err = 0; out: /* Restore string for error message */ if (nodelist) *--nodelist = ':'; if (flags) *--flags = '='; if (!err) *mpol = new; return err; } #endif /* CONFIG_TMPFS */ /** * mpol_to_str - format a mempolicy structure for printing * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. * Recommend a @maxlen of at least 51 for the longest mode, "weighted * interleave", plus the longest flag flags, "relative|balancing", and to * display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; nodemask_t nodes = NODE_MASK_NONE; unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; if (pol && pol != &default_policy && !(pol >= &preferred_node_policy[0] && pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { mode = pol->mode; flags = pol->flags; } switch (mode) { case MPOL_DEFAULT: case MPOL_LOCAL: break; case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_WEIGHTED_INTERLEAVE: nodes = pol->nodes; break; default: WARN_ON_ONCE(1); snprintf(p, maxlen, "unknown"); return; } p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { p += snprintf(p, buffer + maxlen - p, "="); /* * Static and relative are mutually exclusive. */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); if (flags & MPOL_F_NUMA_BALANCING) { if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) p += snprintf(p, buffer + maxlen - p, "|"); p += snprintf(p, buffer + maxlen - p, "balancing"); } } if (!nodes_empty(nodes)) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } #ifdef CONFIG_SYSFS struct iw_node_attr { struct kobj_attribute kobj_attr; int nid; }; struct sysfs_wi_group { struct kobject wi_kobj; struct mutex kobj_lock; struct iw_node_attr *nattrs[]; }; static struct sysfs_wi_group *wi_group; static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct iw_node_attr *node_attr; u8 weight; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); weight = get_il_weight(node_attr->nid); return sysfs_emit(buf, "%d\n", weight); } static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; struct iw_node_attr *node_attr; u8 weight = 0; int i; node_attr = container_of(attr, struct iw_node_attr, kobj_attr); if (count == 0 || sysfs_streq(buf, "") || kstrtou8(buf, 0, &weight) || weight == 0) return -EINVAL; new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), GFP_KERNEL); if (!new_wi_state) return -ENOMEM; mutex_lock(&wi_state_lock); old_wi_state = rcu_dereference_protected(wi_state, lockdep_is_held(&wi_state_lock)); if (old_wi_state) { memcpy(new_wi_state->iw_table, old_wi_state->iw_table, nr_node_ids * sizeof(u8)); } else { for (i = 0; i < nr_node_ids; i++) new_wi_state->iw_table[i] = 1; } new_wi_state->iw_table[node_attr->nid] = weight; new_wi_state->mode_auto = false; rcu_assign_pointer(wi_state, new_wi_state); mutex_unlock(&wi_state_lock); if (old_wi_state) { synchronize_rcu(); kfree(old_wi_state); } return count; } static ssize_t weighted_interleave_auto_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct weighted_interleave_state *state; bool wi_auto = true; rcu_read_lock(); state = rcu_dereference(wi_state); if (state) wi_auto = state->mode_auto; rcu_read_unlock(); return sysfs_emit(buf, "%s\n", str_true_false(wi_auto)); } static ssize_t weighted_interleave_auto_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; unsigned int *bw; bool input; int i; if (kstrtobool(buf, &input)) return -EINVAL; new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), GFP_KERNEL); if (!new_wi_state) return -ENOMEM; for (i = 0; i < nr_node_ids; i++) new_wi_state->iw_table[i] = 1; mutex_lock(&wi_state_lock); if (!input) { old_wi_state = rcu_dereference_protected(wi_state, lockdep_is_held(&wi_state_lock)); if (!old_wi_state) goto update_wi_state; if (input == old_wi_state->mode_auto) { mutex_unlock(&wi_state_lock); return count; } memcpy(new_wi_state->iw_table, old_wi_state->iw_table, nr_node_ids * sizeof(u8)); goto update_wi_state; } bw = node_bw_table; if (!bw) { mutex_unlock(&wi_state_lock); kfree(new_wi_state); return -ENODEV; } new_wi_state->mode_auto = true; reduce_interleave_weights(bw, new_wi_state->iw_table); update_wi_state: rcu_assign_pointer(wi_state, new_wi_state); mutex_unlock(&wi_state_lock); if (old_wi_state) { synchronize_rcu(); kfree(old_wi_state); } return count; } static void sysfs_wi_node_delete(int nid) { struct iw_node_attr *attr; if (nid < 0 || nid >= nr_node_ids) return; mutex_lock(&wi_group->kobj_lock); attr = wi_group->nattrs[nid]; if (!attr) { mutex_unlock(&wi_group->kobj_lock); return; } wi_group->nattrs[nid] = NULL; mutex_unlock(&wi_group->kobj_lock); sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); kfree(attr->kobj_attr.attr.name); kfree(attr); } static void sysfs_wi_node_delete_all(void) { int nid; for (nid = 0; nid < nr_node_ids; nid++) sysfs_wi_node_delete(nid); } static void wi_state_free(void) { struct weighted_interleave_state *old_wi_state; mutex_lock(&wi_state_lock); old_wi_state = rcu_dereference_protected(wi_state, lockdep_is_held(&wi_state_lock)); if (!old_wi_state) { mutex_unlock(&wi_state_lock); return; } rcu_assign_pointer(wi_state, NULL); mutex_unlock(&wi_state_lock); synchronize_rcu(); kfree(old_wi_state); } static struct kobj_attribute wi_auto_attr = __ATTR(auto, 0664, weighted_interleave_auto_show, weighted_interleave_auto_store); static void wi_cleanup(void) { sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); sysfs_wi_node_delete_all(); wi_state_free(); } static void wi_kobj_release(struct kobject *wi_kobj) { kfree(wi_group); } static const struct kobj_type wi_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = wi_kobj_release, }; static int sysfs_wi_node_add(int nid) { int ret; char *name; struct iw_node_attr *new_attr; if (nid < 0 || nid >= nr_node_ids) { pr_err("invalid node id: %d\n", nid); return -EINVAL; } new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL); if (!new_attr) return -ENOMEM; name = kasprintf(GFP_KERNEL, "node%d", nid); if (!name) { kfree(new_attr); return -ENOMEM; } sysfs_attr_init(&new_attr->kobj_attr.attr); new_attr->kobj_attr.attr.name = name; new_attr->kobj_attr.attr.mode = 0644; new_attr->kobj_attr.show = node_show; new_attr->kobj_attr.store = node_store; new_attr->nid = nid; mutex_lock(&wi_group->kobj_lock); if (wi_group->nattrs[nid]) { mutex_unlock(&wi_group->kobj_lock); ret = -EEXIST; goto out; } ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr); if (ret) { mutex_unlock(&wi_group->kobj_lock); goto out; } wi_group->nattrs[nid] = new_attr; mutex_unlock(&wi_group->kobj_lock); return 0; out: kfree(new_attr->kobj_attr.attr.name); kfree(new_attr); return ret; } static int wi_node_notifier(struct notifier_block *nb, unsigned long action, void *data) { int err; struct memory_notify *arg = data; int nid = arg->status_change_nid; if (nid < 0) return NOTIFY_OK; switch (action) { case MEM_ONLINE: err = sysfs_wi_node_add(nid); if (err) pr_err("failed to add sysfs for node%d during hotplug: %d\n", nid, err); break; case MEM_OFFLINE: sysfs_wi_node_delete(nid); break; } return NOTIFY_OK; } static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) { int nid, err; wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids), GFP_KERNEL); if (!wi_group) return -ENOMEM; mutex_init(&wi_group->kobj_lock); err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, "weighted_interleave"); if (err) goto err_put_kobj; err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr); if (err) goto err_put_kobj; for_each_online_node(nid) { if (!node_state(nid, N_MEMORY)) continue; err = sysfs_wi_node_add(nid); if (err) { pr_err("failed to add sysfs for node%d during init: %d\n", nid, err); goto err_cleanup_kobj; } } hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); return 0; err_cleanup_kobj: wi_cleanup(); kobject_del(&wi_group->wi_kobj); err_put_kobj: kobject_put(&wi_group->wi_kobj); return err; } static int __init mempolicy_sysfs_init(void) { int err; static struct kobject *mempolicy_kobj; mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj); if (!mempolicy_kobj) return -ENOMEM; err = add_weighted_interleave_group(mempolicy_kobj); if (err) goto err_kobj; return 0; err_kobj: kobject_del(mempolicy_kobj); kobject_put(mempolicy_kobj); return err; } late_initcall(mempolicy_sysfs_init); #endif /* CONFIG_SYSFS */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLK_CGROUP_PRIVATE_H #define _BLK_CGROUP_PRIVATE_H /* * block cgroup private header * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> */ #include <linux/blk-cgroup.h> #include <linux/cgroup.h> #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> #include "blk.h" struct blkcg_gq; struct blkg_policy_data; /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { BLKG_IOSTAT_READ, BLKG_IOSTAT_WRITE, BLKG_IOSTAT_DISCARD, BLKG_IOSTAT_NR, }; struct blkg_iostat { u64 bytes[BLKG_IOSTAT_NR]; u64 ios[BLKG_IOSTAT_NR]; }; struct blkg_iostat_set { struct u64_stats_sync sync; struct blkcg_gq *blkg; struct llist_node lnode; int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ struct request_queue *q; struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; /* reference count */ struct percpu_ref refcnt; /* is this blkg online? protected by both blkcg and q locks */ bool online; struct blkg_iostat_set __percpu *iostat_cpu; struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spinlock_t async_bio_lock; struct bio_list async_bios; #endif union { struct work_struct async_bio_work; struct work_struct free_work; }; atomic_t use_delay; atomic64_t delay_nsec; atomic64_t delay_start; u64 last_delay; int last_use; struct rcu_head rcu_head; }; struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; refcount_t online_pin; /* If there is block congestion on this cgroup. */ atomic_t congestion_count; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; /* * List of updated percpu blkg_iostat_set's since the last flush. */ struct llist_head __percpu *lhead; #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; #endif }; static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * * There can be multiple active blkcg policies and each blkg:policy pair is * represented by a blkg_policy_data which is allocated and freed by each * policy's pd_alloc/free_fn() methods. A policy can allocate private data * area by allocating larger data structure which embeds blkg_policy_data * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; bool online; }; /* * Policies that need to keep per-blkcg data which is independent from any * request_queue associated to it should implement cpd_alloc/free_fn() * methods. A policy can allocate private data area by allocating larger * data structure which embeds blkcg_policy_data at the beginning. * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ struct blkcg *blkcg; int plid; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { int plid; /* cgroup files for the policy */ struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol); const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { char *input; char *body; struct block_device *bdev; struct blkcg_gq *blkg; }; void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @bio: the target &bio * * Return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning * blkg. The idea is we do bio_blkcg_css() to look up the actual context for * the bio and attach the appropriate blkg to the bio. Then we call this helper * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. */ static inline bool bio_issue_as_root_blkg(struct bio *bio) { return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; } /** * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. * * Must be called in a RCU critical section. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; if (blkcg == &blkcg_root) return q->root_blkg; blkg = rcu_dereference_check(blkcg->blkg_hint, lockdep_is_held(&q->queue_lock)); if (blkg && blkg->q == q) return blkg; blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q != q) blkg = NULL; return blkg; } /** * blkg_to_pd - get policy private data * @blkg: blkg of interest * @pol: policy of interest * * Return pointer to private data associated with the @blkg-@pol pair. */ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return blkg ? blkg->pd[pol->plid] : NULL; } static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** * pd_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. */ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return pd ? pd->blkg : NULL; } static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) { return cpd ? cpd->blkcg : NULL; } /** * blkg_get - get a blkg reference * @blkg: blkg to get * * The caller should be holding an existing reference. */ static inline void blkg_get(struct blkcg_gq *blkg) { percpu_ref_get(&blkg->refcnt); } /** * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ static inline bool blkg_tryget(struct blkcg_gq *blkg) { return blkg && percpu_ref_tryget(&blkg->refcnt); } /** * blkg_put - put a blkg reference * @blkg: blkg to put */ static inline void blkg_put(struct blkcg_gq *blkg) { percpu_ref_put(&blkg->refcnt); } /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. @p_blkg is * included in the iteration and the last node to be visited. */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) static inline void blkcg_bio_issue_init(struct bio *bio) { bio_issue_init(&bio->bi_issue, bio_sectors(bio)); } static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->congestion_count); } static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); if (WARN_ON_ONCE(old < 0)) return 0; if (old == 0) return 0; /* * We do this song and dance because we can race with somebody else * adding or removing delay. If we just did an atomic_dec we'd end up * negative and we'd already be in trouble. We need to subtract 1 and * then check to see if we were the last delay so we can drop the * congestion count on the cgroup. */ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1)) ; if (old == 0) return 0; if (old == 1) atomic_dec(&blkg->blkcg->congestion_count); return 1; } /** * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount * @blkg: target blkg * @delay: delay duration in nsecs * * When enabled with this function, the delay is not decayed and must be * explicitly cleared with blkcg_clear_delay(). Must not be mixed with * blkcg_[un]use_delay() and blkcg_add_delay() usages. */ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person setting the congestion count for this blkg. */ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) atomic_inc(&blkg->blkcg->congestion_count); atomic64_set(&blkg->delay_nsec, delay); } /** * blkcg_clear_delay - Disable allocator delay mechanism * @blkg: target blkg * * Disable use_delay mechanism. See blkcg_set_delay(). */ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person clearing the congestion count for this blkg. */ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) atomic_dec(&blkg->blkcg->congestion_count); } /** * blk_cgroup_mergeable - Determine whether to allow or disallow merges * @rq: request to merge into * @bio: bio to merge * * @bio and @rq should belong to the same cgroup and their issue_as_root should * match. The latter is necessary as we don't want to throttle e.g. a metadata * update because it happens to be next to a regular IO. */ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return rq->bio->bi_blkg == bio->bi_blkg && bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ struct blkg_policy_data { }; struct blkcg_policy_data { }; struct blkcg_policy { }; struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { return 0; } static inline void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */
13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_BITOPS_LE_H_ #define _ASM_GENERIC_BITOPS_LE_H_ #include <asm/types.h> #include <asm/byteorder.h> #if defined(__LITTLE_ENDIAN) #define BITOP_LE_SWIZZLE 0 #elif defined(__BIG_ENDIAN) #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) #endif static inline int test_bit_le(int nr, const void *addr) { return test_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void set_bit_le(int nr, void *addr) { set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void clear_bit_le(int nr, void *addr) { clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void __set_bit_le(int nr, void *addr) { __set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline void __clear_bit_le(int nr, void *addr) { __clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int test_and_set_bit_le(int nr, void *addr) { return test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int test_and_clear_bit_le(int nr, void *addr) { return test_and_clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int __test_and_set_bit_le(int nr, void *addr) { return __test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr); } static inline int __test_and_clear_bit_le(int nr, void *addr) { return __test_and_clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); } #endif /* _ASM_GENERIC_BITOPS_LE_H_ */
25 24 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SIGNAL_H #define _LINUX_SIGNAL_H #include <linux/bug.h> #include <linux/list.h> #include <linux/signal_types.h> #include <linux/string.h> struct task_struct; /* for sysctl */ extern int print_fatal_signals; static inline void copy_siginfo(kernel_siginfo_t *to, const kernel_siginfo_t *from) { memcpy(to, from, sizeof(*to)); } static inline void clear_siginfo(kernel_siginfo_t *info) { memset(info, 0, sizeof(*info)); } #define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo)) static inline void copy_siginfo_to_external(siginfo_t *to, const kernel_siginfo_t *from) { memcpy(to, from, sizeof(*from)); memset(((char *)to) + sizeof(struct kernel_siginfo), 0, SI_EXPANSION_SIZE); } int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from); int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from); enum siginfo_layout { SIL_KILL, SIL_TIMER, SIL_POLL, SIL_FAULT, SIL_FAULT_TRAPNO, SIL_FAULT_MCEERR, SIL_FAULT_BNDERR, SIL_FAULT_PKUERR, SIL_FAULT_PERF_EVENT, SIL_CHLD, SIL_RT, SIL_SYS, }; enum siginfo_layout siginfo_layout(unsigned sig, int si_code); /* * Define some primitives to manipulate sigset_t. */ #ifndef __HAVE_ARCH_SIG_BITOPS #include <linux/bitops.h> /* We don't use <linux/bitops.h> for these because there is no need to be atomic. */ static inline void sigaddset(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) set->sig[0] |= 1UL << sig; else set->sig[sig / _NSIG_BPW] |= 1UL << (sig % _NSIG_BPW); } static inline void sigdelset(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) set->sig[0] &= ~(1UL << sig); else set->sig[sig / _NSIG_BPW] &= ~(1UL << (sig % _NSIG_BPW)); } static inline int sigismember(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) return 1 & (set->sig[0] >> sig); else return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW)); } #endif /* __HAVE_ARCH_SIG_BITOPS */ static inline int sigisemptyset(sigset_t *set) { switch (_NSIG_WORDS) { case 4: return (set->sig[3] | set->sig[2] | set->sig[1] | set->sig[0]) == 0; case 2: return (set->sig[1] | set->sig[0]) == 0; case 1: return set->sig[0] == 0; default: BUILD_BUG(); return 0; } } static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2) { switch (_NSIG_WORDS) { case 4: return (set1->sig[3] == set2->sig[3]) && (set1->sig[2] == set2->sig[2]) && (set1->sig[1] == set2->sig[1]) && (set1->sig[0] == set2->sig[0]); case 2: return (set1->sig[1] == set2->sig[1]) && (set1->sig[0] == set2->sig[0]); case 1: return set1->sig[0] == set2->sig[0]; } return 0; } #define sigmask(sig) (1UL << ((sig) - 1)) #ifndef __HAVE_ARCH_SIG_SETOPS #define _SIG_SET_BINOP(name, op) \ static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \ { \ unsigned long a0, a1, a2, a3, b0, b1, b2, b3; \ \ switch (_NSIG_WORDS) { \ case 4: \ a3 = a->sig[3]; a2 = a->sig[2]; \ b3 = b->sig[3]; b2 = b->sig[2]; \ r->sig[3] = op(a3, b3); \ r->sig[2] = op(a2, b2); \ fallthrough; \ case 2: \ a1 = a->sig[1]; b1 = b->sig[1]; \ r->sig[1] = op(a1, b1); \ fallthrough; \ case 1: \ a0 = a->sig[0]; b0 = b->sig[0]; \ r->sig[0] = op(a0, b0); \ break; \ default: \ BUILD_BUG(); \ } \ } #define _sig_or(x,y) ((x) | (y)) _SIG_SET_BINOP(sigorsets, _sig_or) #define _sig_and(x,y) ((x) & (y)) _SIG_SET_BINOP(sigandsets, _sig_and) #define _sig_andn(x,y) ((x) & ~(y)) _SIG_SET_BINOP(sigandnsets, _sig_andn) #undef _SIG_SET_BINOP #undef _sig_or #undef _sig_and #undef _sig_andn #define _SIG_SET_OP(name, op) \ static inline void name(sigset_t *set) \ { \ switch (_NSIG_WORDS) { \ case 4: set->sig[3] = op(set->sig[3]); \ set->sig[2] = op(set->sig[2]); \ fallthrough; \ case 2: set->sig[1] = op(set->sig[1]); \ fallthrough; \ case 1: set->sig[0] = op(set->sig[0]); \ break; \ default: \ BUILD_BUG(); \ } \ } #define _sig_not(x) (~(x)) _SIG_SET_OP(signotset, _sig_not) #undef _SIG_SET_OP #undef _sig_not static inline void sigemptyset(sigset_t *set) { switch (_NSIG_WORDS) { default: memset(set, 0, sizeof(sigset_t)); break; case 2: set->sig[1] = 0; fallthrough; case 1: set->sig[0] = 0; break; } } static inline void sigfillset(sigset_t *set) { switch (_NSIG_WORDS) { default: memset(set, -1, sizeof(sigset_t)); break; case 2: set->sig[1] = -1; fallthrough; case 1: set->sig[0] = -1; break; } } /* Some extensions for manipulating the low 32 signals in particular. */ static inline void sigaddsetmask(sigset_t *set, unsigned long mask) { set->sig[0] |= mask; } static inline void sigdelsetmask(sigset_t *set, unsigned long mask) { set->sig[0] &= ~mask; } static inline int sigtestsetmask(sigset_t *set, unsigned long mask) { return (set->sig[0] & mask) != 0; } static inline void siginitset(sigset_t *set, unsigned long mask) { set->sig[0] = mask; switch (_NSIG_WORDS) { default: memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = 0; break; case 1: ; } } static inline void siginitsetinv(sigset_t *set, unsigned long mask) { set->sig[0] = ~mask; switch (_NSIG_WORDS) { default: memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = -1; break; case 1: ; } } #endif /* __HAVE_ARCH_SIG_SETOPS */ static inline void init_sigpending(struct sigpending *sig) { sigemptyset(&sig->signal); INIT_LIST_HEAD(&sig->list); } extern void flush_sigqueue(struct sigpending *queue); /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ static inline int valid_signal(unsigned long sig) { return sig <= _NSIG ? 1 : 0; } struct timespec; struct pt_regs; enum pid_type; extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; extern bool get_signal(struct ksignal *ksig); extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); #define SIG_KTHREAD ((__force __sighandler_t)2) #define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3) static inline void allow_signal(int sig) { /* * Kernel threads handle their own signals. Let the signal code * know it'll be handled, so that they don't get converted to * SIGKILL or just silently dropped. */ kernel_sigaction(sig, SIG_KTHREAD); } static inline void allow_kernel_signal(int sig) { /* * Kernel threads handle their own signals. Let the signal code * know signals sent by the kernel will be handled, so that they * don't get silently dropped. */ kernel_sigaction(sig, SIG_KTHREAD_KERNEL); } static inline void disallow_signal(int sig) { kernel_sigaction(sig, SIG_IGN); } extern struct kmem_cache *sighand_cachep; extern bool unhandled_signal(struct task_struct *tsk, int sig); /* * In POSIX a signal is sent either to a specific thread (Linux task) * or to the process as a whole (Linux thread group). How the signal * is sent determines whether it's to one thread or the whole group, * which determines which signal mask(s) are involved in blocking it * from being delivered until later. When the signal is delivered, * either it's caught or ignored by a user handler or it has a default * effect that applies to the whole thread group (POSIX process). * * The possible effects an unblocked signal set to SIG_DFL can have are: * ignore - Nothing Happens * terminate - kill the process, i.e. all threads in the group, * similar to exit_group. The group leader (only) reports * WIFSIGNALED status to its parent. * coredump - write a core dump file describing all threads using * the same mm and then kill all those threads * stop - stop all the threads in the group, i.e. TASK_STOPPED state * * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored. * Other signals when not blocked and set to SIG_DFL behaves as follows. * The job control signals also have other special effects. * * +--------------------+------------------+ * | POSIX signal | default action | * +--------------------+------------------+ * | SIGHUP | terminate | * | SIGINT | terminate | * | SIGQUIT | coredump | * | SIGILL | coredump | * | SIGTRAP | coredump | * | SIGABRT/SIGIOT | coredump | * | SIGBUS | coredump | * | SIGFPE | coredump | * | SIGKILL | terminate(+) | * | SIGUSR1 | terminate | * | SIGSEGV | coredump | * | SIGUSR2 | terminate | * | SIGPIPE | terminate | * | SIGALRM | terminate | * | SIGTERM | terminate | * | SIGCHLD | ignore | * | SIGCONT | ignore(*) | * | SIGSTOP | stop(*)(+) | * | SIGTSTP | stop(*) | * | SIGTTIN | stop(*) | * | SIGTTOU | stop(*) | * | SIGURG | ignore | * | SIGXCPU | coredump | * | SIGXFSZ | coredump | * | SIGVTALRM | terminate | * | SIGPROF | terminate | * | SIGPOLL/SIGIO | terminate | * | SIGSYS/SIGUNUSED | coredump | * | SIGSTKFLT | terminate | * | SIGWINCH | ignore | * | SIGPWR | terminate | * | SIGRTMIN-SIGRTMAX | terminate | * +--------------------+------------------+ * | non-POSIX signal | default action | * +--------------------+------------------+ * | SIGEMT | coredump | * +--------------------+------------------+ * * (+) For SIGKILL and SIGSTOP the action is "always", not just "default". * (*) Special job control effects: * When SIGCONT is sent, it resumes the process (all threads in the group) * from TASK_STOPPED state and also clears any pending/queued stop signals * (any of those marked with "stop(*)"). This happens regardless of blocking, * catching, or ignoring SIGCONT. When any stop signal is sent, it clears * any pending/queued SIGCONT signals; this happens regardless of blocking, * catching, or ignored the stop signal, though (except for SIGSTOP) the * default action of stopping the process may happen later or never. */ #ifdef SIGEMT #define SIGEMT_MASK rt_sigmask(SIGEMT) #else #define SIGEMT_MASK 0 #endif #if SIGRTMIN > BITS_PER_LONG #define rt_sigmask(sig) (1ULL << ((sig)-1)) #else #define rt_sigmask(sig) sigmask(sig) #endif #define siginmask(sig, mask) \ ((sig) > 0 && (sig) < SIGRTMIN && (rt_sigmask(sig) & (mask))) #define SIG_KERNEL_ONLY_MASK (\ rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP)) #define SIG_KERNEL_STOP_MASK (\ rt_sigmask(SIGSTOP) | rt_sigmask(SIGTSTP) | \ rt_sigmask(SIGTTIN) | rt_sigmask(SIGTTOU) ) #define SIG_KERNEL_COREDUMP_MASK (\ rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL) | \ rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \ rt_sigmask(SIGFPE) | rt_sigmask(SIGSEGV) | \ rt_sigmask(SIGBUS) | rt_sigmask(SIGSYS) | \ rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \ SIGEMT_MASK ) #define SIG_KERNEL_IGNORE_MASK (\ rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) ) #define SIG_SPECIFIC_SICODES_MASK (\ rt_sigmask(SIGILL) | rt_sigmask(SIGFPE) | \ rt_sigmask(SIGSEGV) | rt_sigmask(SIGBUS) | \ rt_sigmask(SIGTRAP) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGPOLL) | rt_sigmask(SIGSYS) | \ SIGEMT_MASK ) #define sig_kernel_only(sig) siginmask(sig, SIG_KERNEL_ONLY_MASK) #define sig_kernel_coredump(sig) siginmask(sig, SIG_KERNEL_COREDUMP_MASK) #define sig_kernel_ignore(sig) siginmask(sig, SIG_KERNEL_IGNORE_MASK) #define sig_kernel_stop(sig) siginmask(sig, SIG_KERNEL_STOP_MASK) #define sig_specific_sicodes(sig) siginmask(sig, SIG_SPECIFIC_SICODES_MASK) #define sig_fatal(t, signr) \ (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) void signals_init(void); int restore_altstack(const stack_t __user *); int __save_altstack(stack_t __user *, unsigned long); #define unsafe_save_altstack(uss, sp, label) do { \ stack_t __user *__uss = uss; \ struct task_struct *t = current; \ unsafe_put_user((void __user *)t->sas_ss_sp, &__uss->ss_sp, label); \ unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \ unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \ } while (0); #ifdef CONFIG_DYNAMIC_SIGFRAME bool sigaltstack_size_valid(size_t ss_size); #else static inline bool sigaltstack_size_valid(size_t size) { return true; } #endif /* !CONFIG_DYNAMIC_SIGFRAME */ #ifdef CONFIG_PROC_FS struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); #endif #ifndef arch_untagged_si_addr /* * Given a fault address and a signal and si_code which correspond to the * _sigfault union member, returns the address that must appear in si_addr if * the signal handler does not have SA_EXPOSE_TAGBITS enabled in sa_flags. */ static inline void __user *arch_untagged_si_addr(void __user *addr, unsigned long sig, unsigned long si_code) { return addr; } #endif #endif /* _LINUX_SIGNAL_H */
5 2 5 277 288 70 52 181 34 229 41 24 13 325 471 40 337 331 51 6 33 501 349 349 3 331 56 53 282 286 17 17 17 543 19 338 698 320 352 325 174 58 166 257 160 207 54 149 87 536 537 533 535 41 53 8 455 161 161 161 163 8 8 8 8 8 8 152 51 51 51 54 54 177 149 2 363 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MM_H #define _LINUX_MM_H #include <linux/errno.h> #include <linux/mmdebug.h> #include <linux/gfp.h> #include <linux/pgalloc_tag.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/mmzone.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/debug_locks.h> #include <linux/compiler.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/range.h> #include <linux/pfn.h> #include <linux/percpu-refcount.h> #include <linux/bit_spinlock.h> #include <linux/shrinker.h> #include <linux/resource.h> #include <linux/page_ext.h> #include <linux/err.h> #include <linux/page-flags.h> #include <linux/page_ref.h> #include <linux/overflow.h> #include <linux/sizes.h> #include <linux/sched.h> #include <linux/pgtable.h> #include <linux/kasan.h> #include <linux/memremap.h> #include <linux/slab.h> #include <linux/cacheinfo.h> #include <linux/rcuwait.h> struct mempolicy; struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; struct folio_batch; void arch_mm_preinit(void); void mm_core_init(void); void init_mm_internals(void); extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { return (unsigned long)atomic_long_read(&_totalram_pages); } static inline void totalram_pages_inc(void) { atomic_long_inc(&_totalram_pages); } static inline void totalram_pages_dec(void) { atomic_long_dec(&_totalram_pages); } static inline void totalram_pages_add(long count) { atomic_long_add(count, &_totalram_pages); } extern void * high_memory; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; #else #define sysctl_legacy_va_layout 0 #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; extern int mmap_rnd_bits_max __ro_after_init; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS extern const int mmap_rnd_compat_bits_min; extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif #ifndef DIRECT_MAP_PHYSMEM_END # ifdef MAX_PHYSMEM_BITS # define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) # else # define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) # endif #endif #include <asm/page.h> #include <asm/processor.h> #ifndef __pa_symbol #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #endif #ifndef page_to_virt #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) #endif #ifndef lm_alias #define lm_alias(x) __va(__pa_symbol(x)) #endif /* * To prevent common memory management code establishing * a zero page mapping on a read fault. * This macro should be defined within <asm/pgtable.h>. * s390 does this to prevent multiplexing of hardware bits * related to the physical page in case of virtualization. */ #ifndef mm_forbids_zeropage #define mm_forbids_zeropage(X) (0) #endif /* * On some architectures it is expensive to call memset() for small sizes. * If an architecture decides to implement their own version of * mm_zero_struct_page they should wrap the defines below in a #ifndef and * define their own version of this macro in <asm/pgtable.h> */ #if BITS_PER_LONG == 64 /* This function must be updated when the size of struct page grows above 96 * or reduces below 56. The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. Also the compiler can * combine write statements if they are both assignments and can be reordered, * this can result in several of the writes here being dropped. */ #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) static inline void __mm_zero_struct_page(struct page *page) { unsigned long *_pp = (void *)page; /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */ BUILD_BUG_ON(sizeof(struct page) & 7); BUILD_BUG_ON(sizeof(struct page) < 56); BUILD_BUG_ON(sizeof(struct page) > 96); switch (sizeof(struct page)) { case 96: _pp[11] = 0; fallthrough; case 88: _pp[10] = 0; fallthrough; case 80: _pp[9] = 0; fallthrough; case 72: _pp[8] = 0; fallthrough; case 64: _pp[7] = 0; fallthrough; case 56: _pp[6] = 0; _pp[5] = 0; _pp[4] = 0; _pp[3] = 0; _pp[2] = 0; _pp[1] = 0; _pp[0] = 0; } } #else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif /* * Default maximum number of active map areas, this limits the number of vmas * per mm struct. Users can overwrite this number by sysctl but there is a * problem. * * When a program's coredump is generated as ELF format, a section is created * per a vma. In ELF, the number of sections is represented in unsigned short. * This means the number of sections should be smaller than 65535 at coredump. * Because the kernel adds some informative sections to a image of program at * generating coredump, we need some margin. The number of extra sections is * 1-3 now and depends on arch. We use "5" as safe margin, here. * * ELF extended numbering allows more than 65535 sections, so 16-bit bound is * not a hard limit any more. Although some userspace tools can be surprised by * that. */ #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) #define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) /* to align the pointer to the (prev) page boundary */ #define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); } void setup_initial_init_mm(void *start_code, void *end_code, void *end_data, void *brk); /* * Linux kernel virtual memory manager primitives. * The idea being to have a "virtual" mm in the same way * we have a virtual fs - giving a cleaner interface to the * mm details, and allowing different kinds of memory mappings * (from shared memory to executable loading to arbitrary * mmap() functions). */ struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h */ #define VM_NONE 0x00000000 #define VM_READ 0x00000001 /* currently active flags */ #define VM_WRITE 0x00000002 #define VM_EXEC 0x00000004 #define VM_SHARED 0x00000008 /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ #define VM_MAYWRITE 0x00000020 #define VM_MAYEXEC 0x00000040 #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #ifdef CONFIG_MMU #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #else /* CONFIG_MMU */ #define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ #define VM_UFFD_MISSING 0 #endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ /* Used by sys_madvise() */ #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ #define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #ifdef CONFIG_MEM_SOFT_DIRTY # define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ #else # define VM_SOFTDIRTY 0 #endif #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) #define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 # define VM_PKEY_BIT1 VM_HIGH_ARCH_1 # define VM_PKEY_BIT2 VM_HIGH_ARCH_2 #if CONFIG_ARCH_PKEY_BITS > 3 # define VM_PKEY_BIT3 VM_HIGH_ARCH_3 #else # define VM_PKEY_BIT3 0 #endif #if CONFIG_ARCH_PKEY_BITS > 4 # define VM_PKEY_BIT4 VM_HIGH_ARCH_4 #else # define VM_PKEY_BIT4 0 #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_X86_USER_SHADOW_STACK /* * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of * support core mm. * * These VMAs will get a single end guard page. This helps userspace protect * itself from attacks. A single page is enough for current shadow stack archs * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 #endif #if defined(CONFIG_ARM64_GCS) /* * arm64's Guarded Control Stack implements similar functionality and * has similar constraints to shadow stacks. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_6 #endif #ifndef VM_SHADOW_STACK # define VM_SHADOW_STACK VM_NONE #endif #if defined(CONFIG_PPC64) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI #elif defined(CONFIG_ARM64) # define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ # define VM_ARCH_CLEAR VM_ARM64_BTI #elif !defined(CONFIG_MMU) # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif #if defined(CONFIG_ARM64_MTE) # define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ # define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ #else # define VM_MTE VM_NONE # define VM_MTE_ALLOWED VM_NONE #endif #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR # define VM_UFFD_MINOR_BIT 41 # define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ #else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ # define VM_UFFD_MINOR VM_NONE #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ /* * This flag is used to connect VFIO to arch specific KVM code. It * indicates that the memory under this VMA is safe for use with any * non-cachable memory type inside KVM. Some VFIO devices, on some * platforms, are thought to be unsafe and can cause machine crashes * if KVM does not lock down the memory type. */ #ifdef CONFIG_64BIT #define VM_ALLOW_ANY_UNCACHED_BIT 39 #define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) #else #define VM_ALLOW_ANY_UNCACHED VM_NONE #endif #ifdef CONFIG_64BIT #define VM_DROPPABLE_BIT 40 #define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) #elif defined(CONFIG_PPC32) #define VM_DROPPABLE VM_ARCH_1 #else #define VM_DROPPABLE VM_NONE #endif #ifdef CONFIG_64BIT /* VM is sealed, in vm_flags */ #define VM_SEALED _BITUL(63) #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) #define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) /* Common data flag combinations */ #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC #endif #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) #ifdef CONFIG_STACK_GROWSUP #define VM_STACK VM_GROWSUP #define VM_STACK_EARLY VM_GROWSDOWN #else #define VM_STACK VM_GROWSDOWN #define VM_STACK_EARLY 0 #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) /* * Special vmas that are non-mergable, non-mlock()able. */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. */ #define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ FAULT_FLAG_KILLABLE | \ FAULT_FLAG_INTERRUPTIBLE) /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition * to change, in which case we can try to be polite to release the * mmap_lock in the first round to avoid potential starvation of other * processes that would also want the mmap_lock. * * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. */ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); } #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ { FAULT_FLAG_TRIED, "TRIED" }, \ { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } /* * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * * MM layer fills up gfp_mask for page allocations but fault handler might * alter it if its implementation requires a different allocation context. * * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { const struct { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address - masked */ unsigned long real_address; /* Faulting virtual address - unmasked */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching * the 'address' */ union { pte_t orig_pte; /* Value of PTE at the time of fault */ pmd_t orig_pmd; /* Value of PMD at the time of fault, * used by PMD fault only. */ }; struct page *cow_page; /* Page handler may use for COW fault */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR). */ /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated. */ spinlock_t *ptl; /* Page table lock. * Protects pte page table if 'pte' * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. * vm_ops->map_pages() sets up a page * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. */ }; /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); int (*mremap)(struct vm_area_struct *area); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not * be modified. Returns 0 if mprotect() can proceed. */ int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs. See also generic_access_phys() for a generic * implementation useful for any iomem mapping. */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); /* Called by the /proc/PID/maps code to ask the vma whether it * has a special name. Returning non-NULL will also cause this * vma to be dumped unconditionally. */ const char *(*name)(struct vm_area_struct *vma); #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy * to hold the policy upon return. Caller should pass NULL @new to * remove a policy and fall back to surrounding context--i.e. do not * install a MPOL_DEFAULT policy, nor the task or system default * mempolicy. */ int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); /* * get_policy() op must add reference [mpol_get()] to any policy at * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. * get_policy() must NOT add a ref if the policy at (vma,addr) is not * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. * If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif /* * Called by vm_normal_page() for special PTEs to find the * page for @addr. This is useful if the default behavior * (using pte_page()) would not find the correct page. */ struct page *(*find_special_page)(struct vm_area_struct *vma, unsigned long addr); }; #ifdef CONFIG_NUMA_BALANCING static inline void vma_numab_state_init(struct vm_area_struct *vma) { vma->numab_state = NULL; } static inline void vma_numab_state_free(struct vm_area_struct *vma) { kfree(vma->numab_state); } #else static inline void vma_numab_state_init(struct vm_area_struct *vma) {} static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ /* * These must be here rather than mmap_lock.h as dependent on vm_fault type, * declared in this header. */ #ifdef CONFIG_PER_VMA_LOCK static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_end_read(vmf->vma); else mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_assert_locked(vmf->vma); else mmap_assert_locked(vmf->vma->vm_mm); } #else static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_lock_init(vma, false); } /* Use when VMA is not part of the VMA tree and needs no locking */ static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { ACCESS_PRIVATE(vma, __vm_flags) = flags; } /* * Use when VMA is part of the VMA tree and modifications need coordination * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and * it should be locked explicitly beforehand. */ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); vm_flags_init(vma, flags); } static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) |= flags; } static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } /* * Use only if VMA is not part of the VMA tree or has no other users and * therefore needs no locking. */ static inline void __vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vm_flags_init(vma, (vma->vm_flags | set) & ~clear); } /* * Use only when the order of set/clear operations is unimportant, otherwise * use vm_flags_{set|clear} explicitly. */ static inline void vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vma_start_write(vma); __vm_flags_mod(vma, set, clear); } static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; } static inline bool vma_is_anonymous(struct vm_area_struct *vma) { return !vma->vm_ops; } /* * Indicate if the VMA is a heap for the given task; for * /proc/PID/maps that is the heap of the main task. */ static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) { return vma->vm_start < vma->vm_mm->brk && vma->vm_end > vma->vm_mm->start_brk; } /* * Indicate if the VMA is a stack for the given task; for * /proc/PID/maps that is the stack of the main task. */ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) { /* * We make no effort to guess what a given thread considers to be * its "stack". It's not even well-defined for programs written * languages like Go. */ return vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack; } static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); if (!maybe_stack) return false; if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == VM_STACK_INCOMPLETE_SETUP) return true; return false; } static inline bool vma_is_foreign(struct vm_area_struct *vma) { if (!current->mm) return true; if (current->mm != vma->vm_mm) return true; return false; } static inline bool vma_is_accessible(struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } static inline bool is_shared_maywrite(vm_flags_t vm_flags) { return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == (VM_SHARED | VM_MAYWRITE); } static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) { return is_shared_maywrite(vma->vm_flags); } static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. */ return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) { return mas_next_range(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) { return mas_prev(&vmi->mas, 0); } static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { __mas_set_range(&vmi->mas, start, end - 1); mas_store_gfp(&vmi->mas, NULL, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } /* Free any unused preallocations */ static inline void vma_iter_free(struct vma_iterator *vmi) { mas_destroy(&vmi->mas); } static inline int vma_iter_bulk_store(struct vma_iterator *vmi, struct vm_area_struct *vma) { vmi->mas.index = vma->vm_start; vmi->mas.last = vma->vm_end - 1; mas_store(&vmi->mas, vma); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; vma_mark_attached(vma); return 0; } static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); } static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) { mas_set(&vmi->mas, addr); } #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. */ bool vma_is_shmem(struct vm_area_struct *vma); bool vma_is_anon_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } struct mmu_gather; struct inode; extern void prep_compound_page(struct page *page, unsigned int order); static inline unsigned int folio_large_order(const struct folio *folio) { return folio->_flags_1 & 0xff; } #ifdef NR_PAGES_IN_LARGE_FOLIO static inline long folio_large_nr_pages(const struct folio *folio) { return folio->_nr_pages; } #else static inline long folio_large_nr_pages(const struct folio *folio) { return 1L << folio_large_order(folio); } #endif /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be * prepared to handle wild return values. For example, PG_head may be * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. */ static inline unsigned int compound_order(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 0; return folio_large_order(folio); } /** * folio_order - The allocation order of a folio. * @folio: The folio. * * A folio is composed of 2^order pages. See get_order() for the definition * of order. * * Return: The order of the folio. */ static inline unsigned int folio_order(const struct folio *folio) { if (!folio_test_large(folio)) return 0; return folio_large_order(folio); } /** * folio_reset_order - Reset the folio order and derived _nr_pages * @folio: The folio. * * Reset the order and derived _nr_pages to 0. Must only be used in the * process of splitting large folios. */ static inline void folio_reset_order(struct folio *folio) { if (WARN_ON_ONCE(!folio_test_large(folio))) return; folio->_flags_1 &= ~0xffUL; #ifdef NR_PAGES_IN_LARGE_FOLIO folio->_nr_pages = 0; #endif } #include <linux/huge_mm.h> /* * Methods to modify the page usage count. * * What counts for a page usage: * - cache mapping (page->mapping) * - private data (page->private) * - page mapped in a task's page tables, each mapping * is counted separately * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. */ /* * Drop a ref, return true if the refcount fell to zero (the page has no users) */ static inline int put_page_testzero(struct page *page) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); return page_ref_dec_and_test(page); } static inline int folio_put_testzero(struct folio *folio) { return put_page_testzero(&folio->page); } /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } static inline struct folio *folio_get_nontail_page(struct page *page) { if (unlikely(!get_page_unless_zero(page))) return NULL; return (struct folio *)page; } extern int page_is_ram(unsigned long pfn); enum { REGION_INTERSECTS, REGION_DISJOINT, REGION_MIXED, }; int region_intersects(resource_size_t offset, size_t size, unsigned long flags, unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); /* * Determine if an address is within the vmalloc range * * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); #else static inline bool is_vmalloc_addr(const void *x) { return false; } static inline int is_vmalloc_or_module_addr(const void *x) { return 0; } #endif /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for * debugging purposes or implementation of other core folio_*() primitives. */ static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1)) return 0; return atomic_read(&folio->_entire_mapcount) + 1; } static inline int folio_large_mapcount(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_large_mapcount) + 1; } /** * folio_mapcount() - Number of mappings of this folio. * @folio: The folio. * * The folio mapcount corresponds to the number of present user page table * entries that reference any part of a folio. Each such present user page * table entry must be paired with exactly on folio reference. * * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts * exactly once. * * For hugetlb folios, each abstracted "hugetlb" user page table entry that * references the entire folio counts exactly once, even when such special * page table entries are comprised of multiple ordinary page table entries. * * Will report 0 for pages which cannot be mapped into userspace, such as * slab, page tables and similar. * * Return: The number of times this folio is mapped. */ static inline int folio_mapcount(const struct folio *folio) { int mapcount; if (likely(!folio_test_large(folio))) { mapcount = atomic_read(&folio->_mapcount) + 1; if (page_mapcount_is_type(mapcount)) mapcount = 0; return mapcount; } return folio_large_mapcount(folio); } /** * folio_mapped - Is this folio mapped into userspace? * @folio: The folio. * * Return: True if any page in this folio is referenced by user page tables. */ static inline bool folio_mapped(const struct folio *folio) { return folio_mapcount(folio) >= 1; } /* * Return true if this page is mapped into pagetables. * For compound page it returns true if any sub-page of compound page is mapped, * even if this particular sub-page is not itself mapped by any PTE or PMD. */ static inline bool page_mapped(const struct page *page) { return folio_mapped(page_folio(page)); } static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); return compound_head(page); } static inline struct folio *virt_to_folio(const void *x) { struct page *page = virt_to_page(x); return page_folio(page); } void __folio_put(struct folio *folio); void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { return PAGE_SIZE << compound_order(page); } /* Returns the number of bits needed for the number of bytes in a page */ static inline unsigned int page_shift(struct page *page) { return PAGE_SHIFT + compound_order(page); } /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. */ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return compound_order(page); } /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. * * Return: Number of bytes in this page. */ static inline unsigned long thp_size(struct page *page) { return PAGE_SIZE << thp_order(page); } #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, do always want * pte_mkwrite. But get_user_pages can cause write faults for mappings * that do not have writing enabled, when used by access_process_vm. */ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pte = pte_mkwrite(pte, vma); return pte; } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page); void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); #endif /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have * only one copy in memory, at most, normally. * * For the non-reserved pages, page_count(page) denotes a reference count. * page_count() == 0 means the page is free. page->lru is then used for * freelist management in the buddy allocator. * page_count() > 0 means the page has been allocated. * * Pages are allocated by the slab allocator in order to provide memory * to kmalloc and kmem_cache_alloc. In this case, the management of the * page, and the fields in 'struct page' are the responsibility of mm/slab.c * unless a particular usage is carefully commented. (the responsibility of * freeing the kmalloc memory is the caller's, of course). * * A page may be used by anyone else who does a __get_free_page(). * In this case, page_count still tracks the references, and should only * be used through the normal accessor functions. The top bits of page->flags * and page->virtual store page management information, but all other fields * are unused and could be used privately, carefully. The management of this * page is the responsibility of the one who allocated it, and those who have * subsequently been given references to it. * * The other pages (we may call them "pagecache pages") are completely * managed by the Linux memory manager: I/O, buffers, swapping etc. * The following discussion applies only to them. * * A pagecache page contains an opaque `private' member, which belongs to the * page's address_space. Usually, this is the address of a circular list of * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * * A folio may belong to an inode's memory mapping. In this case, * folio->mapping points to the inode, and folio->index is the file * offset of the folio, in units of PAGE_SIZE. * * If pagecache pages are not associated with an inode, they are said to be * anonymous pages. These may become associated with the swapcache, and in that * case PG_swapcache is set, and page->private is an offset into the swapcache. * * In either case (swapcache or inode backed), the pagecache itself holds one * reference to the page. Setting PG_private should also increment the * refcount. The each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. * * All pagecache pages may be subject to I/O: * - inode pages may need to be read from disk, * - inode pages which have been modified and are MAP_SHARED may need * to be written back to the inode on disk, * - anonymous pages (including MAP_PRIVATE file mappings) which have been * modified may need to be swapped out to swap space and (later) to be read * back into memory. */ /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) /** * folio_get - Increment the reference count on a folio. * @folio: The folio. * * Context: May be called in any context, as long as you know that * you have a refcount on the folio. If you do not already have one, * folio_try_get() may be the right interface for you to use. */ static inline void folio_get(struct folio *folio) { VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); folio_ref_inc(folio); } static inline void get_page(struct page *page) { struct folio *folio = page_folio(page); if (WARN_ON_ONCE(folio_test_slab(folio))) return; folio_get(folio); } static inline __must_check bool try_get_page(struct page *page) { page = compound_head(page); if (WARN_ON_ONCE(page_ref_count(page) <= 0)) return false; page_ref_inc(page); return true; } /** * folio_put - Decrement the reference count on a folio. * @folio: The folio. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put() unless you can be sure that it wasn't the * last reference. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put(struct folio *folio) { if (folio_put_testzero(folio)) __folio_put(folio); } /** * folio_put_refs - Reduce the reference count on a folio. * @folio: The folio. * @refs: The amount to subtract from the folio's reference count. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put_refs() unless you can be sure that these weren't * the last references. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put_refs(struct folio *folio, int refs) { if (folio_ref_sub_and_test(folio, refs)) __folio_put(folio); } void folios_put_refs(struct folio_batch *folios, unsigned int *refs); /* * union release_pages_arg - an array of pages or folios * * release_pages() releases a simple array of multiple pages, and * accepts various different forms of said page array: either * a regular old boring array of pages, an array of folios, or * an array of encoded page pointers. * * The transparent union syntax for this kind of "any of these * argument types" is all kinds of ugly, so look away. */ typedef union { struct page **pages; struct folio **folios; struct encoded_page **encoded_pages; } release_pages_arg __attribute__ ((__transparent_union__)); void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need to * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folios_put(struct folio_batch *folios) { folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); if (folio_test_slab(folio)) return; folio_put(folio); } /* * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload * the page's refcount so that two separate items are tracked: the original page * reference count, and also a new count of how many pin_user_pages() calls were * made against the page. ("gup-pinned" is another term for the latter). * * With this scheme, pin_user_pages() becomes special: such pages are marked as * distinct from normal pages. As such, the unpin_user_page() call (and its * variants) must be used in order to release gup-pinned pages. * * Choice of value: * * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference * counts with respect to pin_user_pages() and unpin_user_page() becomes * simpler, due to the fact that adding an even power of two to the page * refcount has the effect of using only the upper N bits, for the code that * counts up using the bias value. This means that the lower bits are left for * the exclusive use of the original code that increments and decrements by one * (or at least, by much smaller values than the bias value). * * Of course, once the lower bits overflow into the upper bits (and this is * OK, because subtraction recovers the original values), then visual inspection * no longer suffices to directly view the separate counts. However, for normal * applications that don't have huge page reference counts, this won't be an * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() * provides safe operation for get_user_pages(), folio_mkclean() and * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); void unpin_folio(struct folio *folio); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); void unpin_user_folio(struct folio *folio, unsigned long npages); void unpin_folios(struct folio **folios, unsigned long nfolios); static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } #ifndef CONFIG_MMU static inline bool is_nommu_shared_mapping(vm_flags_t flags) { /* * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of * a file mapping. R/O MAP_PRIVATE mappings might still modify * underlying memory if ptrace is active, so this is only possible if * ptrace does not apply. Note that there is no mprotect() to upgrade * write permissions later. */ return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } #endif #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif /* * The identification function is mainly used by the buddy allocator for * determining if two pages could be buddies. We are not really identifying * the zone since we could be using the section number id if we do not have * node id available in page flags. * We only guarantee that it will return the same value for two combinable * pages in a zone. */ static inline int page_zone_id(struct page *page) { return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } #ifdef NODE_NOT_IN_PAGE_FLAGS int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif static inline int folio_nid(const struct folio *folio) { return page_to_nid(&folio->page); } #ifdef CONFIG_NUMA_BALANCING /* page access time bits needs to hold at least 4 seconds */ #define PAGE_ACCESS_TIME_MIN_BITS 12 #if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS #define PAGE_ACCESS_TIME_BUCKETS \ (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) #else #define PAGE_ACCESS_TIME_BUCKETS 0 #endif #define PAGE_ACCESS_TIME_MASK \ (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } static inline int cpupid_to_pid(int cpupid) { return cpupid & LAST__PID_MASK; } static inline int cpupid_to_cpu(int cpupid) { return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } static inline int cpupid_to_nid(int cpupid) { return cpu_to_node(cpupid_to_cpu(cpupid)); } static inline bool cpupid_pid_unset(int cpupid) { return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); } static inline bool cpupid_cpu_unset(int cpupid) { return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) { return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); } #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK); } static inline int folio_last_cpupid(struct folio *folio) { return folio->_last_cpupid; } static inline void page_cpupid_reset_last(struct page *page) { page->_last_cpupid = -1 & LAST_CPUPID_MASK; } #else static inline int folio_last_cpupid(struct folio *folio) { return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } int folio_xchg_last_cpupid(struct folio *folio, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ static inline int folio_xchg_access_time(struct folio *folio, int time) { int last_time; last_time = folio_xchg_last_cpupid(folio, time >> PAGE_ACCESS_TIME_BUCKETS); return last_time << PAGE_ACCESS_TIME_BUCKETS; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { unsigned int pid_bit; pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) { __set_bit(pid_bit, &vma->numab_state->pids_active[1]); } } bool folio_use_access_time(struct folio *folio); #else /* !CONFIG_NUMA_BALANCING */ static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { return folio_nid(folio); /* XXX */ } static inline int folio_xchg_access_time(struct folio *folio, int time) { return 0; } static inline int folio_last_cpupid(struct folio *folio) { return folio_nid(folio); /* XXX */ } static inline int cpupid_to_nid(int cpupid) { return -1; } static inline int cpupid_to_pid(int cpupid) { return -1; } static inline int cpupid_to_cpu(int cpupid) { return -1; } static inline int cpu_pid_to_cpupid(int nid, int pid) { return -1; } static inline bool cpupid_pid_unset(int cpupid) { return true; } static inline void page_cpupid_reset_last(struct page *page) { } static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) { return false; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { } static inline bool folio_use_access_time(struct folio *folio) { return false; } #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid * setting tags for all pages to native kernel tag value 0xff, as the default * value 0x00 maps to 0xff. */ static inline u8 page_kasan_tag(const struct page *page) { u8 tag = KASAN_TAG_KERNEL; if (kasan_enabled()) { tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; tag ^= 0xff; } return tag; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { unsigned long old_flags, flags; if (!kasan_enabled()) return; tag ^= 0xff; old_flags = READ_ONCE(page->flags); do { flags = old_flags; flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) { if (kasan_enabled()) page_kasan_tag_set(page, KASAN_TAG_KERNEL); } #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline u8 page_kasan_tag(const struct page *page) { return 0xff; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { } static inline void page_kasan_tag_reset(struct page *page) { } #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } static inline pg_data_t *page_pgdat(const struct page *page) { return NODE_DATA(page_to_nid(page)); } static inline struct zone *folio_zone(const struct folio *folio) { return page_zone(&folio->page); } static inline pg_data_t *folio_pgdat(const struct folio *folio) { return page_pgdat(&folio->page); } #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } static inline unsigned long page_to_section(const struct page *page) { return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif /** * folio_pfn - Return the Page Frame Number of a folio. * @folio: The folio. * * A folio may contain multiple pages. The pages have consecutive * Page Frame Numbers. * * Return: The Page Frame Number of the first page in the folio. */ static inline unsigned long folio_pfn(const struct folio *folio) { return page_to_pfn(&folio->page); } static inline struct folio *pfn_folio(unsigned long pfn) { return page_folio(pfn_to_page(pfn)); } #ifdef CONFIG_MMU static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) { return pfn_pte(page_to_pfn(page), pgprot); } /** * folio_mk_pte - Create a PTE for this folio * @folio: The folio to create a PTE for * @pgprot: The page protection bits to use * * Create a page table entry for the first page of this folio. * This is suitable for passing to set_ptes(). * * Return: A page table entry suitable for mapping this folio. */ static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) { return pfn_pte(folio_pfn(folio), pgprot); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE /** * folio_mk_pmd - Create a PMD for this folio * @folio: The folio to create a PMD for * @pgprot: The page protection bits to use * * Create a page table entry for the first page of this folio. * This is suitable for passing to set_pmd_at(). * * Return: A page table entry suitable for mapping this folio. */ static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) { return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot)); } #endif #endif /* CONFIG_MMU */ static inline bool folio_has_pincount(const struct folio *folio) { if (IS_ENABLED(CONFIG_64BIT)) return folio_test_large(folio); return folio_order(folio) > 1; } /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. * * This function checks if a folio has been pinned via a call to * a function in the pin_user_pages() family. * * For small folios, the return value is partially fuzzy: false is not fuzzy, * because it means "definitely not pinned for DMA", but true means "probably * pinned for DMA, but possibly a false positive due to having at least * GUP_PIN_COUNTING_BIAS worth of normal folio references". * * False positives are OK, because: a) it's unlikely for a folio to * get that many refcounts, and b) all the callers of this routine are * expected to be able to deal gracefully with a false positive. * * For most large folios, the result will be exactly correct. That's because * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. * * Return: True, if it is likely that the folio has been "dma-pinned". * False, if the folio is definitely not dma-pinned. */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { if (folio_has_pincount(folio)) return atomic_read(&folio->_pincount) > 0; /* * folio_ref_count() is signed. If that refcount overflows, then * folio_ref_count() returns a negative value, and callers will avoid * further incrementing the refcount. * * Here, for that overflow case, use the sign bit to count a little * bit higher via unsigned math, and thus still get an accurate result. */ return ((unsigned int)folio_ref_count(folio)) >= GUP_PIN_COUNTING_BIAS; } /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for an anon page on the src mm. * * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. */ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, struct folio *folio) { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) return false; return folio_maybe_dma_pinned(folio); } /** * is_zero_page - Query if a page is a zero page * @page: The page to query * * This returns true if @page is one of the permanent zero pages. */ static inline bool is_zero_page(const struct page *page) { return is_zero_pfn(page_to_pfn(page)); } /** * is_zero_folio - Query if a folio is a zero page * @folio: The folio to query * * This returns true if @folio is one of the permanent zero pages. */ static inline bool is_zero_folio(const struct folio *folio) { return is_zero_page(&folio->page); } /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */ #ifdef CONFIG_MIGRATION static inline bool folio_is_longterm_pinnable(struct folio *folio) { #ifdef CONFIG_CMA int mt = folio_migratetype(folio); if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif /* The zero page can be "pinned" but gets special handling. */ if (is_zero_folio(folio)) return true; /* Coherent device memory must always allow eviction. */ if (folio_is_device_coherent(folio)) return false; /* * Filesystems can only tolerate transient delays to truncate and * hole-punch operations */ if (folio_is_fsdax(folio)) return false; /* Otherwise, non-movable zone folios can be pinned. */ return !folio_is_zone_movable(folio); } #else static inline bool folio_is_longterm_pinnable(struct folio *folio) { return true; } #endif static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } static inline void set_page_node(struct page *page, unsigned long node) { page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } static inline void set_page_links(struct page *page, enum zone_type zone, unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); #ifdef SECTION_IN_PAGE_FLAGS set_page_section(page, pfn_to_section_nr(pfn)); #endif } /** * folio_nr_pages - The number of pages in the folio. * @folio: The folio. * * Return: A positive power of two. */ static inline long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; return folio_large_nr_pages(folio); } /* Only hugetlbfs can allocate folios larger than MAX_ORDER */ #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE #define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) #else #define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES #endif /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ static inline long compound_nr(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 1; return folio_large_nr_pages(folio); } /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. * * If you have physically contiguous memory which may span more than * one folio (eg a &struct bio_vec), use this function to move from one * folio to the next. Do not use it if the memory is only virtually * contiguous as the folios are almost certainly not adjacent to each * other. This is the folio equivalent to writing ``page++``. * * Context: We assume that the folios are refcounted and/or locked at a * higher level and do not adjust the reference counts. * Return: The next struct folio. */ static inline struct folio *folio_next(struct folio *folio) { return (struct folio *)folio_page(folio, folio_nr_pages(folio)); } /** * folio_shift - The size of the memory described by this folio. * @folio: The folio. * * A folio represents a number of bytes which is a power-of-two in size. * This function tells you which power-of-two the folio is. See also * folio_size() and folio_order(). * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The base-2 logarithm of the size of this folio. */ static inline unsigned int folio_shift(const struct folio *folio) { return PAGE_SHIFT + folio_order(folio); } /** * folio_size - The number of bytes in a folio. * @folio: The folio. * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The number of bytes in this folio. */ static inline size_t folio_size(const struct folio *folio) { return PAGE_SIZE << folio_order(folio); } /** * folio_maybe_mapped_shared - Whether the folio is mapped into the page * tables of more than one MM * @folio: The folio. * * This function checks if the folio maybe currently mapped into more than one * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single * MM ("mapped exclusively"). * * For KSM folios, this function also returns "mapped shared" when a folio is * mapped multiple times into the same MM, because the individual page mappings * are independent. * * For small anonymous folios and anonymous hugetlb folios, the return * value will be exactly correct: non-KSM folios can only be mapped at most once * into an MM, and they cannot be partially mapped. KSM folios are * considered shared even if mapped multiple times into the same MM. * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly * indicate "mapped shared" (false positive) if a folio was mapped by * more than two MMs at one point in time. * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. * * Further, this function only considers current page table mappings that * are tracked using the folio mapcount(s). * * This function does not consider: * #. If the folio might get mapped in the (near) future (e.g., swapcache, * pagecache, temporary unmapping for migration). * #. If the folio is mapped differently (VM_PFNMAP). * #. If hugetlb page table sharing applies. Callers might want to check * hugetlb_pmd_shared(). * * Return: Whether the folio is estimated to be mapped into more than one MM. */ static inline bool folio_maybe_mapped_shared(struct folio *folio) { int mapcount = folio_mapcount(folio); /* Only partially-mappable folios require more care. */ if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) return mapcount > 1; /* * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ... * simply assume "mapped shared", nobody should really care * about this for arbitrary kernel allocations. */ if (!IS_ENABLED(CONFIG_MM_ID)) return true; /* * A single mapping implies "mapped exclusively", even if the * folio flag says something different: it's easier to handle this * case here instead of on the RMAP hot path. */ if (mapcount <= 1) return false; return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); } /** * folio_expected_ref_count - calculate the expected folio refcount * @folio: the folio * * Calculate the expected folio refcount, taking references from the pagecache, * swapcache, PG_private and page table mappings into account. Useful in * combination with folio_ref_count() to detect unexpected references (e.g., * GUP or other temporary references). * * Does currently not consider references from the LRU cache. If the folio * was isolated from the LRU (which is the case during migration or split), * the LRU cache does not apply. * * Calling this function on an unmapped folio -- !folio_mapped() -- that is * locked will return a stable result. * * Calling this function on a mapped folio will not result in a stable result, * because nothing stops additional page table mappings from coming (e.g., * fork()) or going (e.g., munmap()). * * Calling this function without the folio lock will also not result in a * stable result: for example, the folio might get dropped from the swapcache * concurrently. * * However, even when called without the folio lock or on a mapped folio, * this function can be used to detect unexpected references early (for example, * if it makes sense to even lock the folio and unmap it). * * The caller must add any reference (e.g., from folio_try_get()) it might be * holding itself to the result. * * Returns the expected folio refcount. */ static inline int folio_expected_ref_count(const struct folio *folio) { const int order = folio_order(folio); int ref_count = 0; if (WARN_ON_ONCE(folio_test_slab(folio))) return 0; if (folio_test_anon(folio)) { /* One reference per page from the swapcache. */ ref_count += folio_test_swapcache(folio) << order; } else if (!((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS)) { /* One reference per page from the pagecache. */ ref_count += !!folio->mapping << order; /* One reference from PG_private. */ ref_count += folio_test_private(folio); } /* One reference per page table mapping. */ return ref_count + folio_mapcount(folio); } #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE static inline int arch_make_folio_accessible(struct folio *folio) { return 0; } #endif /* * Some inline functions in vmstat.h depend on page_zone() */ #include <linux/vmstat.h> #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif #if defined(WANT_PAGE_VIRTUAL) static inline void *page_address(const struct page *page) { return page->virtual; } static inline void set_page_address(struct page *page, void *address) { page->virtual = address; } #define page_address_init() do { } while(0) #endif #if defined(HASHED_PAGE_VIRTUAL) void *page_address(const struct page *page); void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif static __always_inline void *lowmem_page_address(const struct page *page) { return page_to_virt(page); } #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) #define page_address_init() do { } while(0) #endif static inline void *folio_address(const struct folio *folio) { return page_address(&folio->page); } /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool page_is_pfmemalloc(const struct page *page) { /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information. */ return (uintptr_t)page->lru.next & BIT(1); } /* * Return true only if the folio has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool folio_is_pfmemalloc(const struct folio *folio) { /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information. */ return (uintptr_t)folio->lru.next & BIT(1); } /* * Only to be called by the page allocator on a freshly allocated * page. */ static inline void set_page_pfmemalloc(struct page *page) { page->lru.next = (void *)BIT(1); } static inline void clear_page_pfmemalloc(struct page *page) { page->lru.next = NULL; } /* * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. */ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? */ bool reclaim_pt; /* Need reclaim page tables? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; /* * Whether to drop the pte markers, for example, the uffd-wp information for * file-backed memory. This should only be specified when we will completely * drop the page in the mm, either by truncation or unmapping of the vma. By * default, the flag is not set. */ #define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) #ifdef CONFIG_SCHED_MM_CID void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit_signals(struct task_struct *t); static inline int task_mm_cid(struct task_struct *t) { return t->mm_cid; } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } static inline int task_mm_cid(struct task_struct *t) { /* * Use the processor id as a fall-back when the mm cid feature is * disabled. This provides functional per-cpu data structure accesses * in user-space, althrough it won't provide the memory usage benefits. */ return raw_smp_processor_id(); } #endif #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else static inline bool can_do_mlock(void) { return false; } #endif extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *); struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details); static inline void zap_vma_pages(struct vm_area_struct *vma) { zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); } void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long start, unsigned long end, unsigned long tree_end, bool mm_wr_locked); struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); struct follow_pfnmap_args { /** * Inputs: * @vma: Pointer to @vm_area_struct struct * @address: the virtual address to walk */ struct vm_area_struct *vma; unsigned long address; /** * Internals: * * The caller shouldn't touch any of these. */ spinlock_t *lock; pte_t *ptep; /** * Outputs: * * @pfn: the PFN of the address * @addr_mask: address mask covering pfn * @pgprot: the pgprot_t of the mapping * @writable: whether the mapping is writable * @special: whether the mapping is a special mapping (real PFN maps) */ unsigned long pfn; unsigned long addr_mask; pgprot_t pgprot; bool writable; bool special; }; int follow_pfnmap_start(struct follow_pfnmap_args *args); void follow_pfnmap_end(struct follow_pfnmap_args *args); extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int generic_error_remove_folio(struct address_space *mapping, struct folio *folio); struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long address, struct pt_regs *regs); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs); extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* should never happen if there's no MMU */ BUG(); return VM_FAULT_SIGBUS; } static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ BUG(); return -EFAULT; } static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif static inline void unmap_shared_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen) { unmap_mapping_range(mapping, holebegin, holelen, 0); } static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); #ifdef CONFIG_BPF_SYSCALL extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); #endif long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); /* * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT. */ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, unsigned long addr, int gup_flags, struct vm_area_struct **vmap) { struct page *page; struct vm_area_struct *vma; int got; if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT))) return ERR_PTR(-EINVAL); got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); if (got < 0) return ERR_PTR(got); vma = vma_lookup(mm, addr); if (WARN_ON_ONCE(!vma)) { put_page(page); return ERR_PTR(-EINVAL); } *vmap = vma; return page; } long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, struct folio **folios, unsigned int max_folios, pgoff_t *offset); int folio_add_pins(struct folio *folio, unsigned int pins); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); struct kvec; struct page *get_dump_page(unsigned long addr, int *locked); bool folio_mark_dirty(struct folio *folio); bool folio_mark_dirty_lock(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); /* * Flags used by change_protection(). For now we make it a bitmap so * that we can pass in multiple flags just like parameters. However * for now all the callers are only use one of the flags at the same * time. */ /* * Whether we should manually check if we can map individual PTEs writable, * because something (e.g., COW, uffd-wp) blocks that from happening for all * PTEs automatically in a writable mapping. */ #define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) /* Whether this protection change is for NUMA hints */ #define MM_CP_PROT_NUMA (1UL << 1) /* Whether this change is for write protecting */ #define MM_CP_UFFD_WP (1UL << 2) /* do wp */ #define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); /* * doesn't attempt to fault and will return short. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); static inline bool get_user_page_fast_only(unsigned long addr, unsigned int gup_flags, struct page **pagep) { return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; } /* * per-process(per-mm_struct) statistics. */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { return percpu_counter_read_positive(&mm->rss_stat[member]); } void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { percpu_counter_add(&mm->rss_stat[member], value); mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { percpu_counter_inc(&mm->rss_stat[member]); mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { percpu_counter_dec(&mm->rss_stat[member]); mm_trace_rss_stat(mm, member); } /* Optimized variant when folio is already known not to be anon */ static inline int mm_counter_file(struct folio *folio) { if (folio_test_swapbacked(folio)) return MM_SHMEMPAGES; return MM_FILEPAGES; } static inline int mm_counter(struct folio *folio) { if (folio_test_anon(folio)) return MM_ANONPAGES; return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) { return get_mm_counter(mm, MM_FILEPAGES) + get_mm_counter(mm, MM_ANONPAGES) + get_mm_counter(mm, MM_SHMEMPAGES); } static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) { return max(mm->hiwater_rss, get_mm_rss(mm)); } static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) { return max(mm->hiwater_vm, mm->total_vm); } static inline void update_hiwater_rss(struct mm_struct *mm) { unsigned long _rss = get_mm_rss(mm); if (data_race(mm->hiwater_rss) < _rss) (mm)->hiwater_rss = _rss; } static inline void update_hiwater_vm(struct mm_struct *mm) { if (mm->hiwater_vm < mm->total_vm) mm->hiwater_vm = mm->total_vm; } static inline void reset_mm_hiwater_rss(struct mm_struct *mm) { mm->hiwater_rss = get_mm_rss(mm); } static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, struct mm_struct *mm) { unsigned long hiwater_rss = get_mm_hiwater_rss(mm); if (*maxrss < hiwater_rss) *maxrss = hiwater_rss; } #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_mkspecial(pte_t pte) { return pte; } #endif #ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP static inline bool pmd_special(pmd_t pmd) { return false; } static inline pmd_t pmd_mkspecial(pmd_t pmd) { return pmd; } #endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ #ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP static inline bool pud_special(pud_t pud) { return false; } static inline pud_t pud_mkspecial(pud_t pud) { return pud; } #endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { return 0; } #endif extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pte_t *ptep; __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl)); return ptep; } #ifdef __PAGETABLE_P4D_FOLDED static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return 0; } #else int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif #if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); static inline void mm_inc_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) return; atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) return; atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return 0; } static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); static inline void mm_inc_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) return; atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) return; atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } #endif #ifdef CONFIG_MMU static inline void mm_pgtables_bytes_init(struct mm_struct *mm) { atomic_long_set(&mm->pgtables_bytes, 0); } static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return atomic_long_read(&mm->pgtables_bytes); } static inline void mm_inc_nr_ptes(struct mm_struct *mm) { atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_ptes(struct mm_struct *mm) { atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } #else static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return 0; } static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} #endif int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); int __pte_alloc_kernel(pmd_t *pmd); #if defined(CONFIG_MMU) static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ? NULL : p4d_offset(pgd, address); } static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? NULL : pud_offset(p4d, address); } static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } #endif /* CONFIG_MMU */ static inline struct ptdesc *virt_to_ptdesc(const void *x) { return page_ptdesc(virt_to_page(x)); } static inline void *ptdesc_to_virt(const struct ptdesc *pt) { return page_to_virt(ptdesc_page(pt)); } static inline void *ptdesc_address(const struct ptdesc *pt) { return folio_address(ptdesc_folio(pt)); } static inline bool pagetable_is_reserved(struct ptdesc *pt) { return folio_test_reserved(ptdesc_folio(pt)); } /** * pagetable_alloc - Allocate pagetables * @gfp: GFP flags * @order: desired pagetable order * * pagetable_alloc allocates memory for page tables as well as a page table * descriptor to describe that memory. * * Return: The ptdesc describing the allocated page tables. */ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) { struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); return page_ptdesc(page); } #define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) /** * pagetable_free - Free pagetables * @pt: The page table descriptor * * pagetable_free frees the memory of all page tables described by a page * table descriptor and the memory for the descriptor itself. */ static inline void pagetable_free(struct ptdesc *pt) { struct page *page = ptdesc_page(pt); __free_pages(page, compound_order(page)); } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); void ptlock_free(struct ptdesc *ptdesc); static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return ptdesc->ptl; } #else /* ALLOC_SPLIT_PTLOCKS */ static inline void ptlock_cache_init(void) { } static inline bool ptlock_alloc(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) { } static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return &ptdesc->ptl; } #endif /* ALLOC_SPLIT_PTLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) { BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); return ptlock_ptr(virt_to_ptdesc(pte)); } static inline bool ptlock_init(struct ptdesc *ptdesc) { /* * prep_new_page() initialize page->private (and therefore page->ptl) * with 0. Make sure nobody took it in use in between. * * It can happen if arch try to use slab for page table allocation: * slab code uses page->slab_cache, which share storage with page->ptl. */ VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc)); if (!ptlock_alloc(ptdesc)) return false; spin_lock_init(ptlock_ptr(ptdesc)); return true; } #else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) { return &mm->page_table_lock; } static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ static inline void __pagetable_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); __folio_set_pgtable(folio); lruvec_stat_add_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); ptlock_free(ptdesc); __folio_clear_pgtable(folio); lruvec_stat_sub_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) { pagetable_dtor(ptdesc); pagetable_free(ptdesc); } static inline bool pagetable_pte_ctor(struct mm_struct *mm, struct ptdesc *ptdesc) { if (mm != &init_mm && !ptlock_init(ptdesc)) return false; __pagetable_ctor(ptdesc); return true; } pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { pte_t *pte; __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); return pte; } static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); } pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { pte_t *pte; __cond_lock(RCU, __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); return pte; } pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp, spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ pte_unmap(pte); \ } while (0) #define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd)) #define pte_alloc_map(mm, pmd, address) \ (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ (pte_alloc(mm, pmd) ? \ NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) #if defined(CONFIG_SPLIT_PMD_PTLOCKS) static inline struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); } static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) { return page_ptdesc(pmd_pgtable_page(pmd)); } static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(pmd_ptdesc(pmd)); } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptdesc->pmd_huge_pte = NULL; #endif return ptlock_init(ptdesc); } #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) #endif static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl = pmd_lockptr(mm, pmd); spin_lock(ptl); return ptl; } static inline bool pagetable_pmd_ctor(struct mm_struct *mm, struct ptdesc *ptdesc) { if (mm != &init_mm && !pmd_ptlock_init(ptdesc)) return false; ptdesc_pmd_pts_init(ptdesc); __pagetable_ctor(ptdesc); return true; } /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be * considered ready to switch to split PUD locks yet; there may be places * which need to be converted from page_table_lock. */ static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud) { return &mm->page_table_lock; } static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) { spinlock_t *ptl = pud_lockptr(mm, pud); spin_lock(ptl); return ptl; } static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } extern void __init pagecache_init(void); extern void free_initmem(void); /* * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) * into the buddy system. The freed pages will be poisoned with pattern * "poison" if it's within range [0, UCHAR_MAX]. * Return pages freed into the buddy system. */ extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); extern void adjust_managed_page_count(struct page *page, long count); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid); /* Free the reserved page into the buddy system, so it gets managed. */ void free_reserved_page(struct page *page); static inline void mark_page_reserved(struct page *page) { SetPageReserved(page); adjust_managed_page_count(page, -1); } static inline void free_reserved_ptdesc(struct ptdesc *pt) { free_reserved_page(ptdesc_page(pt)); } /* * Default method to free all the __init memory into the buddy system. * The freed pages will be poisoned with pattern "poison" if it's within * range [0, UCHAR_MAX]. * Return pages freed into the buddy system. */ static inline unsigned long free_initmem_default(int poison) { extern char __init_begin[], __init_end[]; return free_reserved_area(&__init_begin, &__init_end, poison, "unused kernel image (initmem)"); } static inline unsigned long get_num_physpages(void) { int nid; unsigned long phys_pages = 0; for_each_online_node(nid) phys_pages += node_present_pages(nid); return phys_pages; } /* * Using memblock node mappings, an architecture may initialise its * zones, allocate the backing mem_map and account for memory holes in an * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling * free_area_init() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); #ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) { return 0; } #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void mem_init(void); extern void __init mmap_init(void); extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); static inline void show_mem(void) { __show_mem(0, NULL, MAX_NR_ZONES - 1); } extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern __printf(3, 4) void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, struct rb_root_cached *root); void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, struct rb_root_cached *root); void vma_interval_tree_remove(struct vm_area_struct *node, struct rb_root_cached *root); struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, unsigned long start, unsigned long last); #define vma_interval_tree_foreach(vma, root, start, last) \ for (vma = vma_interval_tree_iter_first(root, start, last); \ vma; vma = vma_interval_tree_iter_next(vma, start, last)) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root_cached *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, struct rb_root_cached *root); struct anon_vma_chain * anon_vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct anon_vma_chain *anon_vma_interval_tree_iter_next( struct anon_vma_chain *node, unsigned long start, unsigned long last); #ifdef CONFIG_DEBUG_VM_RB void anon_vma_interval_tree_verify(struct anon_vma_chain *node); #endif #define anon_vma_interval_tree_foreach(avc, root, start, last) \ for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, unsigned long end_data, unsigned long start_data) { if (rlim < RLIM_INFINITY) { if (((new - start) + (end_data - start_data)) > rlim) return -ENOSPC; } return 0; } extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); extern bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm); extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, const struct vm_special_mapping *spec); unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return __get_unmapped_area(file, addr, len, pgoff, flags, 0); } extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) { /* Ignore errors */ (void) __mm_populate(addr, len, 1); } #else static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* This takes the mm semaphore itself */ extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); struct vm_unmapped_area_info { #define VM_UNMAPPED_AREA_TOPDOWN 1 unsigned long flags; unsigned long length; unsigned long low_limit; unsigned long high_limit; unsigned long align_mask; unsigned long align_offset; unsigned long start_gap; }; extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); /* truncate.c */ extern void truncate_inode_pages(struct address_space *, loff_t); extern void truncate_inode_pages_range(struct address_space *, loff_t lstart, loff_t lend); extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); /* * Look up the first VMA which intersects the interval [start_addr, end_addr) * NULL if none. Assume start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr); /** * vma_lookup() - Find a VMA at a specific address * @mm: The process address space. * @addr: The user address. * * Return: The vm_area_struct at the given address, %NULL otherwise. */ static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { return mtree_load(&mm->mm_mt, addr); } static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) { if (vma->vm_flags & VM_GROWSDOWN) return stack_guard_gap; /* See reasoning around the VM_SHADOW_STACK definition */ if (vma->vm_flags & VM_SHADOW_STACK) return PAGE_SIZE; return 0; } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) { unsigned long gap = stack_guard_start_gap(vma); unsigned long vm_start = vma->vm_start; vm_start -= gap; if (vm_start > vma->vm_start) vm_start = 0; return vm_start; } static inline unsigned long vm_end_gap(struct vm_area_struct *vma) { unsigned long vm_end = vma->vm_end; if (vma->vm_flags & VM_GROWSUP) { vm_end += stack_guard_gap; if (vm_end < vma->vm_end) vm_end = -PAGE_SIZE; } return vm_end; } static inline unsigned long vma_pages(struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; return vma; } static inline bool range_in_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { return (vma && vma->vm_start <= start && end <= vma->vm_end); } #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); void vma_set_page_prot(struct vm_area_struct *vma); #else static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) { return __pgprot(0); } static inline void vma_set_page_prot(struct vm_area_struct *vma) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } #endif void vma_set_file(struct vm_area_struct *vma, struct file *file); #ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num); vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, bool write); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { int err = vm_insert_page(vma, addr, page); if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } #ifndef io_remap_pfn_range static inline int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); } #endif static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) return VM_FAULT_OOM; else if (err == -EHWPOISON) return VM_FAULT_HWPOISON; return VM_FAULT_SIGBUS; } /* * Convert errno to return value for ->page_mkwrite() calls. * * This should eventually be merged with vmf_error() above, but will need a * careful audit of all vmf_error() callers. */ static inline vm_fault_t vmf_fs_error(int err) { if (err == 0) return VM_FAULT_LOCKED; if (err == -EFAULT || err == -EAGAIN) return VM_FAULT_NOPAGE; if (err == -ENOMEM) return VM_FAULT_OOM; /* -ENOSPC, -EDQUOT, -EIO ... */ return VM_FAULT_SIGBUS; } static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) return -ENOMEM; if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT; if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) return -EFAULT; return 0; } /* * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. */ static inline bool gup_can_follow_protnone(struct vm_area_struct *vma, unsigned int flags) { /* * If callers don't want to honor NUMA hinting faults, no need to * determine if we would actually have to trigger a NUMA hinting fault. */ if (!(flags & FOLL_HONOR_NUMA_FAULT)) return true; /* * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs. * * Requiring a fault here even for inaccessible VMAs would mean that * FOLL_FORCE cannot make any progress, because handle_mm_fault() * refuses to process NUMA hinting faults in inaccessible VMAs. */ return !vma_is_accessible(vma); } typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); #ifdef CONFIG_PAGE_POISONING extern void __kernel_poison_pages(struct page *page, int numpages); extern void __kernel_unpoison_pages(struct page *page, int numpages); extern bool _page_poisoning_enabled_early; DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); static inline bool page_poisoning_enabled(void) { return _page_poisoning_enabled_early; } /* * For use in fast paths after init_mem_debugging() has run, or when a * false negative result is not harmful when called too early. */ static inline bool page_poisoning_enabled_static(void) { return static_branch_unlikely(&_page_poisoning_enabled); } static inline void kernel_poison_pages(struct page *page, int numpages) { if (page_poisoning_enabled_static()) __kernel_poison_pages(page, numpages); } static inline void kernel_unpoison_pages(struct page *page, int numpages) { if (page_poisoning_enabled_static()) __kernel_unpoison_pages(page, numpages); } #else static inline bool page_poisoning_enabled(void) { return false; } static inline bool page_poisoning_enabled_static(void) { return false; } static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } static inline void kernel_poison_pages(struct page *page, int numpages) { } static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); static inline bool want_init_on_alloc(gfp_t flags) { if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc)) return true; return flags & __GFP_ZERO; } DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, &init_on_free); } extern bool _debug_pagealloc_enabled_early; DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); static inline bool debug_pagealloc_enabled(void) { return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled_early; } /* * For use in fast paths after mem_debugging_and_hardening_init() has run, * or when a false negative result is not harmful when called too early. */ static inline bool debug_pagealloc_enabled_static(void) { if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) return false; return static_branch_unlikely(&_debug_pagealloc_enabled); } /* * To support DEBUG_PAGEALLOC architecture must ensure that * __kernel_map_pages() never fails */ extern void __kernel_map_pages(struct page *page, int numpages, int enable); #ifdef CONFIG_DEBUG_PAGEALLOC static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 1); } static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 0); } extern unsigned int _debug_guardpage_minorder; DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static inline unsigned int debug_guardpage_minorder(void) { return _debug_guardpage_minorder; } static inline bool debug_guardpage_enabled(void) { return static_branch_unlikely(&_debug_guardpage_enabled); } static inline bool page_is_guard(struct page *page) { if (!debug_guardpage_enabled()) return false; return PageGuard(page); } bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order) { if (!debug_guardpage_enabled()) return false; return __set_page_guard(zone, page, order); } void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) { if (!debug_guardpage_enabled()) return; __clear_page_guard(zone, page, order); } #else /* CONFIG_DEBUG_PAGEALLOC */ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } static inline bool page_is_guard(struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); extern int in_gate_area(struct mm_struct *mm, unsigned long addr); #else static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; } static inline int in_gate_area_no_mm(unsigned long addr) { return 0; } static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) { return 0; } #endif /* __HAVE_ARCH_GATE_AREA */ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); void drop_slab(void); #ifndef CONFIG_MMU #define randomize_va_space 0 #else extern int randomize_va_space; #endif const char * arch_vma_name(struct vm_area_struct *vma); #ifdef CONFIG_MMU void print_vma_addr(char *prefix, unsigned long rip); #else static inline void print_vma_addr(char *prefix, unsigned long rip) { } #endif void *sparse_buffer_alloc(unsigned long size); unsigned long section_map_size(void); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct vmem_altmap *altmap, unsigned long ptpfn, unsigned long flags); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next); int vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr, unsigned long next); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ if (altmap) return altmap->reserve + altmap->free; return 0; } static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { altmap->alloc -= nr_pfns; } #else static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { return 0; } static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { } #endif #define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { unsigned long nr_pages; unsigned long nr_vmemmap_pages; if (!pgmap || !is_power_of_2(sizeof(struct page))) return false; nr_pages = pgmap_vmemmap_nr(pgmap); nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); /* * For vmemmap optimization with DAX we need minimum 2 vmemmap * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst */ return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); } /* * If we don't have an architecture override, use the generic rule */ #ifndef vmemmap_can_optimize #define vmemmap_can_optimize __vmemmap_can_optimize #endif #else static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { return false; } #endif enum mf_flags { MF_COUNT_INCREASED = 1 << 0, MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, MF_NO_RETRY = 1 << 6, MF_MEM_PRE_REMOVE = 1 << 7, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE /* * Sysfs entries for memory failure handling statistics. */ extern const struct attribute_group memory_failure_attr_group; extern void memory_failure_queue(unsigned long pfn, int flags); extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); #else static inline void memory_failure_queue(unsigned long pfn, int flags) { } static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } static inline void num_poisoned_pages_inc(unsigned long pfn) { } static inline void num_poisoned_pages_sub(unsigned long pfn, long i) { } #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) extern void memblk_nr_poison_inc(unsigned long pfn); extern void memblk_nr_poison_sub(unsigned long pfn, long i); #else static inline void memblk_nr_poison_inc(unsigned long pfn) { } static inline void memblk_nr_poison_sub(unsigned long pfn, long i) { } #endif #ifndef arch_memory_failure static inline int arch_memory_failure(unsigned long pfn, int flags) { return -ENXIO; } #endif #ifndef arch_is_platform_page static inline bool arch_is_platform_page(u64 paddr) { return false; } #endif /* * Error handlers for various types of pages. */ enum mf_result { MF_IGNORED, /* Error: cannot be handled */ MF_FAILED, /* Error: handling failed */ MF_DELAYED, /* Will be handled later */ MF_RECOVERED, /* Successfully recovered */ }; enum mf_action_page_type { MF_MSG_KERNEL, MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, MF_MSG_GET_HWPOISON, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, MF_MSG_DIRTY_MLOCKED_LRU, MF_MSG_CLEAN_MLOCKED_LRU, MF_MSG_DIRTY_UNEVICTABLE_LRU, MF_MSG_CLEAN_UNEVICTABLE_LRU, MF_MSG_DIRTY_LRU, MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, MF_MSG_UNKNOWN, }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) void folio_zero_user(struct folio *folio, unsigned long addr_hint); int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma); long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault); /** * vma_is_special_huge - Are transhuge page-table entries considered special? * @vma: Pointer to the struct vm_area_struct to consider * * Whether transhuge page-table entries are considered "special" following * the definition in vm_normal_page(). * * Return: true if transhuge page-table entries should be considered special, * false otherwise. */ static inline bool vma_is_special_huge(const struct vm_area_struct *vma) { return vma_is_dax(vma) || (vma->vm_file && (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if MAX_NUMNODES > 1 void __init setup_nr_node_ids(void); #else static inline void setup_nr_node_ids(void) {} #endif extern int memcmp_pages(struct page *page1, struct page *page2); static inline int pages_identical(struct page *page1, struct page *page2) { return !memcmp_pages(page1, page2); } #ifdef CONFIG_MAPPING_DIRTY_HELPERS unsigned long clean_record_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, pgoff_t bitmap_pgoff, unsigned long *bitmap, pgoff_t *start, pgoff_t *end); unsigned long wp_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr); #endif #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name); #else static inline int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name) { return 0; } #endif #ifdef CONFIG_UNACCEPTED_MEMORY bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); void accept_memory(phys_addr_t start, unsigned long size); #else static inline bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size) { return false; } static inline void accept_memory(phys_addr_t start, unsigned long size) { } #endif static inline bool pfn_is_unaccepted_memory(unsigned long pfn) { return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); } void vma_pgtable_walk_begin(struct vm_area_struct *vma); void vma_pgtable_walk_end(struct vm_area_struct *vma); int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); int reserve_mem_release_by_name(const char *name); #ifdef CONFIG_64BIT int do_mseal(unsigned long start, size_t len_in, unsigned long flags); #else static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags) { /* noop on 32 bit */ return 0; } #endif /* * user_alloc_needs_zeroing checks if a user folio from page allocator needs to * be zeroed or not. */ static inline bool user_alloc_needs_zeroing(void) { /* * for user folios, arch with cache aliasing requires cache flush and * arc changes folio->flags to make icache coherent with dcache, so * always return false to make caller use * clear_user_page()/clear_user_highpage(). */ return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc); } int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); /* * mseal of userspace process's system mappings. */ #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS #define VM_SEALED_SYSMAP VM_SEALED #else #define VM_SEALED_SYSMAP VM_NONE #endif /* * DMA mapping IDs for page_pool * * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and * stashes it in the upper bits of page->pp_magic. We always want to be able to * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP * pages can have arbitrary kernel pointers stored in the same field as pp_magic * (since it overlaps with page->lru.next), so we must ensure that we cannot * mistake a valid kernel pointer with any of the values we write into this * field. * * On architectures that set POISON_POINTER_DELTA, this is already ensured, * since this value becomes part of PP_SIGNATURE; meaning we can just use the * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is * 0, we make sure that we leave the two topmost bits empty, as that guarantees * we won't mistake a valid kernel pointer for a value we set, regardless of the * VMSPLIT setting. * * Altogether, this means that the number of bits available is constrained by * the size of an unsigned long (at the upper end, subtracting two bits per the * above), and the definition of PP_SIGNATURE (with or without * POISON_POINTER_DELTA). */ #define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) #if POISON_POINTER_DELTA > 0 /* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA * index to not overlap with that if set */ #define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) #else /* Always leave out the topmost two; see above. */ #define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) #endif #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ PP_DMA_INDEX_SHIFT) /* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for * the head page of compound page and bit 1 for pfmemalloc page, as well as the * bits used for the DMA index. page_is_pfmemalloc() is checked in * __page_pool_put_page() to avoid recycling the pfmemalloc page. */ #define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) #ifdef CONFIG_PAGE_POOL static inline bool page_pool_page_is_pp(struct page *page) { return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; } #else static inline bool page_pool_page_is_pp(struct page *page) { return false; } #endif #endif /* _LINUX_MM_H */
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool_netlink.h> #include <linux/bitmap.h> #include "netlink.h" #include "bitset.h" /* Some bitmaps are internally represented as an array of unsigned long, some * as an array of u32 (some even as single u32 for now). To avoid the need of * wrappers on caller side, we provide two set of functions: those with "32" * suffix in their names expect u32 based bitmaps, those without it expect * unsigned long bitmaps. */ static u32 ethnl_lower_bits(unsigned int n) { return ~(u32)0 >> (32 - n % 32); } static u32 ethnl_upper_bits(unsigned int n) { return ~(u32)0 << (n % 32); } /** * ethnl_bitmap32_clear() - Clear u32 based bitmap * @dst: bitmap to clear * @start: beginning of the interval * @end: end of the interval * @mod: set if bitmap was modified * * Clear @nbits bits of a bitmap with indices @start <= i < @end */ static void ethnl_bitmap32_clear(u32 *dst, unsigned int start, unsigned int end, bool *mod) { unsigned int start_word = start / 32; unsigned int end_word = end / 32; unsigned int i; u32 mask; if (end <= start) return; if (start % 32) { mask = ethnl_upper_bits(start); if (end_word == start_word) { mask &= ethnl_lower_bits(end); if (dst[start_word] & mask) { dst[start_word] &= ~mask; *mod = true; } return; } if (dst[start_word] & mask) { dst[start_word] &= ~mask; *mod = true; } start_word++; } for (i = start_word; i < end_word; i++) { if (dst[i]) { dst[i] = 0; *mod = true; } } if (end % 32) { mask = ethnl_lower_bits(end); if (dst[end_word] & mask) { dst[end_word] &= ~mask; *mod = true; } } } /** * ethnl_bitmap32_not_zero() - Check if any bit is set in an interval * @map: bitmap to test * @start: beginning of the interval * @end: end of the interval * * Return: true if there is non-zero bit with index @start <= i < @end, * false if the whole interval is zero */ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, unsigned int end) { unsigned int start_word = start / 32; unsigned int end_word = end / 32; u32 mask; if (end <= start) return true; if (start % 32) { mask = ethnl_upper_bits(start); if (end_word == start_word) { mask &= ethnl_lower_bits(end); return map[start_word] & mask; } if (map[start_word] & mask) return true; start_word++; } if (!memchr_inv(map + start_word, '\0', (end_word - start_word) * sizeof(u32))) return true; if (end % 32 == 0) return true; return map[end_word] & ethnl_lower_bits(end); } /** * ethnl_bitmap32_update() - Modify u32 based bitmap according to value/mask * pair * @dst: bitmap to update * @nbits: bit size of the bitmap * @value: values to set * @mask: mask of bits to set * @mod: set to true if bitmap is modified, preserve if not * * Set bits in @dst bitmap which are set in @mask to values from @value, leave * the rest untouched. If destination bitmap was modified, set @mod to true, * leave as it is if not. */ static void ethnl_bitmap32_update(u32 *dst, unsigned int nbits, const u32 *value, const u32 *mask, bool *mod) { while (nbits > 0) { u32 real_mask = mask ? *mask : ~(u32)0; u32 new_value; if (nbits < 32) real_mask &= ethnl_lower_bits(nbits); new_value = (*dst & ~real_mask) | (*value & real_mask); if (new_value != *dst) { *dst = new_value; *mod = true; } if (nbits <= 32) break; dst++; nbits -= 32; value++; if (mask) mask++; } } static bool ethnl_bitmap32_test_bit(const u32 *map, unsigned int index) { return map[index / 32] & (1U << (index % 32)); } /** * ethnl_bitset32_size() - Calculate size of bitset nested attribute * @val: value bitmap (u32 based) * @mask: mask bitmap (u32 based, optional) * @nbits: bit length of the bitset * @names: array of bit names (optional) * @compact: assume compact format for output * * Estimate length of netlink attribute composed by a later call to * ethnl_put_bitset32() call with the same arguments. * * Return: negative error code or attribute length estimate */ int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { unsigned int len = 0; /* list flag */ if (!mask) len += nla_total_size(sizeof(u32)); /* size */ len += nla_total_size(sizeof(u32)); if (compact) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); /* value, mask */ len += (mask ? 2 : 1) * nla_total_size(nwords * sizeof(u32)); } else { unsigned int bits_len = 0; unsigned int bit_len, i; for (i = 0; i < nbits; i++) { const char *name = names ? names[i] : NULL; if (!ethnl_bitmap32_test_bit(mask ?: val, i)) continue; /* index */ bit_len = nla_total_size(sizeof(u32)); /* name */ if (name) bit_len += ethnl_strz_size(name); /* value */ if (mask && ethnl_bitmap32_test_bit(val, i)) bit_len += nla_total_size(0); /* bit nest */ bits_len += nla_total_size(bit_len); } /* bits nest */ len += nla_total_size(bits_len); } /* outermost nest */ return nla_total_size(len); } /** * ethnl_put_bitset32() - Put a bitset nest into a message * @skb: skb with the message * @attrtype: attribute type for the bitset nest * @val: value bitmap (u32 based) * @mask: mask bitmap (u32 based, optional) * @nbits: bit length of the bitset * @names: array of bit names (optional) * @compact: use compact format for the output * * Compose a nested attribute representing a bitset. If @mask is null, simple * bitmap (bit list) is created, if @mask is provided, represent a value/mask * pair. Bit names are only used in verbose mode and when provided by calller. * * Return: 0 on success, negative error value on error */ int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val, const u32 *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { struct nlattr *nest; struct nlattr *attr; nest = nla_nest_start(skb, attrtype); if (!nest) return -EMSGSIZE; if (!mask && nla_put_flag(skb, ETHTOOL_A_BITSET_NOMASK)) goto nla_put_failure; if (nla_put_u32(skb, ETHTOOL_A_BITSET_SIZE, nbits)) goto nla_put_failure; if (compact) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); unsigned int nbytes = nwords * sizeof(u32); u32 *dst; attr = nla_reserve(skb, ETHTOOL_A_BITSET_VALUE, nbytes); if (!attr) goto nla_put_failure; dst = nla_data(attr); memcpy(dst, val, nbytes); if (nbits % 32) dst[nwords - 1] &= ethnl_lower_bits(nbits); if (mask) { attr = nla_reserve(skb, ETHTOOL_A_BITSET_MASK, nbytes); if (!attr) goto nla_put_failure; dst = nla_data(attr); memcpy(dst, mask, nbytes); if (nbits % 32) dst[nwords - 1] &= ethnl_lower_bits(nbits); } } else { struct nlattr *bits; unsigned int i; bits = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS); if (!bits) goto nla_put_failure; for (i = 0; i < nbits; i++) { const char *name = names ? names[i] : NULL; if (!ethnl_bitmap32_test_bit(mask ?: val, i)) continue; attr = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS_BIT); if (!attr) goto nla_put_failure; if (nla_put_u32(skb, ETHTOOL_A_BITSET_BIT_INDEX, i)) goto nla_put_failure; if (name && ethnl_put_strz(skb, ETHTOOL_A_BITSET_BIT_NAME, name)) goto nla_put_failure; if (mask && ethnl_bitmap32_test_bit(val, i) && nla_put_flag(skb, ETHTOOL_A_BITSET_BIT_VALUE)) goto nla_put_failure; nla_nest_end(skb, attr); } nla_nest_end(skb, bits); } nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct nla_policy bitset_policy[] = { [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG }, [ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32, ETHNL_MAX_BITSET_SIZE), [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED }, [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY }, [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY }, }; static const struct nla_policy bit_policy[] = { [ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING }, [ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG }, }; /** * ethnl_bitset_is_compact() - check if bitset attribute represents a compact * bitset * @bitset: nested attribute representing a bitset * @compact: pointer for return value * * Return: 0 on success, negative error code on failure */ int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; int ret; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, bitset, bitset_policy, NULL); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BITS]) { if (tb[ETHTOOL_A_BITSET_VALUE] || tb[ETHTOOL_A_BITSET_MASK]) return -EINVAL; *compact = false; return 0; } if (!tb[ETHTOOL_A_BITSET_SIZE] || !tb[ETHTOOL_A_BITSET_VALUE]) return -EINVAL; *compact = true; return 0; } /** * ethnl_name_to_idx() - look up string index for a name * @names: array of ETH_GSTRING_LEN sized strings * @n_names: number of strings in the array * @name: name to look up * * Return: index of the string if found, -ENOENT if not found */ static int ethnl_name_to_idx(ethnl_string_array_t names, unsigned int n_names, const char *name) { unsigned int i; if (!names) return -ENOENT; for (i = 0; i < n_names; i++) { /* names[i] may not be null terminated */ if (!strncmp(names[i], name, ETH_GSTRING_LEN) && strlen(name) <= ETH_GSTRING_LEN) return i; } return -ENOENT; } static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits, const struct nlattr *bit_attr, bool no_mask, ethnl_string_array_t names, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(bit_policy)]; int ret, idx; ret = nla_parse_nested(tb, ARRAY_SIZE(bit_policy) - 1, bit_attr, bit_policy, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BIT_INDEX]) { const char *name; idx = nla_get_u32(tb[ETHTOOL_A_BITSET_BIT_INDEX]); if (idx >= nbits) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_BIT_INDEX], "bit index too high"); return -EOPNOTSUPP; } name = names ? names[idx] : NULL; if (tb[ETHTOOL_A_BITSET_BIT_NAME] && name && strncmp(nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]), name, nla_len(tb[ETHTOOL_A_BITSET_BIT_NAME]))) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "bit index and name mismatch"); return -EINVAL; } } else if (tb[ETHTOOL_A_BITSET_BIT_NAME]) { idx = ethnl_name_to_idx(names, nbits, nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME])); if (idx < 0) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_BIT_NAME], "bit name not found"); return -EOPNOTSUPP; } } else { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "neither bit index nor name specified"); return -EINVAL; } *index = idx; *val = no_mask || tb[ETHTOOL_A_BITSET_BIT_VALUE]; return 0; } /** * ethnl_bitmap32_equal() - Compare two bitmaps * @map1: first bitmap * @map2: second bitmap * @nbits: bit size to compare * * Return: true if first @nbits are equal, false if not */ static bool ethnl_bitmap32_equal(const u32 *map1, const u32 *map2, unsigned int nbits) { if (memcmp(map1, map2, nbits / 32 * sizeof(u32))) return false; if (nbits % 32 == 0) return true; return !((map1[nbits / 32] ^ map2[nbits / 32]) & ethnl_lower_bits(nbits % 32)); } static int ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, struct nlattr **tb, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { u32 *saved_bitmap = NULL; struct nlattr *bit_attr; bool no_mask; int rem; int ret; if (tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "value only allowed in compact bitset"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask only allowed in compact bitset"); return -EINVAL; } no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; if (no_mask) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); unsigned int nbytes = nwords * sizeof(u32); bool dummy; /* The bitmap size is only the size of the map part without * its mask part. */ saved_bitmap = kcalloc(nwords, sizeof(u32), GFP_KERNEL); if (!saved_bitmap) return -ENOMEM; memcpy(saved_bitmap, bitmap, nbytes); ethnl_bitmap32_clear(bitmap, 0, nbits, &dummy); } nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { bool old_val, new_val; unsigned int idx; if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) { NL_SET_ERR_MSG_ATTR(extack, bit_attr, "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS"); kfree(saved_bitmap); return -EINVAL; } ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) { kfree(saved_bitmap); return ret; } old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32)); if (new_val != old_val) { if (new_val) bitmap[idx / 32] |= ((u32)1 << (idx % 32)); else bitmap[idx / 32] &= ~((u32)1 << (idx % 32)); if (!no_mask) *mod = true; } } if (no_mask && !ethnl_bitmap32_equal(saved_bitmap, bitmap, nbits)) *mod = true; kfree(saved_bitmap); return 0; } static int ethnl_compact_sanity_checks(unsigned int nbits, const struct nlattr *nest, struct nlattr **tb, struct netlink_ext_ack *extack) { bool no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; unsigned int attr_nbits, attr_nwords; const struct nlattr *test_attr; if (no_mask && tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask not allowed in list bitset"); return -EINVAL; } if (!tb[ETHTOOL_A_BITSET_SIZE]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing size in compact bitset"); return -EINVAL; } if (!tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing value in compact bitset"); return -EINVAL; } if (!no_mask && !tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, nest, "missing mask in compact nonlist bitset"); return -EINVAL; } attr_nbits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); attr_nwords = DIV_ROUND_UP(attr_nbits, 32); if (nla_len(tb[ETHTOOL_A_BITSET_VALUE]) != attr_nwords * sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "bitset value length does not match size"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK] && nla_len(tb[ETHTOOL_A_BITSET_MASK]) != attr_nwords * sizeof(u32)) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "bitset mask length does not match size"); return -EINVAL; } if (attr_nbits <= nbits) return 0; test_attr = no_mask ? tb[ETHTOOL_A_BITSET_VALUE] : tb[ETHTOOL_A_BITSET_MASK]; if (ethnl_bitmap32_not_zero(nla_data(test_attr), nbits, attr_nbits)) { NL_SET_ERR_MSG_ATTR(extack, test_attr, "cannot modify bits past kernel bitset size"); return -EINVAL; } return 0; } /** * ethnl_update_bitset32() - Apply a bitset nest to a u32 based bitmap * @bitmap: bitmap to update * @nbits: size of the updated bitmap in bits * @attr: nest attribute to parse and apply * @names: array of bit names; may be null for compact format * @extack: extack for error reporting * @mod: set this to true if bitmap is modified, leave as it is if not * * Apply bitset netsted attribute to a bitmap. If the attribute represents * a bit list, @bitmap is set to its contents; otherwise, bits in mask are * set to values from value. Bitmaps in the attribute may be longer than * @nbits but the message must not request modifying any bits past @nbits. * * Return: negative error code on failure, 0 on success */ int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; unsigned int change_bits; bool no_mask; int ret; if (!attr) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr, bitset_policy, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_BITSET_BITS]) return ethnl_update_bitset32_verbose(bitmap, nbits, attr, tb, names, extack, mod); ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); if (ret < 0) return ret; no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; change_bits = min_t(unsigned int, nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]), nbits); ethnl_bitmap32_update(bitmap, change_bits, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), no_mask ? NULL : nla_data(tb[ETHTOOL_A_BITSET_MASK]), mod); if (no_mask && change_bits < nbits) ethnl_bitmap32_clear(bitmap, change_bits, nbits, mod); return 0; } /** * ethnl_parse_bitset() - Compute effective value and mask from bitset nest * @val: unsigned long based bitmap to put value into * @mask: unsigned long based bitmap to put mask into * @nbits: size of @val and @mask bitmaps * @attr: nest attribute to parse and apply * @names: array of bit names; may be null for compact format * @extack: extack for error reporting * * Provide @nbits size long bitmaps for value and mask so that * x = (val & mask) | (x & ~mask) would modify any @nbits sized bitmap x * the same way ethnl_update_bitset() with the same bitset attribute would. * * Return: negative error code on failure, 0 on success */ int ethnl_parse_bitset(unsigned long *val, unsigned long *mask, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(bitset_policy)]; const struct nlattr *bit_attr; bool no_mask; int rem; int ret; if (!attr) return 0; ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr, bitset_policy, extack); if (ret < 0) return ret; no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; if (!tb[ETHTOOL_A_BITSET_BITS]) { unsigned int change_bits; ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack); if (ret < 0) return ret; change_bits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); if (change_bits > nbits) change_bits = nbits; bitmap_from_arr32(val, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), change_bits); if (change_bits < nbits) bitmap_clear(val, change_bits, nbits - change_bits); if (no_mask) { bitmap_fill(mask, nbits); } else { bitmap_from_arr32(mask, nla_data(tb[ETHTOOL_A_BITSET_MASK]), change_bits); if (change_bits < nbits) bitmap_clear(mask, change_bits, nbits - change_bits); } return 0; } if (tb[ETHTOOL_A_BITSET_VALUE]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE], "value only allowed in compact bitset"); return -EINVAL; } if (tb[ETHTOOL_A_BITSET_MASK]) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK], "mask only allowed in compact bitset"); return -EINVAL; } bitmap_zero(val, nbits); if (no_mask) bitmap_fill(mask, nbits); else bitmap_zero(mask, nbits); nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { unsigned int idx; bool bit_val; ret = ethnl_parse_bit(&idx, &bit_val, nbits, bit_attr, no_mask, names, extack); if (ret < 0) return ret; if (bit_val) __set_bit(idx, val); if (!no_mask) __set_bit(idx, mask); } return 0; } #if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) /* 64-bit big endian architectures are the only case when u32 based bitmaps * and unsigned long based bitmaps have different memory layout so that we * cannot simply cast the latter to the former and need actual wrappers * converting the latter to the former. * * To reduce the number of slab allocations, the wrappers use fixed size local * variables for bitmaps up to ETHNL_SMALL_BITMAP_BITS bits which is the * majority of bitmaps used by ethtool. */ #define ETHNL_SMALL_BITMAP_BITS 128 #define ETHNL_SMALL_BITMAP_WORDS DIV_ROUND_UP(ETHNL_SMALL_BITMAP_BITS, 32) int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; u32 *mask32; u32 *val32; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); if (!val32) return -ENOMEM; mask32 = val32 + nwords; } else { val32 = small_val32; mask32 = small_mask32; } bitmap_to_arr32(val32, val, nbits); if (mask) bitmap_to_arr32(mask32, mask, nbits); else mask32 = NULL; ret = ethnl_bitset32_size(val32, mask32, nbits, names, compact); if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(val32); return ret; } int ethnl_put_bitset(struct sk_buff *skb, int attrtype, const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS]; u32 small_val32[ETHNL_SMALL_BITMAP_WORDS]; u32 *mask32; u32 *val32; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int nwords = DIV_ROUND_UP(nbits, 32); val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL); if (!val32) return -ENOMEM; mask32 = val32 + nwords; } else { val32 = small_val32; mask32 = small_mask32; } bitmap_to_arr32(val32, val, nbits); if (mask) bitmap_to_arr32(mask32, mask, nbits); else mask32 = NULL; ret = ethnl_put_bitset32(skb, attrtype, val32, mask32, nbits, names, compact); if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(val32); return ret; } int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { u32 small_bitmap32[ETHNL_SMALL_BITMAP_WORDS]; u32 *bitmap32 = small_bitmap32; bool u32_mod = false; int ret; if (nbits > ETHNL_SMALL_BITMAP_BITS) { unsigned int dst_words = DIV_ROUND_UP(nbits, 32); bitmap32 = kmalloc_array(dst_words, sizeof(u32), GFP_KERNEL); if (!bitmap32) return -ENOMEM; } bitmap_to_arr32(bitmap32, bitmap, nbits); ret = ethnl_update_bitset32(bitmap32, nbits, attr, names, extack, &u32_mod); if (u32_mod) { bitmap_from_arr32(bitmap, bitmap32, nbits); *mod = true; } if (nbits > ETHNL_SMALL_BITMAP_BITS) kfree(bitmap32); return ret; } #else /* On little endian 64-bit and all 32-bit architectures, an unsigned long * based bitmap can be interpreted as u32 based one using a simple cast. */ int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { return ethnl_bitset32_size((const u32 *)val, (const u32 *)mask, nbits, names, compact); } int ethnl_put_bitset(struct sk_buff *skb, int attrtype, const unsigned long *val, const unsigned long *mask, unsigned int nbits, ethnl_string_array_t names, bool compact) { return ethnl_put_bitset32(skb, attrtype, (const u32 *)val, (const u32 *)mask, nbits, names, compact); } int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits, const struct nlattr *attr, ethnl_string_array_t names, struct netlink_ext_ack *extack, bool *mod) { return ethnl_update_bitset32((u32 *)bitmap, nbits, attr, names, extack, mod); } #endif /* BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) */
889 1477 707 1460 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 /* SPDX-License-Identifier: GPL-2.0 */ /* * This file provides wrappers with sanitizer instrumentation for atomic bit * operations. * * To use this functionality, an arch's bitops.h file needs to define each of * the below bit operations with an arch_ prefix (e.g. arch_set_bit(), * arch___set_bit(), etc.). */ #ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H #define _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H #include <linux/instrumented.h> /** * set_bit - Atomically set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * This is a relaxed atomic operation (no implied memory barriers). * * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ static __always_inline void set_bit(long nr, volatile unsigned long *addr) { instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_set_bit(nr, addr); } /** * clear_bit - Clears a bit in memory * @nr: Bit to clear * @addr: Address to start counting from * * This is a relaxed atomic operation (no implied memory barriers). */ static __always_inline void clear_bit(long nr, volatile unsigned long *addr) { instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_clear_bit(nr, addr); } /** * change_bit - Toggle a bit in memory * @nr: Bit to change * @addr: Address to start counting from * * This is a relaxed atomic operation (no implied memory barriers). * * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ static __always_inline void change_bit(long nr, volatile unsigned long *addr) { instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); arch_change_bit(nr, addr); } /** * test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr) { kcsan_mb(); instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_set_bit(nr, addr); } /** * test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) { kcsan_mb(); instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_clear_bit(nr, addr); } /** * test_and_change_bit - Change a bit and return its old value * @nr: Bit to change * @addr: Address to count from * * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr) { kcsan_mb(); instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long)); return arch_test_and_change_bit(nr, addr); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_LOCAL64_H #define _ASM_GENERIC_LOCAL64_H #include <linux/percpu.h> #include <asm/types.h> /* * A signed long type for operations which are atomic for a single CPU. * Usually used in combination with per-cpu variables. * * This is the default implementation, which uses atomic64_t. Which is * rather pointless. The whole point behind local64_t is that some processors * can perform atomic adds and subtracts in a manner which is atomic wrt IRQs * running on this CPU. local64_t allows exploitation of such capabilities. */ /* Implement in terms of atomics. */ #if BITS_PER_LONG == 64 #include <asm/local.h> typedef struct { local_t a; } local64_t; #define LOCAL64_INIT(i) { LOCAL_INIT(i) } #define local64_read(l) local_read(&(l)->a) #define local64_set(l,i) local_set((&(l)->a),(i)) #define local64_inc(l) local_inc(&(l)->a) #define local64_dec(l) local_dec(&(l)->a) #define local64_add(i,l) local_add((i),(&(l)->a)) #define local64_sub(i,l) local_sub((i),(&(l)->a)) #define local64_sub_and_test(i, l) local_sub_and_test((i), (&(l)->a)) #define local64_dec_and_test(l) local_dec_and_test(&(l)->a) #define local64_inc_and_test(l) local_inc_and_test(&(l)->a) #define local64_add_negative(i, l) local_add_negative((i), (&(l)->a)) #define local64_add_return(i, l) local_add_return((i), (&(l)->a)) #define local64_sub_return(i, l) local_sub_return((i), (&(l)->a)) #define local64_inc_return(l) local_inc_return(&(l)->a) static inline s64 local64_cmpxchg(local64_t *l, s64 old, s64 new) { return local_cmpxchg(&l->a, old, new); } static inline bool local64_try_cmpxchg(local64_t *l, s64 *old, s64 new) { return local_try_cmpxchg(&l->a, (long *)old, new); } #define local64_xchg(l, n) local_xchg((&(l)->a), (n)) #define local64_add_unless(l, _a, u) local_add_unless((&(l)->a), (_a), (u)) #define local64_inc_not_zero(l) local_inc_not_zero(&(l)->a) /* Non-atomic variants, ie. preemption disabled and won't be touched * in interrupt, etc. Some archs can optimize this case well. */ #define __local64_inc(l) local64_set((l), local64_read(l) + 1) #define __local64_dec(l) local64_set((l), local64_read(l) - 1) #define __local64_add(i,l) local64_set((l), local64_read(l) + (i)) #define __local64_sub(i,l) local64_set((l), local64_read(l) - (i)) #else /* BITS_PER_LONG != 64 */ #include <linux/atomic.h> /* Don't use typedef: don't want them to be mixed with atomic_t's. */ typedef struct { atomic64_t a; } local64_t; #define LOCAL64_INIT(i) { ATOMIC_LONG_INIT(i) } #define local64_read(l) atomic64_read(&(l)->a) #define local64_set(l,i) atomic64_set((&(l)->a),(i)) #define local64_inc(l) atomic64_inc(&(l)->a) #define local64_dec(l) atomic64_dec(&(l)->a) #define local64_add(i,l) atomic64_add((i),(&(l)->a)) #define local64_sub(i,l) atomic64_sub((i),(&(l)->a)) #define local64_sub_and_test(i, l) atomic64_sub_and_test((i), (&(l)->a)) #define local64_dec_and_test(l) atomic64_dec_and_test(&(l)->a) #define local64_inc_and_test(l) atomic64_inc_and_test(&(l)->a) #define local64_add_negative(i, l) atomic64_add_negative((i), (&(l)->a)) #define local64_add_return(i, l) atomic64_add_return((i), (&(l)->a)) #define local64_sub_return(i, l) atomic64_sub_return((i), (&(l)->a)) #define local64_inc_return(l) atomic64_inc_return(&(l)->a) #define local64_cmpxchg(l, o, n) atomic64_cmpxchg((&(l)->a), (o), (n)) #define local64_try_cmpxchg(l, po, n) atomic64_try_cmpxchg((&(l)->a), (po), (n)) #define local64_xchg(l, n) atomic64_xchg((&(l)->a), (n)) #define local64_add_unless(l, _a, u) atomic64_add_unless((&(l)->a), (_a), (u)) #define local64_inc_not_zero(l) atomic64_inc_not_zero(&(l)->a) /* Non-atomic variants, ie. preemption disabled and won't be touched * in interrupt, etc. Some archs can optimize this case well. */ #define __local64_inc(l) local64_set((l), local64_read(l) + 1) #define __local64_dec(l) local64_set((l), local64_read(l) - 1) #define __local64_add(i,l) local64_set((l), local64_read(l) + (i)) #define __local64_sub(i,l) local64_set((l), local64_read(l) - (i)) #endif /* BITS_PER_LONG != 64 */ #endif /* _ASM_GENERIC_LOCAL64_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 /* SPDX-License-Identifier: GPL-2.0 */ /* * Events for filesystem locks * * Copyright 2013 Jeff Layton <jlayton@poochiereds.net> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM filelock #if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FILELOCK_H #include <linux/tracepoint.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/kdev_t.h> #define show_fl_flags(val) \ __print_flags(val, "|", \ { FL_POSIX, "FL_POSIX" }, \ { FL_FLOCK, "FL_FLOCK" }, \ { FL_DELEG, "FL_DELEG" }, \ { FL_ACCESS, "FL_ACCESS" }, \ { FL_EXISTS, "FL_EXISTS" }, \ { FL_LEASE, "FL_LEASE" }, \ { FL_CLOSE, "FL_CLOSE" }, \ { FL_SLEEP, "FL_SLEEP" }, \ { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ { FL_OFDLCK, "FL_OFDLCK" }) #define show_fl_type(val) \ __print_symbolic(val, \ { F_RDLCK, "F_RDLCK" }, \ { F_WRLCK, "F_WRLCK" }, \ { F_UNLCK, "F_UNLCK" }) TRACE_EVENT(locks_get_lock_context, TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx), TP_ARGS(inode, type, ctx), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(unsigned char, type) __field(struct file_lock_context *, ctx) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->type = type; __entry->ctx = ctx; ), TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, show_fl_type(__entry->type), __entry->ctx) ); DECLARE_EVENT_CLASS(filelock_lock, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret), TP_STRUCT__entry( __field(struct file_lock *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock_core *, blocker) __field(fl_owner_t, owner) __field(unsigned int, pid) __field(unsigned int, flags) __field(unsigned char, type) __field(loff_t, fl_start) __field(loff_t, fl_end) __field(int, ret) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->blocker = fl ? fl->c.flc_blocker : NULL; __entry->owner = fl ? fl->c.flc_owner : NULL; __entry->pid = fl ? fl->c.flc_pid : 0; __entry->flags = fl ? fl->c.flc_flags : 0; __entry->type = fl ? fl->c.flc_type : 0; __entry->fl_start = fl ? fl->fl_start : 0; __entry->fl_end = fl ? fl->fl_end : 0; __entry->ret = ret; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, __entry->pid, show_fl_flags(__entry->flags), show_fl_type(__entry->type), __entry->fl_start, __entry->fl_end, __entry->ret) ); DEFINE_EVENT(filelock_lock, posix_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, fcntl_setlk, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, locks_remove_posix, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, flock_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DECLARE_EVENT_CLASS(filelock_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(struct file_lease *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock_core *, blocker) __field(fl_owner_t, owner) __field(unsigned int, flags) __field(unsigned char, type) __field(unsigned long, break_time) __field(unsigned long, downgrade_time) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->blocker = fl ? fl->c.flc_blocker : NULL; __entry->owner = fl ? fl->c.flc_owner : NULL; __entry->flags = fl ? fl->c.flc_flags : 0; __entry->type = fl ? fl->c.flc_type : 0; __entry->break_time = fl ? fl->fl_break_time : 0; __entry->downgrade_time = fl ? fl->fl_downgrade_time : 0; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->blocker, __entry->owner, show_fl_flags(__entry->flags), show_fl_type(__entry->type), __entry->break_time, __entry->downgrade_time) ); DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl)); TRACE_EVENT(generic_add_lease, TP_PROTO(struct inode *inode, struct file_lease *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(int, wcount) __field(int, rcount) __field(int, icount) __field(dev_t, s_dev) __field(fl_owner_t, owner) __field(unsigned int, flags) __field(unsigned char, type) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); __entry->icount = atomic_read(&inode->i_count); __entry->owner = fl->c.flc_owner; __entry->flags = fl->c.flc_flags; __entry->type = fl->c.flc_type; ), TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->wcount, __entry->rcount, __entry->icount, __entry->owner, show_fl_flags(__entry->flags), show_fl_type(__entry->type)) ); TRACE_EVENT(leases_conflict, TP_PROTO(bool conflict, struct file_lease *lease, struct file_lease *breaker), TP_ARGS(conflict, lease, breaker), TP_STRUCT__entry( __field(void *, lease) __field(void *, breaker) __field(unsigned int, l_fl_flags) __field(unsigned int, b_fl_flags) __field(unsigned char, l_fl_type) __field(unsigned char, b_fl_type) __field(bool, conflict) ), TP_fast_assign( __entry->lease = lease; __entry->l_fl_flags = lease->c.flc_flags; __entry->l_fl_type = lease->c.flc_type; __entry->breaker = breaker; __entry->b_fl_flags = breaker->c.flc_flags; __entry->b_fl_type = breaker->c.flc_type; __entry->conflict = conflict; ), TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s", __entry->conflict, __entry->lease, show_fl_flags(__entry->l_fl_flags), show_fl_type(__entry->l_fl_type), __entry->breaker, show_fl_flags(__entry->b_fl_flags), show_fl_type(__entry->b_fl_type)) ); #endif /* _TRACE_FILELOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
26 26 26 26 26 26 26 26 4 26 26 26 26 26 26 26 26 26 26 25 7 26 26 26 7 26 26 26 26 26 26 26 4 4 4 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 25 26 26 26 26 26 26 26 26 4 26 26 26 4 4 26 26 26 26 26 4 26 26 26 25 26 26 26 26 26 26 26 26 25 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 5 26 26 4 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/printk.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Modified to make sys_syslog() more flexible: added commands to * return the last 4k of kernel messages, regardless of whether * they've been read or not. Added option to suppress kernel printk's * to the console. Added hook for sending the console messages * elsewhere, in preparation for a serial line console (someday). * Ted Ts'o, 2/11/93. * Modified for sysctl support, 1/8/97, Chris Horn. * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfred@colorfullife.com * Rewrote bits to get rid of console_lock * 01Mar01 Andrew Morton */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/mm.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/console.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/nmi.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/smp.h> #include <linux/security.h> #include <linux/memblock.h> #include <linux/syscalls.h> #include <linux/syscore_ops.h> #include <linux/vmcore_info.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> #include <linux/cpu.h> #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/ctype.h> #include <linux/uio.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/uaccess.h> #include <asm/sections.h> #include <trace/events/initcall.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> #include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" #include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; EXPORT_SYMBOL_GPL(console_printk); atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); EXPORT_SYMBOL(ignore_console_lock_warning); EXPORT_TRACEPOINT_SYMBOL_GPL(console); /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. */ int oops_in_progress; EXPORT_SYMBOL(oops_in_progress); /* * console_mutex protects console_list updates and console->flags updates. * The flags are synchronized only for consoles that are registered, i.e. * accessible via the console list. */ static DEFINE_MUTEX(console_mutex); /* * console_sem protects updates to console->seq * and also provides serialization for console printing. */ static DEFINE_SEMAPHORE(console_sem, 1); HLIST_HEAD(console_list); EXPORT_SYMBOL_GPL(console_list); DEFINE_STATIC_SRCU(console_srcu); /* * System may need to suppress printk message under certain * circumstances, like after kernel panic happens. */ int __read_mostly suppress_printk; #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; void lockdep_assert_console_list_lock_held(void) { lockdep_assert_held(&console_mutex); } EXPORT_SYMBOL(lockdep_assert_console_list_lock_held); #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC bool console_srcu_read_lock_is_held(void) { return srcu_read_lock_held(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock_is_held); #endif enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, __DEVKMSG_LOG_BIT_LOCK, }; enum devkmsg_log_masks { DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON), DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF), DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK), }; /* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */ #define DEVKMSG_LOG_MASK_DEFAULT 0 static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; static int __control_devkmsg(char *str) { size_t len; if (!str) return -EINVAL; len = str_has_prefix(str, "on"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_ON; return len; } len = str_has_prefix(str, "off"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_OFF; return len; } len = str_has_prefix(str, "ratelimit"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; return len; } return -EINVAL; } static int __init control_devkmsg(char *str) { if (__control_devkmsg(str) < 0) { pr_warn("printk.devkmsg: bad option string '%s'\n", str); return 1; } /* * Set sysctl string accordingly: */ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) strscpy(devkmsg_log_str, "on"); else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) strscpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* * Sysctl cannot change it anymore. The kernel command line setting of * this parameter is to force the setting to be permanent throughout the * runtime of the system. This is a precation measure against userspace * trying to be a smarta** and attempting to change it up on us. */ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; return 1; } __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; int err; if (write) { if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK) return -EINVAL; old = devkmsg_log; strscpy(old_str, devkmsg_log_str); } err = proc_dostring(table, write, buffer, lenp, ppos); if (err) return err; if (write) { err = __control_devkmsg(devkmsg_log_str); /* * Do not accept an unknown string OR a known string with * trailing crap... */ if (err < 0 || (err + 1 != *lenp)) { /* ... and restore old setting. */ devkmsg_log = old; strscpy(devkmsg_log_str, old_str); return -EINVAL; } } return 0; } #endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ /** * console_list_lock - Lock the console list * * For console list or console->flags updates */ void console_list_lock(void) { /* * In unregister_console() and console_force_preferred_locked(), * synchronize_srcu() is called with the console_list_lock held. * Therefore it is not allowed that the console_list_lock is taken * with the srcu_lock held. * * Detecting if this context is really in the read-side critical * section is only possible if the appropriate debug options are * enabled. */ WARN_ON_ONCE(debug_lockdep_rcu_enabled() && srcu_read_lock_held(&console_srcu)); mutex_lock(&console_mutex); } EXPORT_SYMBOL(console_list_lock); /** * console_list_unlock - Unlock the console list * * Counterpart to console_list_lock() */ void console_list_unlock(void) { mutex_unlock(&console_mutex); } EXPORT_SYMBOL(console_list_unlock); /** * console_srcu_read_lock - Register a new reader for the * SRCU-protected console list * * Use for_each_console_srcu() to iterate the console list * * Context: Any context. * Return: A cookie to pass to console_srcu_read_unlock(). */ int console_srcu_read_lock(void) __acquires(&console_srcu) { return srcu_read_lock_nmisafe(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock); /** * console_srcu_read_unlock - Unregister an old reader from * the SRCU-protected console list * @cookie: cookie returned from console_srcu_read_lock() * * Counterpart to console_srcu_read_lock() */ void console_srcu_read_unlock(int cookie) __releases(&console_srcu) { srcu_read_unlock_nmisafe(&console_srcu, cookie); } EXPORT_SYMBOL(console_srcu_read_unlock); /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. */ #define down_console_sem() do { \ down(&console_sem);\ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ } while (0) static int __down_trylock_console_sem(unsigned long ip) { int lock_failed; unsigned long flags; /* * Here and in __up_console_sem() we need to be in safe mode, * because spindump/WARN/etc from under console ->lock will * deadlock in printk()->down_trylock_console_sem() otherwise. */ printk_safe_enter_irqsave(flags); lock_failed = down_trylock(&console_sem); printk_safe_exit_irqrestore(flags); if (lock_failed) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; } #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) static void __up_console_sem(unsigned long ip) { unsigned long flags; mutex_release(&console_lock_dep_map, ip); printk_safe_enter_irqsave(flags); up(&console_sem); printk_safe_exit_irqrestore(flags); } #define up_console_sem() __up_console_sem(_RET_IP_) static bool panic_in_progress(void) { return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); } /* Return true if a panic is in progress on the current CPU. */ bool this_cpu_in_panic(void) { /* * We can use raw_smp_processor_id() here because it is impossible for * the task to be migrated to the panic_cpu, or away from it. If * panic_cpu has already been set, and we're not currently executing on * that CPU, then we never will be. */ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); } /* * Return true if a panic is in progress on a remote CPU. * * On true, the local CPU should immediately release any printing resources * that may be needed by the panic CPU. */ bool other_cpu_in_panic(void) { return (panic_in_progress() && !this_cpu_in_panic()); } /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ * hold it and are racing, but it helps tracking those weird code * paths in the console code where we end up in places I want * locked without the console semaphore held). */ static int console_locked; /* * Array of consoles built from command line options (console=) */ #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; enum con_msg_format_flags { MSG_FORMAT_DEFAULT = 0, MSG_FORMAT_SYSLOG = (1 << 0), }; static int console_msg_format = MSG_FORMAT_DEFAULT; /* * The printk log buffer consists of a sequenced collection of records, each * containing variable length message text. Every record also contains its * own meta-data (@info). * * Every record meta-data carries the timestamp in microseconds, as well as * the standard userspace syslog level and syslog facility. The usual kernel * messages use LOG_KERN; userspace-injected messages always carry a matching * syslog facility, by default LOG_USER. The origin of every message can be * reliably determined that way. * * The human readable log message of a record is available in @text, the * length of the message text in @text_len. The stored message is not * terminated. * * Optionally, a record can carry a dictionary of properties (key/value * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier * b12:8 block dev_t * c127:3 char dev_t * n8 netdev ifindex * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * * Valid characters in property names are [a-zA-Z0-9.-_]. Property names * and values are terminated by a '\0' character. * * Example of record values: * record.text_buf = "it's a line" (unterminated) * record.info.seq = 56 * record.info.ts_nsec = 36863 * record.info.text_len = 11 * record.info.facility = 0 (LOG_KERN) * record.info.flags = 0 * record.info.level = 3 (LOG_ERR) * record.info.caller_id = 299 (task 299) * record.info.dev_info.subsystem = "pci" (terminated) * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) * * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * * /dev/kmsg exports the structured data in the following line format: * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n" * * Users of the export format should ignore possible additional values * separated by ',', and find the message after the ';' character. * * The optional key/value pairs are attached as continuation lines starting * with a space character and terminated by a newline. All possible * non-prinatable characters are escaped in the "\xff" notation. */ /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); /* * Specifies if a legacy console is registered. If legacy consoles are * present, it is necessary to perform the console lock/unlock dance * whenever console flushing should occur. */ bool have_legacy_console; /* * Specifies if an nbcon console is registered. If nbcon consoles are present, * synchronous printing of legacy consoles will not occur during panic until * the backtrace has been stored to the ringbuffer. */ bool have_nbcon_console; /* * Specifies if a boot console is registered. If boot consoles are present, * nbcon consoles cannot print simultaneously and must be synchronized by * the console lock. This is because boot consoles and nbcon consoles may * have mapped the same hardware. */ bool have_boot_console; /* See printk_legacy_allow_panic_sync() for details. */ bool legacy_allow_panic_sync; #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; /* True when _all_ printer threads are available for printing. */ bool printk_kthreads_running; struct latched_seq { seqcount_latch_t latch; u64 val[2]; }; /* * The next printk record to read after the last 'clear' command. There are * two copies (updated with seqcount_latch) so that reads can locklessly * access a valid value. Writers are synchronized by @syslog_lock. */ static struct latched_seq clear_seq = { .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), .val[0] = 0, .val[1] = 0, }; #define LOG_LEVEL(v) ((v) & 0x07) #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ #define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX ((u32)1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; /* * Define the average message size. This only affects the number of * descriptors that will be available. Underestimating is better than * overestimating (too many available descriptors is better than not enough). */ #define PRB_AVGBITS 5 /* 32 character average length */ #if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS #error CONFIG_LOG_BUF_SHIFT value too small. #endif _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]); static struct printk_ringbuffer printk_rb_dynamic; struct printk_ringbuffer *prb = &printk_rb_static; /* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when * it's safe to access per-CPU data. */ static bool __printk_percpu_data_ready __ro_after_init; bool printk_percpu_data_ready(void) { return __printk_percpu_data_ready; } /* Must be called under syslog_lock. */ static void latched_seq_write(struct latched_seq *ls, u64 val) { write_seqcount_latch_begin(&ls->latch); ls->val[0] = val; write_seqcount_latch(&ls->latch); ls->val[1] = val; write_seqcount_latch_end(&ls->latch); } /* Can be called from any context. */ static u64 latched_seq_read_nolock(struct latched_seq *ls) { unsigned int seq; unsigned int idx; u64 val; do { seq = read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; } while (read_seqcount_latch_retry(&ls->latch, seq)); return val; } /* Return log buffer address */ char *log_buf_addr_get(void) { return log_buf; } /* Return log buffer size */ u32 log_buf_len_get(void) { return log_buf_len; } /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. Note that only half of the buffer is available * when the index points to the middle. */ #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = "<truncated>"; static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; if (*text_len > max_text_len) *text_len = max_text_len; /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); if (*text_len >= *trunc_msg_len) *text_len -= *trunc_msg_len; else *trunc_msg_len = 0; } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { if (dmesg_restrict) return 1; /* * Unless restricted, we allow "read all" and "get buffer size" * for everybody. */ return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; } static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've * already done the capabilities checks at open time. */ if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN) goto ok; if (syslog_action_restricted(type)) { if (capable(CAP_SYSLOG)) goto ok; return -EPERM; } ok: return security_syslog(type); } static void append_char(char **pp, char *e, char c) { if (*pp < e) *(*pp)++ = c; } static ssize_t info_print_ext_header(char *buf, size_t size, struct printk_info *info) { u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); #else caller[0] = '\0'; #endif do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", (info->facility << 3) | info->level, info->seq, ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_add_ext_text(char *buf, size_t size, const char *text, size_t text_len, unsigned char endc) { char *p = buf, *e = buf + size; size_t i; /* escape non-printable characters */ for (i = 0; i < text_len; i++) { unsigned char c = text[i]; if (c < ' ' || c >= 127 || c == '\\') p += scnprintf(p, e - p, "\\x%02x", c); else append_char(&p, e, c); } append_char(&p, e, endc); return p - buf; } static ssize_t msg_add_dict_text(char *buf, size_t size, const char *key, const char *val) { size_t val_len = strlen(val); ssize_t len; if (!val_len) return 0; len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); return len; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *text, size_t text_len, struct dev_printk_info *dev_info) { ssize_t len; len = msg_add_ext_text(buf, size, text, text_len, '\n'); if (!dev_info) goto out; len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", dev_info->subsystem); len += msg_add_dict_text(buf + len, size - len, "DEVICE", dev_info->device); out: return len; } /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { atomic64_t seq; struct ratelimit_state rs; struct mutex lock; struct printk_buffers pbufs; }; static __printf(3, 4) __cold int devkmsg_emit(int facility, int level, const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; } static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ struct file *file = iocb->ki_filp; struct devkmsg_user *user = file->private_data; size_t len = iov_iter_count(from); ssize_t ret = len; if (len > PRINTKRB_RECORD_MAX) return -EINVAL; /* Ignore when user logging is disabled. */ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return len; /* Ratelimit when not explicitly enabled. */ if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) { if (!___ratelimit(&user->rs, current->comm)) return ret; } buf = kmalloc(len+1, GFP_KERNEL); if (buf == NULL) return -ENOMEM; buf[len] = '\0'; if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } /* * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace * the decimal value represents 32bit, the lower 3 bit are the log * level, the rest are the log facility. * * If no prefix or no userspace facility is specified, we * enforce LOG_USER, to be able to reliably distinguish * kernel-generated messages from userspace-injected ones. */ line = buf; if (line[0] == '<') { char *endp = NULL; unsigned int u; u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { level = LOG_LEVEL(u); if (LOG_FACILITY(u) != 0) facility = LOG_FACILITY(u); endp++; line = endp; } } devkmsg_emit(facility, level, "%s", line); kfree(buf); return ret; } static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; char *outbuf = &user->pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &user->pbufs, }; ssize_t ret; ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; goto out; } /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ ret = wait_event_interruptible(log_wait, printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } if (pmsg.dropped) { /* our last seen message is gone, return error and reset */ atomic64_set(&user->seq, pmsg.seq); ret = -EPIPE; goto out; } atomic64_set(&user->seq, pmsg.seq + 1); if (pmsg.outbuf_len > count) { ret = -EINVAL; goto out; } if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) { ret = -EFAULT; goto out; } ret = pmsg.outbuf_len; out: mutex_unlock(&user->lock); return ret; } /* * Be careful when modifying this function!!! * * Only few operations are supported because the device works only with the * entire variable length messages (records). Non-standard values are * returned in the other cases and has been this way for quite some time. * User space applications might depend on this behavior. */ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) { struct devkmsg_user *user = file->private_data; loff_t ret = 0; if (offset) return -ESPIPE; switch (whence) { case SEEK_SET: /* the first record */ atomic64_set(&user->seq, prb_first_valid_seq(prb)); break; case SEEK_DATA: /* * The first record after the last SYSLOG_ACTION_CLEAR, * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. */ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); break; case SEEK_END: /* after the last record */ atomic64_set(&user->seq, prb_next_seq(prb)); break; default: ret = -EINVAL; } return ret; } static __poll_t devkmsg_poll(struct file *file, poll_table *wait) { struct devkmsg_user *user = file->private_data; struct printk_info info; __poll_t ret = 0; poll_wait(file, &log_wait, wait); if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ if (info.seq != atomic64_read(&user->seq)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; } return ret; } static int devkmsg_open(struct inode *inode, struct file *file) { struct devkmsg_user *user; int err; if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return -EPERM; /* write-only does not need any file context */ if ((file->f_flags & O_ACCMODE) != O_WRONLY) { err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, SYSLOG_FROM_READER); if (err) return err; } user = kvmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); if (!user) return -ENOMEM; ratelimit_default_init(&user->rs); ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE); mutex_init(&user->lock); atomic64_set(&user->seq, prb_first_valid_seq(prb)); file->private_data = user; return 0; } static int devkmsg_release(struct inode *inode, struct file *file) { struct devkmsg_user *user = file->private_data; ratelimit_state_exit(&user->rs); mutex_destroy(&user->lock); kvfree(user); return 0; } const struct file_operations kmsg_fops = { .open = devkmsg_open, .read = devkmsg_read, .write_iter = devkmsg_write, .llseek = devkmsg_llseek, .poll = devkmsg_poll, .release = devkmsg_release, }; #ifdef CONFIG_VMCORE_INFO /* * This appends the listed symbols to /proc/vmcore * * /proc/vmcore is used by various utilities, like crash and makedumpfile to * obtain access to symbols that are otherwise very difficult to locate. These * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ void log_buf_vmcoreinfo_setup(void) { struct dev_printk_info *dev_info = NULL; VMCOREINFO_SYMBOL(prb); VMCOREINFO_SYMBOL(printk_rb_static); VMCOREINFO_SYMBOL(clear_seq); /* * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. */ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); VMCOREINFO_OFFSET(printk_ringbuffer, fail); VMCOREINFO_STRUCT_SIZE(prb_desc_ring); VMCOREINFO_OFFSET(prb_desc_ring, count_bits); VMCOREINFO_OFFSET(prb_desc_ring, descs); VMCOREINFO_OFFSET(prb_desc_ring, infos); VMCOREINFO_OFFSET(prb_desc_ring, head_id); VMCOREINFO_OFFSET(prb_desc_ring, tail_id); VMCOREINFO_STRUCT_SIZE(prb_desc); VMCOREINFO_OFFSET(prb_desc, state_var); VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); VMCOREINFO_OFFSET(prb_data_blk_lpos, next); VMCOREINFO_STRUCT_SIZE(printk_info); VMCOREINFO_OFFSET(printk_info, seq); VMCOREINFO_OFFSET(printk_info, ts_nsec); VMCOREINFO_OFFSET(printk_info, text_len); VMCOREINFO_OFFSET(printk_info, caller_id); VMCOREINFO_OFFSET(printk_info, dev_info); VMCOREINFO_STRUCT_SIZE(dev_printk_info); VMCOREINFO_OFFSET(dev_printk_info, subsystem); VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); VMCOREINFO_OFFSET(dev_printk_info, device); VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); VMCOREINFO_STRUCT_SIZE(prb_data_ring); VMCOREINFO_OFFSET(prb_data_ring, size_bits); VMCOREINFO_OFFSET(prb_data_ring, data); VMCOREINFO_OFFSET(prb_data_ring, head_lpos); VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); VMCOREINFO_SIZE(atomic_long_t); VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); VMCOREINFO_STRUCT_SIZE(latched_seq); VMCOREINFO_OFFSET(latched_seq, val); } #endif /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; /* we practice scaling the ring buffer by powers of 2 */ static void __init log_buf_len_update(u64 size) { if (size > (u64)LOG_BUF_LEN_MAX) { size = (u64)LOG_BUF_LEN_MAX; pr_err("log_buf over 2G is not supported.\n"); } if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = (unsigned long)size; } /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { u64 size; if (!str) return -EINVAL; size = memparse(str, &str); log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); #ifdef CONFIG_SMP #define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static void __init log_buf_add_cpu(void) { unsigned int cpu_extra; /* * archs should set up cpu_possible_bits properly with * set_cpu_possible() after setup_arch() but just in * case lets ensure this is valid. */ if (num_possible_cpus() == 1) return; cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; /* by default this will only continue through for large > 64 CPUs */ if (cpu_extra <= __LOG_BUF_LEN / 2) return; pr_info("log_buf_len individual max cpu contribution: %d bytes\n", __LOG_CPU_MAX_BUF_LEN); pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", cpu_extra); pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); log_buf_len_update(cpu_extra + __LOG_BUF_LEN); } #else /* !CONFIG_SMP */ static inline void log_buf_add_cpu(void) {} #endif /* CONFIG_SMP */ static void __init set_percpu_data_ready(void) { __printk_percpu_data_ready = true; } static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_reserved_entry e; struct printk_record dest_r; prb_rec_init_wr(&dest_r, r->info->text_len); if (!prb_reserve(&e, rb, &dest_r)) return 0; memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); dest_r.info->text_len = r->info->text_len; dest_r.info->facility = r->info->facility; dest_r.info->level = r->info->level; dest_r.info->flags = r->info->flags; dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); prb_final_commit(&e); return prb_record_text_space(&e); } static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata; static void print_log_buf_usage_stats(void) { unsigned int descs_count = log_buf_len >> PRB_AVGBITS; size_t meta_data_size; meta_data_size = descs_count * (sizeof(struct prb_desc) + sizeof(struct printk_info)); pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n", log_buf_len, meta_data_size, log_buf_len + meta_data_size); } void __init setup_log_buf(int early) { struct printk_info *new_infos; unsigned int new_descs_count; struct prb_desc *new_descs; struct printk_info info; struct printk_record r; unsigned int text_size; size_t new_descs_size; size_t new_infos_size; unsigned long flags; char *new_log_buf; unsigned int free; u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very * early, e.g. from setup_arch(), and second - when percpu_areas * are initialised. */ if (!early) set_percpu_data_ready(); if (log_buf != __log_buf) return; if (!early && !new_log_buf_len) log_buf_add_cpu(); if (!new_log_buf_len) { /* Show the memory stats only once. */ if (!early) goto out; return; } new_descs_count = new_log_buf_len >> PRB_AVGBITS; if (new_descs_count == 0) { pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); goto out; } new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { pr_err("log_buf_len: %lu text bytes not available\n", new_log_buf_len); goto out; } new_descs_size = new_descs_count * sizeof(struct prb_desc); new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); if (unlikely(!new_descs)) { pr_err("log_buf_len: %zu desc bytes not available\n", new_descs_size); goto err_free_log_buf; } new_infos_size = new_descs_count * sizeof(struct printk_info); new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); if (unlikely(!new_infos)) { pr_err("log_buf_len: %zu info bytes not available\n", new_infos_size); goto err_free_descs; } prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), new_descs, ilog2(new_descs_count), new_infos); local_irq_save(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN; prb_for_each_record(0, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } prb = &printk_rb_dynamic; local_irq_restore(flags); /* * Copy any remaining messages that might have appeared from * NMI context after copying but before switching to the * dynamic buffer. */ prb_for_each_record(seq, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } if (seq != prb_next_seq(&printk_rb_static)) { pr_err("dropped %llu messages\n", prb_next_seq(&printk_rb_static) - seq); } print_log_buf_usage_stats(); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); return; err_free_descs: memblock_free(new_descs, new_descs_size); err_free_log_buf: memblock_free(new_log_buf, new_log_buf_len); out: print_log_buf_usage_stats(); } static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; } early_param("ignore_loglevel", ignore_loglevel_setup); module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting (prints all kernel messages to the console)"); static bool suppress_message_printing(int level) { return (level >= console_loglevel && !ignore_loglevel); } #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; get_option(&str, &boot_delay); if (boot_delay > 10 * 1000) boot_delay = 0; pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " "HZ: %d, loops_per_msec: %llu\n", boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 0; } early_param("boot_delay", boot_delay_setup); static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; bool suppress = !is_printk_force_console() && suppress_message_printing(level); if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress) return; k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { k--; cpu_relax(); /* * use (volatile) jiffies to prevent * compiler reduction; loop termination via jiffies * is secondary and may or may not happen. */ if (time_after(jiffies, timeout)) break; touch_nmi_watchdog(); } } #else static inline void boot_delay_msec(int level) { } #endif static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_syslog(unsigned int level, char *buf) { return sprintf(buf, "<%u>", level); } static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, rem_nsec / 1000); } #ifdef CONFIG_PRINTK_CALLER static size_t print_caller(u32 id, char *buf) { char caller[12]; snprintf(caller, sizeof(caller), "%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); return sprintf(buf, "[%6s]", caller); } #else #define print_caller(id, buf) 0 #endif static size_t info_print_prefix(const struct printk_info *info, bool syslog, bool time, char *buf) { size_t len = 0; if (syslog) len = print_syslog((info->facility << 3) | info->level, buf); if (time) len += print_time(info->ts_nsec, buf + len); len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; buf[len] = '\0'; } return len; } /* * Prepare the record for printing. The text is shifted within the given * buffer to avoid a need for another one. The following operations are * done: * * - Add prefix for each line. * - Drop truncated lines that no longer fit into the buffer. * - Add the trailing newline that has been removed in vprintk_store(). * - Add a string terminator. * * Since the produced string is always terminated, the maximum possible * return value is @r->text_buf_size - 1; * * Return: The length of the updated/prepared text, including the added * prefixes and the newline. The terminator is not counted. The dropped * line(s) are not counted. */ static size_t record_print_text(struct printk_record *r, bool syslog, bool time) { size_t text_len = r->info->text_len; size_t buf_size = r->text_buf_size; char *text = r->text_buf; char prefix[PRINTK_PREFIX_MAX]; bool truncated = false; size_t prefix_len; size_t line_len; size_t len = 0; char *next; /* * If the message was truncated because the buffer was not large * enough, treat the available text as if it were the full text. */ if (text_len > buf_size) text_len = buf_size; prefix_len = info_print_prefix(r->info, syslog, time, prefix); /* * @text_len: bytes of unprocessed text * @line_len: bytes of current line _without_ newline * @text: pointer to beginning of current line * @len: number of bytes prepared in r->text_buf */ for (;;) { next = memchr(text, '\n', text_len); if (next) { line_len = next - text; } else { /* Drop truncated line(s). */ if (truncated) break; line_len = text_len; } /* * Truncate the text if there is not enough space to add the * prefix and a trailing newline and a terminator. */ if (len + prefix_len + text_len + 1 + 1 > buf_size) { /* Drop even the current line if no space. */ if (len + prefix_len + line_len + 1 + 1 > buf_size) break; text_len = buf_size - len - prefix_len - 1 - 1; truncated = true; } memmove(text + prefix_len, text, text_len); memcpy(text, prefix, prefix_len); /* * Increment the prepared length to include the text and * prefix that were just moved+copied. Also increment for the * newline at the end of this line. If this is the last line, * there is no newline, but it will be added immediately below. */ len += prefix_len + line_len + 1; if (text_len == line_len) { /* * This is the last line. Add the trailing newline * removed in vprintk_store(). */ text[prefix_len + line_len] = '\n'; break; } /* * Advance beyond the added prefix and the related line with * its newline. */ text += prefix_len + line_len + 1; /* * The remaining text has only decreased by the line with its * newline. * * Note that @text_len can become zero. It happens when @text * ended with a newline (either due to truncation or the * original string ending with "\n\n"). The loop is correctly * repeated and (if not truncated) an empty line with a prefix * will be prepared. */ text_len -= line_len + 1; } /* * If a buffer was provided, it will be terminated. Space for the * string terminator is guaranteed to be available. The terminator is * not counted in the return value. */ if (buf_size > 0) r->text_buf[len] = 0; return len; } static size_t get_record_print_text_size(struct printk_info *info, unsigned int line_count, bool syslog, bool time) { char prefix[PRINTK_PREFIX_MAX]; size_t prefix_len; prefix_len = info_print_prefix(info, syslog, time, prefix); /* * Each line will be preceded with a prefix. The intermediate * newlines are already within the text, but a final trailing * newline will be added. */ return ((prefix_len * line_count) + info->text_len + 1); } /* * Beginning with @start_seq, find the first record where it and all following * records up to (but not including) @max_seq fit into @size. * * @max_seq is simply an upper bound and does not need to exist. If the caller * does not require an upper bound, -1 can be used for @max_seq. */ static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, bool syslog, bool time) { struct printk_info info; unsigned int line_count; size_t len = 0; u64 seq; /* Determine the size of the records up to @max_seq. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (info.seq >= max_seq) break; len += get_record_print_text_size(&info, line_count, syslog, time); } /* * Adjust the upper bound for the next loop to avoid subtracting * lengths that were never added. */ if (seq < max_seq) max_seq = seq; /* * Move first record forward until length fits into the buffer. Ignore * newest messages that were not counted in the above cycle. Messages * might appear and get lost in the meantime. This is a best effort * that prevents an infinite loop that could occur with a retry. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (len <= size || info.seq >= max_seq) break; len -= get_record_print_text_size(&info, line_count, syslog, time); } return seq; } /* The caller is responsible for making sure @size is greater than 0. */ static int syslog_print(char __user *buf, int size) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); mutex_lock(&syslog_lock); /* * Wait for the @syslog_seq record to be available. @syslog_seq may * change while waiting. */ do { seq = syslog_seq; mutex_unlock(&syslog_lock); /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ mutex_lock(&syslog_lock); if (len) goto out; } while (syslog_seq != seq); /* * Copy records that fit into the buffer. The above cycle makes sure * that the first record is always available. */ do { size_t n; size_t skip; int err; if (!prb_read_valid(prb, syslog_seq, &r)) break; if (r.info->seq != syslog_seq) { /* message is gone, move to next valid one */ syslog_seq = r.info->seq; syslog_partial = 0; } /* * To keep reading/counting partial line consistent, * use printk_time value as of the beginning of a line. */ if (!syslog_partial) syslog_time = printk_time; skip = syslog_partial; n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ /* partial read(), remember position */ n = size; syslog_partial += n; } else n = 0; if (!n) break; mutex_unlock(&syslog_lock); err = copy_to_user(buf, text + skip, n); mutex_lock(&syslog_lock); if (err) { if (!len) len = -EFAULT; break; } len += n; size -= n; buf += n; } while (size); out: mutex_unlock(&syslog_lock); kfree(text); return len; } static int syslog_print_all(char __user *buf, int size, bool clear) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; bool time; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; time = printk_time; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. */ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, size, true, time); prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); prb_for_each_record(seq, prb, seq, &r) { int textlen; textlen = record_print_text(&r, true, time); if (len + textlen > size) { seq--; break; } if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; if (len < 0) break; } if (clear) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, seq); mutex_unlock(&syslog_lock); } kfree(text); return len; } static void syslog_clear(void) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, prb_next_seq(prb)); mutex_unlock(&syslog_lock); } int do_syslog(int type, char __user *buf, int len, int source) { struct printk_info info; bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; error = check_syslog_permissions(type, source); if (error) return error; switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ break; case SYSLOG_ACTION_OPEN: /* Open log */ break; case SYSLOG_ACTION_READ: /* Read from log */ if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: clear = true; fallthrough; /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_clear(); break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == LOGLEVEL_DEFAULT) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; /* Enable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != LOGLEVEL_DEFAULT) { console_loglevel = saved_console_loglevel; saved_console_loglevel = LOGLEVEL_DEFAULT; } break; /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: if (len < 1 || len > 8) return -EINVAL; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ saved_console_loglevel = LOGLEVEL_DEFAULT; break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: mutex_lock(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ mutex_unlock(&syslog_lock); return 0; } if (info.seq != syslog_seq) { /* messages are gone, move to first one */ syslog_seq = info.seq; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { /* * Short-cut for poll(/"proc/kmsg") which simply checks * for pending data, not the size; return the count of * records, not the length. */ error = prb_next_seq(prb) - syslog_seq; } else { bool time = syslog_partial ? syslog_time : printk_time; unsigned int line_count; u64 seq; prb_for_each_info(syslog_seq, prb, seq, &info, &line_count) { error += get_record_print_text_size(&info, line_count, true, time); time = printk_time; } error -= syslog_partial; } mutex_unlock(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: error = -EINVAL; break; } return error; } SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* * Special console_lock variants that help to reduce the risk of soft-lockups. * They allow to pass console_lock to another printk() call using a busy wait. */ #ifdef CONFIG_LOCKDEP static struct lockdep_map console_owner_dep_map = { .name = "console_owner" }; #endif static DEFINE_RAW_SPINLOCK(console_owner_lock); static struct task_struct *console_owner; static bool console_waiter; /** * console_lock_spinning_enable - mark beginning of code where another * thread might safely busy wait * * This basically converts console_lock into a spinlock. This marks * the section where the console_lock owner can not sleep, because * there may be a waiter spinning (like a spinlock). Also it must be * ready to hand over the lock at the end of the section. */ void console_lock_spinning_enable(void) { /* * Do not use spinning in panic(). The panic CPU wants to keep the lock. * Non-panic CPUs abandon the flush anyway. * * Just keep the lockdep annotation. The panic-CPU should avoid * taking console_owner_lock because it might cause a deadlock. * This looks like the easiest way how to prevent false lockdep * reports without handling races a lockless way. */ if (panic_in_progress()) goto lockdep; raw_spin_lock(&console_owner_lock); console_owner = current; raw_spin_unlock(&console_owner_lock); lockdep: /* The waiter may spin on us after setting console_owner */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); } /** * console_lock_spinning_disable_and_check - mark end of code where another * thread was able to busy wait and check if there is a waiter * @cookie: cookie returned from console_srcu_read_lock() * * This is called at the end of the section where spinning is allowed. * It has two functions. First, it is a signal that it is no longer * safe to start busy waiting for the lock. Second, it checks if * there is a busy waiter and passes the lock rights to her. * * Important: Callers lose both the console_lock and the SRCU read lock if * there was a busy waiter. They must not touch items synchronized by * console_lock or SRCU read lock in this case. * * Return: 1 if the lock rights were passed, 0 otherwise. */ int console_lock_spinning_disable_and_check(int cookie) { int waiter; /* * Ignore spinning waiters during panic() because they might get stopped * or blocked at any time, * * It is safe because nobody is allowed to start spinning during panic * in the first place. If there has been a waiter then non panic CPUs * might stay spinning. They would get stopped anyway. The panic context * will never start spinning and an interrupted spin on panic CPU will * never continue. */ if (panic_in_progress()) { /* Keep lockdep happy. */ spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } raw_spin_lock(&console_owner_lock); waiter = READ_ONCE(console_waiter); console_owner = NULL; raw_spin_unlock(&console_owner_lock); if (!waiter) { spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } /* The waiter is now free to continue */ WRITE_ONCE(console_waiter, false); spin_release(&console_owner_dep_map, _THIS_IP_); /* * Preserve lockdep lock ordering. Release the SRCU read lock before * releasing the console_lock. */ console_srcu_read_unlock(cookie); /* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. */ mutex_release(&console_lock_dep_map, _THIS_IP_); return 1; } /** * console_trylock_spinning - try to get console_lock by busy waiting * * This allows to busy wait for the console_lock when the current * owner is running in specially marked sections. It means that * the current owner is running and cannot reschedule until it * is ready to lose the lock. * * Return: 1 if we got the lock, 0 othrewise */ static int console_trylock_spinning(void) { struct task_struct *owner = NULL; bool waiter; bool spin = false; unsigned long flags; if (console_trylock()) return 1; /* * It's unsafe to spin once a panic has begun. If we are the * panic CPU, we may have already halted the owner of the * console_sem. If we are not the panic CPU, then we should * avoid taking console_sem, so the panic CPU has a better * chance of cleanly acquiring it later. */ if (panic_in_progress()) return 0; printk_safe_enter_irqsave(flags); raw_spin_lock(&console_owner_lock); owner = READ_ONCE(console_owner); waiter = READ_ONCE(console_waiter); if (!waiter && owner && owner != current) { WRITE_ONCE(console_waiter, true); spin = true; } raw_spin_unlock(&console_owner_lock); /* * If there is an active printk() writing to the * consoles, instead of having it write our data too, * see if we can offload that load from the active * printer, and do some printing ourselves. * Go into a spin only if there isn't already a waiter * spinning, and there is an active printer, and * that active printer isn't us (recursive printk?). */ if (!spin) { printk_safe_exit_irqrestore(flags); return 0; } /* We spin waiting for the owner to release us */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); /* Owner will clear console_waiter on hand off */ while (READ_ONCE(console_waiter)) cpu_relax(); spin_release(&console_owner_dep_map, _THIS_IP_); printk_safe_exit_irqrestore(flags); /* * The owner passed the console lock to us. * Since we did not spin on console lock, annotate * this as a trylock. Otherwise lockdep will * complain. */ mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); /* * Update @console_may_schedule for trylock because the previous * owner may have been schedulable. */ console_may_schedule = 0; return 1; } /* * Recursion is tracked separately on each CPU. If NMIs are supported, an * additional NMI context per CPU is also separately tracked. Until per-CPU * is available, a separate "early tracking" is performed. */ static DEFINE_PER_CPU(u8, printk_count); static u8 printk_count_early; #ifdef CONFIG_HAVE_NMI static DEFINE_PER_CPU(u8, printk_count_nmi); static u8 printk_count_nmi_early; #endif /* * Recursion is limited to keep the output sane. printk() should not require * more than 1 level of recursion (allowing, for example, printk() to trigger * a WARN), but a higher value is used in case some printk-internal errors * exist, such as the ringbuffer validation checks failing. */ #define PRINTK_MAX_RECURSION 3 /* * Return a pointer to the dedicated counter for the CPU+context of the * caller. */ static u8 *__printk_recursion_counter(void) { #ifdef CONFIG_HAVE_NMI if (in_nmi()) { if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count_nmi); return &printk_count_nmi_early; } #endif if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count); return &printk_count_early; } /* * Enter recursion tracking. Interrupts are disabled to simplify tracking. * The caller must check the boolean return value to see if the recursion is * allowed. On failure, interrupts are not disabled. * * @recursion_ptr must be a variable of type (u8 *) and is the same variable * that is passed to printk_exit_irqrestore(). */ #define printk_enter_irqsave(recursion_ptr, flags) \ ({ \ bool success = true; \ \ typecheck(u8 *, recursion_ptr); \ local_irq_save(flags); \ (recursion_ptr) = __printk_recursion_counter(); \ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \ local_irq_restore(flags); \ success = false; \ } else { \ (*(recursion_ptr))++; \ } \ success; \ }) /* Exit recursion tracking, restoring interrupts. */ #define printk_exit_irqrestore(recursion_ptr, flags) \ do { \ typecheck(u8 *, recursion_ptr); \ (*(recursion_ptr))--; \ local_irq_restore(flags); \ } while (0) int printk_delay_msec __read_mostly; static inline void printk_delay(int level) { boot_delay_msec(level); if (unlikely(printk_delay_msec)) { int m = printk_delay_msec; while (m--) { mdelay(1); touch_nmi_watchdog(); } } } static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : 0x80000000 + smp_processor_id(); } /** * printk_parse_prefix - Parse level and control flags. * * @text: The terminated text message. * @level: A pointer to the current level value, will be updated. * @flags: A pointer to the current printk_info flags, will be updated. * * @level may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @level must be set to * LOGLEVEL_DEFAULT in order to be updated with the parsed value. * * @flags may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @flags will be OR'd with the parsed * value. * * Return: The length of the parsed level and control flags. */ u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags) { u16 prefix_len = 0; int kern_level; while (*text) { kern_level = printk_get_level(text); if (!kern_level) break; switch (kern_level) { case '0' ... '7': if (level && *level == LOGLEVEL_DEFAULT) *level = kern_level - '0'; break; case 'c': /* KERN_CONT */ if (flags) *flags |= LOG_CONT; } prefix_len += 2; text += 2; } return prefix_len; } __printf(5, 0) static u16 printk_sprint(char *text, u16 size, int facility, enum printk_info_flags *flags, const char *fmt, va_list args) { u16 text_len; text_len = vscnprintf(text, size, fmt, args); /* Mark and strip a trailing newline. */ if (text_len && text[text_len - 1] == '\n') { text_len--; *flags |= LOG_NEWLINE; } /* Strip log level and control flags. */ if (facility == 0) { u16 prefix_len; prefix_len = printk_parse_prefix(text, NULL, NULL); if (prefix_len) { text_len -= prefix_len; memmove(text, text + prefix_len, text_len); } } trace_console(text, text_len); return text_len; } __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; unsigned long irqflags; u16 trunc_msg_len = 0; char prefix_buf[8]; u8 *recursion_ptr; u16 reserve_size; va_list args2; u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; if (!printk_enter_irqsave(recursion_ptr, irqflags)) return 0; /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is * close to the call of printk(). This provides a more deterministic * timestamp with respect to the caller. */ ts_nsec = local_clock(); caller_id = printk_caller_id(); /* * The sprintf needs to come first since the syslog prefix might be * passed in as a parameter. An extra byte must be reserved so that * later the vscnprintf() into the reserved buffer has room for the * terminating '\0', which is not counted by vsnprintf(). */ va_copy(args2, args); reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; va_end(args2); if (reserve_size > PRINTKRB_RECORD_MAX) reserve_size = PRINTKRB_RECORD_MAX; /* Extract log level or control flags. */ if (facility == 0) printk_parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dev_info) flags |= LOG_NEWLINE; if (is_printk_force_console()) flags |= LOG_FORCE_CON; if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) { text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, facility, &flags, fmt, args); r.info->text_len += text_len; if (flags & LOG_FORCE_CON) r.info->flags |= LOG_FORCE_CON; if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); } else { prb_commit(&e); } ret = text_len; goto out; } } /* * Explicitly initialize the record before every prb_reserve() call. * prb_reserve_in_last() and prb_reserve() purposely invalidate the * structure when they fail. */ prb_rec_init_wr(&r, reserve_size); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ truncate_msg(&reserve_size, &trunc_msg_len); prb_rec_init_wr(&r, reserve_size + trunc_msg_len); if (!prb_reserve(&e, prb, &r)) goto out; } /* fill message */ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; r.info->facility = facility; r.info->level = level & 7; r.info->flags = flags & 0x1f; r.info->ts_nsec = ts_nsec; r.info->caller_id = caller_id; if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* A message without a trailing newline can be continued. */ if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); ret = text_len + trunc_msg_len; out: printk_exit_irqrestore(recursion_ptr, irqflags); return ret; } /* * This acts as a one-way switch to allow legacy consoles to print from * the printk() caller context on a panic CPU. It also attempts to flush * the legacy consoles in this context. */ void printk_legacy_allow_panic_sync(void) { struct console_flush_type ft; legacy_allow_panic_sync = true; printk_get_console_flush_type(&ft); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } bool __read_mostly debug_non_panic_cpus; #ifdef CONFIG_PRINTK_CALLER static int __init debug_non_panic_cpus_setup(char *str) { debug_non_panic_cpus = true; pr_info("allow messages from non-panic CPUs in panic()\n"); return 0; } early_param("debug_non_panic_cpus", debug_non_panic_cpus_setup); module_param(debug_non_panic_cpus, bool, 0644); MODULE_PARM_DESC(debug_non_panic_cpus, "allow messages from non-panic CPUs in panic()"); #endif asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct console_flush_type ft; int printed_len; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) return 0; /* * The messages on the panic CPU are the most important. If * non-panic CPUs are generating any messages, they will be * silently dropped. */ if (other_cpu_in_panic() && !debug_non_panic_cpus && !panic_triggering_all_cpu_backtrace) return 0; printk_get_console_flush_type(&ft); /* If called from the scheduler, we can not call up(). */ if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; ft.legacy_offload |= ft.legacy_direct; ft.legacy_direct = false; } printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_direct) { /* * The caller may be holding system-critical or * timing-sensitive locks. Disable preemption during * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. */ preempt_disable(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers. With the * spinning variant, this context tries to take over the * printing from another printing context. */ if (console_trylock_spinning()) console_unlock(); preempt_enable(); } if (ft.legacy_offload) defer_console_output(); else wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); int vprintk_default(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); asmlinkage __visible int _printk(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk(fmt, args); va_end(args); return r; } EXPORT_SYMBOL(_printk); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ #define printk_time false #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 #define prb_next_seq(rb) 0 static u64 syslog_seq; static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK struct console *early_console; asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; char buf[512]; int n; if (!early_console) return; va_start(ap, fmt); n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); early_console->write(early_console, buf, n); } #endif static void set_user_specified(struct console_cmdline *c, bool user_specified) { if (!user_specified) return; /* * @c console was defined by the user on the command line. * Do not clear when added twice also by SPCR or the device tree. */ c->user_specified = true; /* At least one console defined by the user on the command line. */ console_set_on_cmdline = 1; } static int __add_preferred_console(const char *name, const short idx, const char *devname, char *options, char *brl_options, bool user_specified) { struct console_cmdline *c; int i; if (!name && !devname) return -EINVAL; /* * We use a signed short index for struct console for device drivers to * indicate a not yet assigned index or port. However, a negative index * value is not valid when the console name and index are defined on * the command line. */ if (name && idx < 0) return -EINVAL; /* * See if this tty is not yet registered, and * if we have a slot free. */ for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if ((name && strcmp(c->name, name) == 0 && c->index == idx) || (devname && strcmp(c->devname, devname) == 0)) { if (!brl_options) preferred_console = i; set_user_specified(c, user_specified); return 0; } } if (i == MAX_CMDLINECONSOLES) return -E2BIG; if (!brl_options) preferred_console = i; if (name) strscpy(c->name, name); if (devname) strscpy(c->devname, devname); c->options = options; set_user_specified(c, user_specified); braille_set_options(c, brl_options); c->index = idx; return 0; } static int __init console_msg_format_setup(char *str) { if (!strcmp(str, "syslog")) console_msg_format = MSG_FORMAT_SYSLOG; if (!strcmp(str, "default")) console_msg_format = MSG_FORMAT_DEFAULT; return 1; } __setup("console_msg_format=", console_msg_format_setup); /* * Set up a console. Called via do_early_param() in init/main.c * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { static_assert(sizeof(console_cmdline[0].devname) >= sizeof(console_cmdline[0].name) + 4); char buf[sizeof(console_cmdline[0].devname)]; char *brl_options = NULL; char *ttyname = NULL; char *devname = NULL; char *options; char *s; int idx; /* * console="" or console=null have been suggested as a way to * disable console output. Use ttynull that has been created * for exactly this purpose. */ if (str[0] == 0 || strcmp(str, "null") == 0) { __add_preferred_console("ttynull", 0, NULL, NULL, NULL, true); return 1; } if (_braille_console_setup(&str, &brl_options)) return 1; /* For a DEVNAME:0.0 style console the character device is unknown early */ if (strchr(str, ':')) devname = buf; else ttyname = buf; /* * Decode str into name, index, options. */ if (ttyname && isdigit(str[0])) scnprintf(buf, sizeof(buf), "ttyS%s", str); else strscpy(buf, str); options = strchr(str, ','); if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) strscpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) strscpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) if ((ttyname && isdigit(*s)) || *s == ',') break; /* @idx will get defined when devname matches. */ if (devname) idx = -1; else idx = simple_strtoul(s, NULL, 10); *s = 0; __add_preferred_console(ttyname, idx, devname, options, brl_options, true); return 1; } __setup("console=", console_setup); /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name * @idx: device index * @options: options for this console * * The last preferred console added will be used for kernel messages * and stdin/out/err for init. Normally this is used by console_setup * above to handle user-supplied console arguments; however it can also * be used by arch-specific code either to override the user or more * commonly to provide a default console (ie from PROM variables) when * the user has not supplied one. */ int add_preferred_console(const char *name, const short idx, char *options) { return __add_preferred_console(name, idx, NULL, options, NULL, false); } /** * match_devname_and_update_preferred_console - Update a preferred console * when matching devname is found. * @devname: DEVNAME:0.0 style device name * @name: Name of the corresponding console driver, e.g. "ttyS" * @idx: Console index, e.g. port number. * * The function checks whether a device with the given @devname is * preferred via the console=DEVNAME:0.0 command line option. * It fills the missing console driver name and console index * so that a later register_console() call could find (match) * and enable this device. * * It might be used when a driver subsystem initializes particular * devices with already known DEVNAME:0.0 style names. And it * could predict which console driver name and index this device * would later get associated with. * * Return: 0 on success, negative error code on failure. */ int match_devname_and_update_preferred_console(const char *devname, const char *name, const short idx) { struct console_cmdline *c = console_cmdline; int i; if (!devname || !strlen(devname) || !name || !strlen(name) || idx < 0) return -EINVAL; for (i = 0; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if (!strcmp(devname, c->devname)) { pr_info("associate the preferred console \"%s\" with \"%s%d\"\n", devname, name, idx); strscpy(c->name, name); c->index = idx; return 0; } } return -ENOENT; } EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console); bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); module_param_named(console_suspend, console_suspend_enabled, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(console_suspend, "suspend console during suspend" " and hibernate operations"); static bool printk_console_no_auto_verbose; void console_verbose(void) { if (console_loglevel && !printk_console_no_auto_verbose) console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } EXPORT_SYMBOL_GPL(console_verbose); module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644); MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); /** * console_suspend_all - suspend the console subsystem * * This disables printk() while we go into suspend states */ void console_suspend_all(void) { struct console *con; if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); pr_flush(1000, true); console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags | CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see that they are suspended so that it * is guaranteed that all printing has stopped when this function * completes. */ synchronize_srcu(&console_srcu); } void console_resume_all(void) { struct console_flush_type ft; struct console *con; if (!console_suspend_enabled) return; console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see they are no longer suspended so * that they are guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); pr_flush(1000, true); } /** * console_cpu_notify - print deferred console messages after CPU hotplug * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be printed on the console only if there are CON_ANYTIME consoles. * This function is called when a new CPU comes online (or fails to come * up) or goes offline. */ static int console_cpu_notify(unsigned int cpu) { struct console_flush_type ft; if (!cpuhp_tasks_frozen) { printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } return 0; } /** * console_lock - block the console subsystem from printing * * Acquires a lock which guarantees that no consoles will * be in or enter their write() callback. * * Can sleep, returns nothing. */ void console_lock(void) { might_sleep(); /* On panic, the console_lock must be left to the panic cpu. */ while (other_cpu_in_panic()) msleep(1000); down_console_sem(); console_locked = 1; console_may_schedule = 1; } EXPORT_SYMBOL(console_lock); /** * console_trylock - try to block the console subsystem from printing * * Try to acquire a lock which guarantees that no consoles will * be in or enter their write() callback. * * returns 1 on success, and 0 on failure to acquire the lock. */ int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ if (other_cpu_in_panic()) return 0; if (down_trylock_console_sem()) return 0; console_locked = 1; console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); int is_console_locked(void) { return console_locked; } EXPORT_SYMBOL(is_console_locked); static void __console_unlock(void) { console_locked = 0; up_console_sem(); } #ifdef CONFIG_PRINTK /* * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting * the existing message over and inserting the scratchbuf message. * * @pmsg is the original printk message. * @fmt is the printf format of the message which will prepend the existing one. * * If there is not enough space in @pmsg->pbufs->outbuf, the existing * message text will be sufficiently truncated. * * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. */ __printf(2, 3) static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; va_list args; size_t len; va_start(args, fmt); len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args); va_end(args); /* * Make sure outbuf is sufficiently large before prepending. * Keep at least the prefix when the message must be truncated. * It is a rather theoretical problem when someone tries to * use a minimalist buffer. */ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz)) return; if (pmsg->outbuf_len + len >= outbuf_sz) { /* Truncate the message, but keep it terminated. */ pmsg->outbuf_len = outbuf_sz - (len + 1); outbuf[pmsg->outbuf_len] = 0; } memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1); memcpy(outbuf, scratchbuf, len); pmsg->outbuf_len += len; } /* * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. * * @dropped is the dropped count to report in the dropped message. */ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) { console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped); } /* * Prepend the message in @pmsg->pbufs->outbuf with a "replay message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. */ void console_prepend_replay(struct printk_message *pmsg) { console_prepend_message(pmsg, "** replaying previous printk message **\n"); } /* * Read and format the specified record (or a later record if the specified * record is not available). * * @pmsg will contain the formatted result. @pmsg->pbufs must point to a * struct printk_buffers. * * @seq is the record to read and format. If it is not available, the next * valid record is read. * * @is_extended specifies if the message should be formatted for extended * console output. * * @may_supress specifies if records may be skipped based on loglevel. * * Returns false if no record is available. Otherwise true and all fields * of @pmsg are valid. (See the documentation of struct printk_message * for information about the @pmsg fields.) */ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_suppress) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; struct printk_info info; struct printk_record r; size_t len = 0; bool force_con; /* * Formatting extended messages requires a separate buffer, so use the * scratch buffer to read in the ringbuffer text. * * Formatting normal messages is done in-place, so read the ringbuffer * text directly into the output buffer. */ if (is_extended) prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz); else prb_rec_init_rd(&r, &info, outbuf, outbuf_sz); if (!prb_read_valid(prb, seq, &r)) return false; pmsg->seq = r.info->seq; pmsg->dropped = r.info->seq - seq; force_con = r.info->flags & LOG_FORCE_CON; /* * Skip records that are not forced to be printed on consoles and that * has level above the console loglevel. */ if (!force_con && may_suppress && suppress_message_printing(r.info->level)) goto out; if (is_extended) { len = info_print_ext_header(outbuf, outbuf_sz, r.info); len += msg_print_ext_body(outbuf + len, outbuf_sz - len, &r.text_buf[0], r.info->text_len, &r.info->dev_info); } else { len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } out: pmsg->outbuf_len = len; return true; } /* * Legacy console printing from printk() caller context does not respect * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a * false positive. For PREEMPT_RT the false positive condition does not * occur. * * This map is used to temporarily establish LD_WAIT_SLEEP context for the * console write() callback when legacy printing to avoid false positive * lockdep complaints, thus allowing lockdep to continue to function for * real issues. */ #ifdef CONFIG_PREEMPT_RT static inline void printk_legacy_allow_spinlock_enter(void) { } static inline void printk_legacy_allow_spinlock_exit(void) { } #else static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); static inline void printk_legacy_allow_spinlock_enter(void) { lock_map_acquire_try(&printk_legacy_map); } static inline void printk_legacy_allow_spinlock_exit(void) { lock_map_release(&printk_legacy_map); } #endif /* CONFIG_PREEMPT_RT */ /* * Used as the printk buffers for non-panic, serialized console printing. * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. * Its usage requires the console_lock held. */ struct printk_buffers printk_shared_pbufs; /* * Print one record for the given console. The record printed is whatever * record is the next available record for the given console. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding both the * console_lock and the SRCU read lock. Otherwise it is set to false. * * @cookie is the cookie from the SRCU read lock. * * Returns false if the given console has no next record to print, otherwise * true. * * Requires the console_lock and the SRCU read lock. */ static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; char *outbuf = &printk_shared_pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &printk_shared_pbufs, }; unsigned long flags; *handover = false; if (!printk_get_next_message(&pmsg, con->seq, is_extended, true)) return false; con->dropped += pmsg.dropped; /* Skip messages of formatted length 0. */ if (pmsg.outbuf_len == 0) { con->seq = pmsg.seq + 1; goto skip; } if (con->dropped && !is_extended) { console_prepend_dropped(&pmsg, con->dropped); con->dropped = 0; } /* Write everything out to the hardware. */ if (force_legacy_kthread() && !panic_in_progress()) { /* * With forced threading this function is in a task context * (either legacy kthread or get_init_console_seq()). There * is no need for concern about printk reentrance, handovers, * or lockdep complaints. */ con->write(con, outbuf, pmsg.outbuf_len); con->seq = pmsg.seq + 1; } else { /* * While actively printing out messages, if another printk() * were to occur on another CPU, it may wait for this one to * finish. This task can not be preempted if there is a * waiter waiting to take over. * * Interrupts are disabled because the hand over to a waiter * must not be interrupted until the hand over is completed * (@console_waiter is cleared). */ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); /* Do not trace print latency. */ stop_critical_timings(); printk_legacy_allow_spinlock_enter(); con->write(con, outbuf, pmsg.outbuf_len); printk_legacy_allow_spinlock_exit(); start_critical_timings(); con->seq = pmsg.seq + 1; *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); } skip: return true; } #else static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { *handover = false; return false; } static inline void printk_kthreads_check_locked(void) { } #endif /* CONFIG_PRINTK */ /* * Print out all remaining records to all consoles. * * @do_cond_resched is set by the caller. It can be true only in schedulable * context. * * @next_seq is set to the sequence number after the last available record. * The value is valid only when this function returns true. It means that all * usable consoles are completely flushed. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the * console_lock. Otherwise it is set to false. * * Returns true when there was at least one usable console and all messages * were flushed to all usable consoles. A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this * is not the panic CPU). Regardless the reason, the caller should assume it * is not useful to immediately try again. * * Requires the console_lock. */ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) { struct console_flush_type ft; bool any_usable = false; struct console *con; bool any_progress; int cookie; *next_seq = 0; *handover = false; do { any_progress = false; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; bool progress; /* * console_flush_all() is only responsible for nbcon * consoles when the nbcon consoles cannot print via * their atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, !do_cond_resched)) continue; any_usable = true; if (flags & CON_NBCON) { progress = nbcon_legacy_emit_next_record(con, handover, cookie, !do_cond_resched); printk_seq = nbcon_seq_read(con); } else { progress = console_emit_next_record(con, handover, cookie); printk_seq = con->seq; } /* * If a handover has occurred, the SRCU read lock * is already released. */ if (*handover) return false; /* Track the next of the highest seq flushed. */ if (printk_seq > *next_seq) *next_seq = printk_seq; if (!progress) continue; any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ if (other_cpu_in_panic()) goto abandon; if (do_cond_resched) cond_resched(); } console_srcu_read_unlock(cookie); } while (any_progress); return any_usable; abandon: console_srcu_read_unlock(cookie); return false; } static void __console_flush_and_unlock(void) { bool do_cond_resched; bool handover; bool flushed; u64 next_seq; /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may * end up dumping a lot of lines, for example, if called from * console registration path, and should invoke cond_resched() * between lines if allowable. Not doing so can cause a very long * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more * messages practically incapacitating the system. Therefore, create * a local to use for the printing loop. */ do_cond_resched = console_may_schedule; do { console_may_schedule = 0; flushed = console_flush_all(do_cond_resched, &next_seq, &handover); if (!handover) __console_unlock(); /* * Abort if there was a failure to flush all messages to all * usable consoles. Either it is not possible to flush (in * which case it would be an infinite loop of retrying) or * another context has taken over printing. */ if (!flushed) break; /* * Some context may have added new records after * console_flush_all() but before unlocking the console. * Re-check if there is a new record to flush. If the trylock * fails, another context is already handling the printing. */ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } /** * console_unlock - unblock the legacy console subsystem from printing * * Releases the console_lock which the caller holds to block printing of * the legacy console subsystem. * * While the console_lock was held, console output may have been buffered * by printk(). If this is the case, console_unlock() emits the output on * legacy consoles prior to releasing the lock. * * console_unlock(); may be called from any context. */ void console_unlock(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (ft.legacy_direct) __console_flush_and_unlock(); else __console_unlock(); } EXPORT_SYMBOL(console_unlock); /** * console_conditional_schedule - yield the CPU if required * * If the console code is currently allowed to sleep, and * if this CPU should yield the CPU to another task, do * so here. * * Must be called within console_lock();. */ void __sched console_conditional_schedule(void) { if (console_may_schedule) cond_resched(); } EXPORT_SYMBOL(console_conditional_schedule); void console_unblank(void) { bool found_unblank = false; struct console *c; int cookie; /* * First check if there are any consoles implementing the unblank() * callback. If not, there is no reason to continue and take the * console lock, which in particular can be dangerous if * @oops_in_progress is set. */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { short flags = console_srcu_read_flags(c); if (flags & CON_SUSPENDED) continue; if ((flags & CON_ENABLED) && c->unblank) { found_unblank = true; break; } } console_srcu_read_unlock(cookie); if (!found_unblank) return; /* * Stop console printing because the unblank() callback may * assume the console is not within its write() callback. * * If @oops_in_progress is set, this may be an atomic context. * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { /* Semaphores are not NMI-safe. */ if (in_nmi()) return; /* * Attempting to trylock the console lock can deadlock * if another CPU was stopped while modifying the * semaphore. "Hope and pray" that this is not the * current situation. */ if (down_trylock_console_sem() != 0) return; } else console_lock(); console_locked = 1; console_may_schedule = 0; cookie = console_srcu_read_lock(); for_each_console_srcu(c) { short flags = console_srcu_read_flags(c); if (flags & CON_SUSPENDED) continue; if ((flags & CON_ENABLED) && c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); console_unlock(); if (!oops_in_progress) pr_flush(1000, true); } /* * Rewind all consoles to the oldest available record. * * IMPORTANT: The function is safe only when called under * console_lock(). It is not enforced because * it is used as a best effort in panic(). */ static void __console_rewind_all(void) { struct console *c; short flags; int cookie; u64 seq; seq = prb_first_valid_seq(prb); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { flags = console_srcu_read_flags(c); if (flags & CON_NBCON) { nbcon_seq_force(c, seq); } else { /* * This assignment is safe only when called under * console_lock(). On panic, legacy consoles are * only best effort. */ c->seq = seq; } } console_srcu_read_unlock(cookie); } /** * console_flush_on_panic - flush console content on panic * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ void console_flush_on_panic(enum con_flush_mode mode) { struct console_flush_type ft; bool handover; u64 next_seq; /* * Ignore the console lock and flush out the messages. Attempting a * trylock would not be useful because: * * - if it is contended, it must be ignored anyway * - console_lock() and console_trylock() block and fail * respectively in panic for non-panic CPUs * - semaphores are not NMI-safe */ /* * If another context is holding the console lock, * @console_may_schedule might be set. Clear it so that * this context does not call cond_resched() while flushing. */ console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) __console_rewind_all(); printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); /* Flush legacy consoles once allowed, even when dangerous. */ if (legacy_allow_panic_sync) console_flush_all(false, &next_seq, &handover); } /* * Return the console tty driver structure and its associated index */ struct tty_driver *console_device(int *index) { struct console *c; struct tty_driver *driver = NULL; int cookie; /* * Take console_lock to serialize device() callback with * other console operations. For example, fg_console is * modified under console_lock when switching vt. */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (!c->device) continue; driver = c->device(c, index); if (driver) break; } console_srcu_read_unlock(cookie); console_unlock(); return driver; } /* * Prevent further output on the passed console device so that (for example) * serial drivers can suspend console output before suspending a port, and can * re-enable output afterwards. */ void console_suspend(struct console *console) { __pr_flush(console, 1000, true); console_list_lock(); console_srcu_write_flags(console, console->flags & ~CON_ENABLED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All contexts must * be able to see that this console is disabled so that (for example) * the caller can suspend the port without risk of another context * using the port. */ synchronize_srcu(&console_srcu); } EXPORT_SYMBOL(console_suspend); void console_resume(struct console *console) { struct console_flush_type ft; bool is_nbcon; console_list_lock(); console_srcu_write_flags(console, console->flags | CON_ENABLED); is_nbcon = console->flags & CON_NBCON; console_list_unlock(); /* * Ensure that all SRCU list walks have completed. The related * printing context must be able to see it is enabled so that * it is guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (is_nbcon && ft.nbcon_offload) nbcon_kthread_wake(console); else if (ft.legacy_offload) defer_console_output(); __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_resume); #ifdef CONFIG_PRINTK static int unregister_console_locked(struct console *console); /* True when system boot is far enough to create printer threads. */ static bool printk_kthreads_ready __ro_after_init; static struct task_struct *printk_legacy_kthread; static bool legacy_kthread_should_wakeup(void) { struct console_flush_type ft; struct console *con; bool ret = false; int cookie; if (kthread_should_stop()) return true; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; /* * The legacy printer thread is only responsible for nbcon * consoles when the nbcon consoles cannot print via their * atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, false)) continue; if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(con); } else { /* * It is safe to read @seq because only this * thread context updates @seq. */ printk_seq = con->seq; } if (prb_read_valid(prb, printk_seq, NULL)) { ret = true; break; } } console_srcu_read_unlock(cookie); return ret; } static int legacy_kthread_func(void *unused) { for (;;) { wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); if (kthread_should_stop()) break; console_lock(); __console_flush_and_unlock(); } return 0; } static bool legacy_kthread_create(void) { struct task_struct *kt; lockdep_assert_console_list_lock_held(); kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy"); if (WARN_ON(IS_ERR(kt))) { pr_err("failed to start legacy printing thread\n"); return false; } printk_legacy_kthread = kt; /* * It is important that console printing threads are scheduled * shortly after a printk call and with generous runtime budgets. */ sched_set_normal(printk_legacy_kthread, -20); return true; } /** * printk_kthreads_shutdown - shutdown all threaded printers * * On system shutdown all threaded printers are stopped. This allows printk * to transition back to atomic printing, thus providing a robust mechanism * for the final shutdown/reboot messages to be output. */ static void printk_kthreads_shutdown(void) { struct console *con; console_list_lock(); if (printk_kthreads_running) { printk_kthreads_running = false; for_each_console(con) { if (con->flags & CON_NBCON) nbcon_kthread_stop(con); } /* * The threads may have been stopped while printing a * backlog. Flush any records left over. */ nbcon_atomic_flush_pending(); } console_list_unlock(); } static struct syscore_ops printk_syscore_ops = { .shutdown = printk_kthreads_shutdown, }; /* * If appropriate, start nbcon kthreads and set @printk_kthreads_running. * If any kthreads fail to start, those consoles are unregistered. * * Must be called under console_list_lock(). */ static void printk_kthreads_check_locked(void) { struct hlist_node *tmp; struct console *con; lockdep_assert_console_list_lock_held(); if (!printk_kthreads_ready) return; if (have_legacy_console || have_boot_console) { if (!printk_legacy_kthread && force_legacy_kthread() && !legacy_kthread_create()) { /* * All legacy consoles must be unregistered. If there * are any nbcon consoles, they will set up their own * kthread. */ hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_NBCON) continue; unregister_console_locked(con); } } } else if (printk_legacy_kthread) { kthread_stop(printk_legacy_kthread); printk_legacy_kthread = NULL; } /* * Printer threads cannot be started as long as any boot console is * registered because there is no way to synchronize the hardware * registers between boot console code and regular console code. * It can only be known that there will be no new boot consoles when * an nbcon console is registered. */ if (have_boot_console || !have_nbcon_console) { /* Clear flag in case all nbcon consoles unregistered. */ printk_kthreads_running = false; return; } if (printk_kthreads_running) return; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_NBCON)) continue; if (!nbcon_kthread_create(con)) unregister_console_locked(con); } printk_kthreads_running = true; } static int __init printk_set_kthreads_ready(void) { register_syscore_ops(&printk_syscore_ops); console_list_lock(); printk_kthreads_ready = true; printk_kthreads_check_locked(); console_list_unlock(); return 0; } early_initcall(printk_set_kthreads_ready); #endif /* CONFIG_PRINTK */ static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) { keep_bootcon = 1; pr_info("debug: skip boot console de-registration.\n"); return 0; } early_param("keep_bootcon", keep_bootcon_setup); static int console_call_setup(struct console *newcon, char *options) { int err; if (!newcon->setup) return 0; /* Synchronize with possible boot console. */ console_lock(); err = newcon->setup(newcon, options); console_unlock(); return err; } /* * This is called by register_console() to try to match * the newly registered console with any of the ones selected * by either the command line or add_preferred_console() and * setup/enable it. * * Care need to be taken with consoles that are statically * enabled such as netconsole */ static int try_enable_preferred_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; int i, err; for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { /* Console not yet initialized? */ if (!c->name[0]) continue; if (c->user_specified != user_specified) continue; if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && newcon->index != c->index) continue; if (newcon->index < 0) newcon->index = c->index; if (_braille_register_console(newcon, c)) return 0; err = console_call_setup(newcon, c->options); if (err) return err; } newcon->flags |= CON_ENABLED; if (i == preferred_console) newcon->flags |= CON_CONSDEV; return 0; } /* * Some consoles, such as pstore and netconsole, can be enabled even * without matching. Accept the pre-enabled consoles only when match() * and setup() had a chance to be called. */ if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; return -ENOENT; } /* Try to enable the console unconditionally */ static void try_enable_default_console(struct console *newcon) { if (newcon->index < 0) newcon->index = 0; if (console_call_setup(newcon, NULL) != 0) return; newcon->flags |= CON_ENABLED; if (newcon->device) newcon->flags |= CON_CONSDEV; } /* Return the starting sequence number for a newly registered console. */ static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered) { struct console *con; bool handover; u64 init_seq; if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); init_seq = syslog_seq; mutex_unlock(&syslog_lock); } else { /* Begin with next message added to ringbuffer. */ init_seq = prb_next_seq(prb); /* * If any enabled boot consoles are due to be unregistered * shortly, some may not be caught up and may be the same * device as @newcon. Since it is not known which boot console * is the same device, flush all consoles and, if necessary, * start with the message of the enabled boot console that is * the furthest behind. */ if (bootcon_registered && !keep_bootcon) { /* * Hold the console_lock to stop console printing and * guarantee safe access to console->seq. */ console_lock(); /* * Flush all consoles and set the console to start at * the next unprinted sequence number. */ if (!console_flush_all(true, &init_seq, &handover)) { /* * Flushing failed. Just choose the lowest * sequence of the enabled boot consoles. */ /* * If there was a handover, this context no * longer holds the console_lock. */ if (handover) console_lock(); init_seq = prb_next_seq(prb); for_each_console(con) { u64 seq; if (!(con->flags & CON_BOOT) || !(con->flags & CON_ENABLED)) { continue; } if (con->flags & CON_NBCON) seq = nbcon_seq_read(con); else seq = con->seq; if (seq < init_seq) init_seq = seq; } } console_unlock(); } } return init_seq; } #define console_first() \ hlist_entry(console_list.first, struct console, node) static int unregister_console_locked(struct console *console); /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the * console driver was initialized. * * This can happen pretty early during the boot process (because of * early_printk) - sometimes before setup_arch() completes - be careful * of what kernel features are used - they may not be initialised yet. * * There are two types of consoles - bootconsoles (early_printk) and * "real" consoles (everything which is not a bootconsole) which are * handled differently. * - Any number of bootconsoles can be registered at any time. * - As soon as a "real" console is registered, all bootconsoles * will be unregistered automatically. * - Once a "real" console is registered, any attempt to register a * bootconsoles will be rejected */ void register_console(struct console *newcon) { bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic; bool bootcon_registered = false; bool realcon_registered = false; struct console *con; unsigned long flags; u64 init_seq; int err; console_list_lock(); for_each_console(con) { if (WARN(con == newcon, "console '%s%d' already registered\n", con->name, con->index)) { goto unlock; } if (con->flags & CON_BOOT) bootcon_registered = true; else realcon_registered = true; } /* Do not register boot consoles when there already is a real one. */ if ((newcon->flags & CON_BOOT) && realcon_registered) { pr_info("Too late to register bootconsole %s%d\n", newcon->name, newcon->index); goto unlock; } if (newcon->flags & CON_NBCON) { /* * Ensure the nbcon console buffers can be allocated * before modifying any global data. */ if (!nbcon_alloc(newcon)) goto unlock; } /* * See if we want to enable this console driver by default. * * Nope when a console is preferred by the command line, device * tree, or SPCR. * * The first real console with tty binding (driver) wins. More * consoles might get enabled before the right one is found. * * Note that a console with tty binding will have CON_CONSDEV * flag set and will be first in the list. */ if (preferred_console < 0) { if (hlist_empty(&console_list) || !console_first()->device || console_first()->flags & CON_BOOT) { try_enable_default_console(newcon); } } /* See if this console matches one we selected on the command line */ err = try_enable_preferred_console(newcon, true); /* If not, try to match against the platform default(s) */ if (err == -ENOENT) err = try_enable_preferred_console(newcon, false); /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) { if (newcon->flags & CON_NBCON) nbcon_free(newcon); goto unlock; } /* * If we have a bootconsole, and are switching to a real console, * don't print everything out again, since when the boot console, and * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; } newcon->dropped = 0; init_seq = get_init_console_seq(newcon, bootcon_registered); if (newcon->flags & CON_NBCON) { have_nbcon_console = true; nbcon_seq_force(newcon, init_seq); } else { have_legacy_console = true; newcon->seq = init_seq; } if (newcon->flags & CON_BOOT) have_boot_console = true; /* * If another context is actively using the hardware of this new * console, it will not be aware of the nbcon synchronization. This * is a risk that two contexts could access the hardware * simultaneously if this new console is used for atomic printing * and the other context is still using the hardware. * * Use the driver synchronization to ensure that the hardware is not * in use while this new console transitions to being registered. */ if (use_device_lock) newcon->device_lock(newcon, &flags); /* * Put this console in the list - keep the * preferred driver at the head of the list. */ if (hlist_empty(&console_list)) { /* Ensure CON_CONSDEV is always set for the head. */ newcon->flags |= CON_CONSDEV; hlist_add_head_rcu(&newcon->node, &console_list); } else if (newcon->flags & CON_CONSDEV) { /* Only the new head can have CON_CONSDEV set. */ console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV); hlist_add_head_rcu(&newcon->node, &console_list); } else { hlist_add_behind_rcu(&newcon->node, console_list.first); } /* * No need to synchronize SRCU here! The caller does not rely * on all contexts being able to see the new console before * register_console() completes. */ /* This new console is now registered. */ if (use_device_lock) newcon->device_unlock(newcon, flags); console_sysfs_notify(); /* * By unregistering the bootconsoles after we enable the real console * we get the "console xxx enabled" message on all the consoles - * boot consoles, real consoles, etc - this is to ensure that end * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ con_printk(KERN_INFO, newcon, "enabled\n"); if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { struct hlist_node *tmp; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_BOOT) unregister_console_locked(con); } } /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); unlock: console_list_unlock(); } EXPORT_SYMBOL(register_console); /* Must be called under console_list_lock(). */ static int unregister_console_locked(struct console *console) { bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic; bool found_legacy_con = false; bool found_nbcon_con = false; bool found_boot_con = false; unsigned long flags; struct console *c; int res; lockdep_assert_console_list_lock_held(); con_printk(KERN_INFO, console, "disabled\n"); res = _braille_unregister_console(console); if (res < 0) return res; if (res > 0) return 0; if (!console_is_registered_locked(console)) res = -ENODEV; else if (console_is_usable(console, console->flags, true)) __pr_flush(console, 1000, true); /* Disable it unconditionally */ console_srcu_write_flags(console, console->flags & ~CON_ENABLED); if (res < 0) return res; /* * Use the driver synchronization to ensure that the hardware is not * in use while this console transitions to being unregistered. */ if (use_device_lock) console->device_lock(console, &flags); hlist_del_init_rcu(&console->node); if (use_device_lock) console->device_unlock(console, flags); /* * <HISTORICAL> * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. * </HISTORICAL> * * The above makes no sense as there is no guarantee that the next * console has any device attached. Oh well.... */ if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV) console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV); /* * Ensure that all SRCU list walks have completed. All contexts * must not be able to see this console in the list so that any * exit/cleanup routines can be performed safely. */ synchronize_srcu(&console_srcu); if (console->flags & CON_NBCON) nbcon_free(console); console_sysfs_notify(); if (console->exit) res = console->exit(console); /* * With this console gone, the global flags tracking registered * console types may have changed. Update them. */ for_each_console(c) { if (c->flags & CON_BOOT) found_boot_con = true; if (c->flags & CON_NBCON) found_nbcon_con = true; else found_legacy_con = true; } if (!found_boot_con) have_boot_console = found_boot_con; if (!found_legacy_con) have_legacy_console = found_legacy_con; if (!found_nbcon_con) have_nbcon_console = found_nbcon_con; /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); return res; } int unregister_console(struct console *console) { int res; console_list_lock(); res = unregister_console_locked(console); console_list_unlock(); return res; } EXPORT_SYMBOL(unregister_console); /** * console_force_preferred_locked - force a registered console preferred * @con: The registered console to force preferred. * * Must be called under console_list_lock(). */ void console_force_preferred_locked(struct console *con) { struct console *cur_pref_con; if (!console_is_registered_locked(con)) return; cur_pref_con = console_first(); /* Already preferred? */ if (cur_pref_con == con) return; /* * Delete, but do not re-initialize the entry. This allows the console * to continue to appear registered (via any hlist_unhashed_lockless() * checks), even though it was briefly removed from the console list. */ hlist_del_rcu(&con->node); /* * Ensure that all SRCU list walks have completed so that the console * can be added to the beginning of the console list and its forward * list pointer can be re-initialized. */ synchronize_srcu(&console_srcu); con->flags |= CON_CONSDEV; WARN_ON(!con->device); /* Only the new head can have CON_CONSDEV set. */ console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV); hlist_add_head_rcu(&con->node, &console_list); } EXPORT_SYMBOL(console_force_preferred_locked); /* * Initialize the console device. This is called *early*, so * we can't necessarily depend on lots of kernel help here. * Just do some early initializations, and do the complex setup * later. */ void __init console_init(void) { int ret; initcall_t call; initcall_entry_t *ce; #ifdef CONFIG_NULL_TTY_DEFAULT_CONSOLE if (!console_set_on_cmdline) add_preferred_console("ttynull", 0, NULL); #endif /* Setup the default TTY line discipline. */ n_tty_init(); /* * set up the console device so that later boot sequences can * inform about problems etc.. */ ce = __con_initcall_start; trace_initcall_level("console"); while (ce < __con_initcall_end) { call = initcall_from_entry(ce); trace_initcall_start(call); ret = call(); trace_initcall_finish(call, ret); ce++; } } /* * Some boot consoles access data that is in the init section and which will * be discarded after the initcalls have been run. To make sure that no code * will access this data, unregister the boot consoles in a late initcall. * * If for some reason, such as deferred probe or the driver being a loadable * module, the real console hasn't registered yet at this point, there will * be a brief interval in which no messages are logged to the console, which * makes it difficult to diagnose problems that occur during this time. * * To mitigate this problem somewhat, only unregister consoles whose memory * intersects with the init section. Note that all other boot consoles will * get unregistered when the real preferred console is registered. */ static int __init printk_late_init(void) { struct hlist_node *tmp; struct console *con; int ret; console_list_lock(); hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_BOOT)) continue; /* Check addresses that might be used for enabled consoles. */ if (init_section_intersects(con, sizeof(*con)) || init_section_contains(con->write, 0) || init_section_contains(con->read, 0) || init_section_contains(con->device, 0) || init_section_contains(con->unblank, 0) || init_section_contains(con->data, 0)) { /* * Please, consider moving the reported consoles out * of the init section. */ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n", con->name, con->index); unregister_console_locked(con); } } console_list_unlock(); ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); printk_sysctl_init(); return 0; } late_initcall(printk_late_init); #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. */ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms); unsigned long remaining_jiffies = timeout_jiffies; struct console_flush_type ft; struct console *c; u64 last_diff = 0; u64 printk_seq; short flags; int cookie; u64 diff; u64 seq; /* Sorry, pr_flush() will not work this early. */ if (system_state < SYSTEM_SCHEDULING) return false; might_sleep(); seq = prb_next_reserve_seq(prb); /* Flush the consoles so that records up to @seq are printed. */ printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { console_lock(); console_unlock(); } for (;;) { unsigned long begin_jiffies; unsigned long slept_jiffies; diff = 0; /* * Hold the console_lock to guarantee safe access to * console->seq. Releasing console_lock flushes more * records in case @seq is still not printed on all * usable consoles. * * Holding the console_lock is not necessary if there * are no legacy or boot consoles. However, such a * console could register at any time. Always hold the * console_lock as a precaution rather than * synchronizing against register_console(). */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (con && con != c) continue; flags = console_srcu_read_flags(c); /* * If consoles are not usable, it cannot be expected * that they make forward progress, so only increment * @diff for usable consoles. */ if (!console_is_usable(c, flags, true) && !console_is_usable(c, flags, false)) { continue; } if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(c); } else { printk_seq = c->seq; } if (printk_seq < seq) diff += seq - printk_seq; } console_srcu_read_unlock(cookie); if (diff != last_diff && reset_on_progress) remaining_jiffies = timeout_jiffies; console_unlock(); /* Note: @diff is 0 if there are no usable consoles. */ if (diff == 0 || remaining_jiffies == 0) break; /* msleep(1) might sleep much longer. Check time by jiffies. */ begin_jiffies = jiffies; msleep(1); slept_jiffies = jiffies - begin_jiffies; remaining_jiffies -= min(slept_jiffies, remaining_jiffies); last_diff = diff; } return (diff == 0); } /** * pr_flush() - Wait for printing threads to catch up. * * @timeout_ms: The maximum time (in ms) to wait. * @reset_on_progress: Reset the timeout if forward progress is seen. * * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 * represents infinite waiting. * * If @reset_on_progress is true, the timeout will be reset whenever any * printer has been seen to make some forward progress. * * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. */ bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } /* * Delayed printk version, for scheduler-internal messages: */ #define PRINTK_PENDING_WAKEUP 0x01 #define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { if (force_legacy_kthread()) { if (printk_legacy_kthread) wake_up_interruptible(&legacy_wait); } else { if (console_trylock()) console_unlock(); } } if (pending & PRINTK_PENDING_WAKEUP) wake_up_interruptible(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); static void __wake_up_klogd(int val) { if (!printk_percpu_data_ready()) return; preempt_disable(); /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the wait queue is empty. * * The full memory barrier within wq_has_sleeper() pairs with the full * memory barrier within set_current_state() of * prepare_to_wait_event(), which is called after ___wait_event() adds * the waiter but before it has checked the wait condition. * * This pairs with devkmsg_read:A and syslog_print:A. */ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ (val & PRINTK_PENDING_OUTPUT)) { this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } /** * wake_up_klogd - Wake kernel logging daemon * * Use this function when new records have been added to the ringbuffer * and the console printing of those records has already occurred or is * known to be handled by some other context. This function will only * wake the logging daemon. * * Context: Any context. */ void wake_up_klogd(void) { __wake_up_klogd(PRINTK_PENDING_WAKEUP); } /** * defer_console_output - Wake kernel logging daemon and trigger * console printing in a deferred context * * Use this function when new records have been added to the ringbuffer, * this context is responsible for console printing those records, but * the current context is not allowed to perform the console printing. * Trigger an irq_work context to perform the console printing. This * function also wakes the logging daemon. * * Context: Any context. */ void defer_console_output(void) { /* * New messages may have been added directly to the ringbuffer * using vprintk_store(), so wake any waiters as well. */ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } void printk_trigger_flush(void) { defer_console_output(); } int vprintk_deferred(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); } int _printk_deferred(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_deferred(fmt, args); va_end(args); return r; } /* * printk rate limiting, lifted from the networking subsystem. * * This enforces a rate limit: not more than 10 kernel messages * every 5s to make a denial-of-service attack impossible. */ DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); int __printk_ratelimit(const char *func) { return ___ratelimit(&printk_ratelimit_state, func); } EXPORT_SYMBOL(__printk_ratelimit); /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state * @interval_msecs: minimum interval between prints * * printk_timed_ratelimit() returns true if more than @interval_msecs * milliseconds have elapsed since the last time printk_timed_ratelimit() * returned true. */ bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { unsigned long elapsed = jiffies - *caller_jiffies; if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) return false; *caller_jiffies = jiffies; return true; } EXPORT_SYMBOL(printk_timed_ratelimit); static DEFINE_SPINLOCK(dump_list_lock); static LIST_HEAD(dump_list); /** * kmsg_dump_register - register a kernel log dumper. * @dumper: pointer to the kmsg_dumper structure * * Adds a kernel log dumper to the system. The dump callback in the * structure will be called when the kernel oopses or panics and must be * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. */ int kmsg_dump_register(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EBUSY; /* The dump callback needs to be set */ if (!dumper->dump) return -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); /* Don't allow registering multiple times */ if (!dumper->registered) { dumper->registered = 1; list_add_tail_rcu(&dumper->list, &dump_list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_register); /** * kmsg_dump_unregister - unregister a kmsg dumper. * @dumper: pointer to the kmsg_dumper structure * * Removes a dump device from the system. Returns zero on success and * %-EINVAL otherwise. */ int kmsg_dump_unregister(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); if (dumper->registered) { dumper->registered = 0; list_del_rcu(&dumper->list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) { switch (reason) { case KMSG_DUMP_PANIC: return "Panic"; case KMSG_DUMP_OOPS: return "Oops"; case KMSG_DUMP_EMERG: return "Emergency"; case KMSG_DUMP_SHUTDOWN: return "Shutdown"; default: return "Unknown"; } } EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); /** * kmsg_dump_desc - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * @desc: a short string to describe what caused the panic or oops. Can be NULL * if no additional description is available. * * Call each of the registered dumper's dump() callback, which can * retrieve the kmsg records with kmsg_dump_get_line() or * kmsg_dump_get_buffer(). */ void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc) { struct kmsg_dumper *dumper; struct kmsg_dump_detail detail = { .reason = reason, .description = desc}; rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { enum kmsg_dump_reason max_reason = dumper->max_reason; /* * If client has not provided a specific max_reason, default * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. */ if (max_reason == KMSG_DUMP_UNDEF) { max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : KMSG_DUMP_OOPS; } if (reason > max_reason) continue; /* invoke dumper which will iterate over records */ dumper->dump(dumper, &detail); } rcu_read_unlock(); } /** * kmsg_dump_get_line - retrieve one kmsg log line * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to * @size: maximum size of the buffer * @len: length of line placed into buffer * * Start at the beginning of the kmsg buffer, with the oldest kmsg * record, and copy one record into the provided buffer. * * Consecutive calls will return the next available record moving * towards the end of the buffer with the youngest messages. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, char *line, size_t size, size_t *len) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; unsigned int line_count; struct printk_record r; size_t l = 0; bool ret = false; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; prb_rec_init_rd(&r, &info, line, size); /* Read text or count text lines? */ if (line) { if (!prb_read_valid(prb, iter->cur_seq, &r)) goto out; l = record_print_text(&r, syslog, printk_time); } else { if (!prb_read_valid_info(prb, iter->cur_seq, &info, &line_count)) { goto out; } l = get_record_print_text_size(&info, line_count, syslog, printk_time); } iter->cur_seq = r.info->seq + 1; ret = true; out: if (len) *len = l; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** * kmsg_dump_get_buffer - copy kmsg log lines * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @buf: buffer to copy the line to * @size: maximum size of the buffer * @len_out: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer * with as many of the *youngest* kmsg records that fit into it. * If the buffer is large enough, all available kmsg records will be * copied with a single call. * * Consecutive calls will fill the buffer with the next block of * available older records, not including the earlier retrieved ones. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, char *buf, size_t size, size_t *len_out) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; struct printk_record r; u64 seq; u64 next_seq; size_t len = 0; bool ret = false; bool time = printk_time; if (!buf || !size) goto out; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { if (info.seq != iter->cur_seq) { /* messages are gone, move to first available one */ iter->cur_seq = info.seq; } } /* last entry */ if (iter->cur_seq >= iter->next_seq) goto out; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. Pass in size-1 * because this function (by way of record_print_text()) will * not write more than size-1 bytes of text into @buf. */ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, size - 1, syslog, time); /* * Next kmsg_dump_get_buffer() invocation will dump block of * older records stored right before this one. */ next_seq = seq; prb_rec_init_rd(&r, &info, buf, size); prb_for_each_record(seq, prb, seq, &r) { if (r.info->seq >= iter->next_seq) break; len += record_print_text(&r, syslog, time); /* Adjust record to store to remaining buffer space. */ prb_rec_init_rd(&r, &info, buf + len, size - len); } iter->next_seq = next_seq; ret = true; out: if (len_out) *len_out = len; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** * kmsg_dump_rewind - reset the iterator * @iter: kmsg dump iterator * * Reset the dumper's iterator so that kmsg_dump_get_line() and * kmsg_dump_get_buffer() can be called again and used multiple * times within the same dumper.dump() callback. */ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) { iter->cur_seq = latched_seq_read_nolock(&clear_seq); iter->next_seq = prb_next_seq(prb); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); /** * console_try_replay_all - try to replay kernel log on consoles * * Try to obtain lock on console subsystem and replay all * available records in printk buffer on the consoles. * Does nothing if lock is not obtained. * * Context: Any, except for NMI. */ void console_try_replay_all(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (console_trylock()) { __console_rewind_all(); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); /* Consoles are flushed as part of console_unlock(). */ console_unlock(); } } #endif #ifdef CONFIG_SMP static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); bool is_printk_cpu_sync_owner(void) { return (atomic_read(&printk_cpu_sync_owner) == raw_smp_processor_id()); } /** * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant * spinning lock is not owned by any CPU. * * Context: Any context. */ void __printk_cpu_sync_wait(void) { do { cpu_relax(); } while (atomic_read(&printk_cpu_sync_owner) != -1); } EXPORT_SYMBOL(__printk_cpu_sync_wait); /** * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the * lock, this function succeeds immediately. * * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ int __printk_cpu_sync_try_get(void) { int cpu; int old; cpu = smp_processor_id(); /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with * __printk_cpu_sync_put:B. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_put:A can never read from * __printk_cpu_sync_try_get:B. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of the previous CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of this CPU */ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, cpu); /* LMM(__printk_cpu_sync_try_get:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing * data: LMM(__printk_cpu_sync_try_get:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ atomic_inc(&printk_cpu_sync_nested); return 1; } return 0; } EXPORT_SYMBOL(__printk_cpu_sync_try_get); /** * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ void __printk_cpu_sync_put(void) { if (atomic_read(&printk_cpu_sync_nested)) { atomic_dec(&printk_cpu_sync_nested); return; } /* * This CPU is finished loading/storing data: * LMM(__printk_cpu_sync_put:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs * with __printk_cpu_sync_try_get:A. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of this CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of the next CPU */ atomic_set_release(&printk_cpu_sync_owner, -1); /* LMM(__printk_cpu_sync_put:B) */ } EXPORT_SYMBOL(__printk_cpu_sync_put); #endif /* CONFIG_SMP */
1485 1554 639 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_ALTERNATIVE_MACROS_H #define __ASM_ALTERNATIVE_MACROS_H #include <linux/const.h> #include <vdso/bits.h> #include <asm/cpucaps.h> #include <asm/insn-def.h> /* * Binutils 2.27.0 can't handle a 'UL' suffix on constants, so for the assembly * macros below we must use we must use `(1 << ARM64_CB_SHIFT)`. */ #define ARM64_CB_SHIFT 15 #define ARM64_CB_BIT BIT(ARM64_CB_SHIFT) #if ARM64_NCAPS >= ARM64_CB_BIT #error "cpucaps have overflown ARM64_CB_BIT" #endif #ifndef __ASSEMBLY__ #include <linux/stringify.h> #define ALTINSTR_ENTRY(cpucap) \ " .word 661b - .\n" /* label */ \ " .word 663f - .\n" /* new instruction */ \ " .hword " __stringify(cpucap) "\n" /* cpucap */ \ " .byte 662b-661b\n" /* source len */ \ " .byte 664f-663f\n" /* replacement len */ #define ALTINSTR_ENTRY_CB(cpucap, cb) \ " .word 661b - .\n" /* label */ \ " .word " __stringify(cb) "- .\n" /* callback */ \ " .hword " __stringify(cpucap) "\n" /* cpucap */ \ " .byte 662b-661b\n" /* source len */ \ " .byte 664f-663f\n" /* replacement len */ /* * alternative assembly primitive: * * If any of these .org directive fail, it means that insn1 and insn2 * don't have the same length. This used to be written as * * .if ((664b-663b) != (662b-661b)) * .error "Alternatives instruction length mismatch" * .endif * * but most assemblers die if insn1 or insn2 have a .inst. This should * be fixed in a binutils release posterior to 2.25.51.0.2 (anything * containing commit 4e4d08cf7399b606 or c1baaddf8861). * * Alternatives with callbacks do not generate replacement instructions. */ #define __ALTERNATIVE_CFG(oldinstr, newinstr, cpucap, cfg_enabled) \ ".if "__stringify(cfg_enabled)" == 1\n" \ "661:\n\t" \ oldinstr "\n" \ "662:\n" \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(cpucap) \ ".popsection\n" \ ".subsection 1\n" \ "663:\n\t" \ newinstr "\n" \ "664:\n\t" \ ".org . - (664b-663b) + (662b-661b)\n\t" \ ".org . - (662b-661b) + (664b-663b)\n\t" \ ".previous\n" \ ".endif\n" #define __ALTERNATIVE_CFG_CB(oldinstr, cpucap, cfg_enabled, cb) \ ".if "__stringify(cfg_enabled)" == 1\n" \ "661:\n\t" \ oldinstr "\n" \ "662:\n" \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY_CB(cpucap, cb) \ ".popsection\n" \ "663:\n\t" \ "664:\n\t" \ ".endif\n" #define _ALTERNATIVE_CFG(oldinstr, newinstr, cpucap, cfg, ...) \ __ALTERNATIVE_CFG(oldinstr, newinstr, cpucap, IS_ENABLED(cfg)) #define ALTERNATIVE_CB(oldinstr, cpucap, cb) \ __ALTERNATIVE_CFG_CB(oldinstr, (1 << ARM64_CB_SHIFT) | (cpucap), 1, cb) #else #include <asm/assembler.h> .macro altinstruction_entry orig_offset alt_offset cpucap orig_len alt_len .word \orig_offset - . .word \alt_offset - . .hword (\cpucap) .byte \orig_len .byte \alt_len .endm .macro alternative_insn insn1, insn2, cap, enable = 1 .if \enable 661: \insn1 662: .pushsection .altinstructions, "a" altinstruction_entry 661b, 663f, \cap, 662b-661b, 664f-663f .popsection .subsection 1 663: \insn2 664: .org . - (664b-663b) + (662b-661b) .org . - (662b-661b) + (664b-663b) .previous .endif .endm /* * Alternative sequences * * The code for the case where the capability is not present will be * assembled and linked as normal. There are no restrictions on this * code. * * The code for the case where the capability is present will be * assembled into a special section to be used for dynamic patching. * Code for that case must: * * 1. Be exactly the same length (in bytes) as the default code * sequence. * * 2. Not contain a branch target that is used outside of the * alternative sequence it is defined in (branches into an * alternative sequence are not fixed up). */ /* * Begin an alternative code sequence. */ .macro alternative_if_not cap .set .Lasm_alt_mode, 0 .pushsection .altinstructions, "a" altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f .popsection 661: .endm .macro alternative_if cap .set .Lasm_alt_mode, 1 .pushsection .altinstructions, "a" altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f .popsection .subsection 1 .align 2 /* So GAS knows label 661 is suitably aligned */ 661: .endm .macro alternative_cb cap, cb .set .Lasm_alt_mode, 0 .pushsection .altinstructions, "a" altinstruction_entry 661f, \cb, (1 << ARM64_CB_SHIFT) | \cap, 662f-661f, 0 .popsection 661: .endm /* * Provide the other half of the alternative code sequence. */ .macro alternative_else 662: .if .Lasm_alt_mode==0 .subsection 1 .else .previous .endif 663: .endm /* * Complete an alternative code sequence. */ .macro alternative_endif 664: .org . - (664b-663b) + (662b-661b) .org . - (662b-661b) + (664b-663b) .if .Lasm_alt_mode==0 .previous .endif .endm /* * Callback-based alternative epilogue */ .macro alternative_cb_end 662: .endm /* * Provides a trivial alternative or default sequence consisting solely * of NOPs. The number of NOPs is chosen automatically to match the * previous case. */ .macro alternative_else_nop_endif alternative_else nops (662b-661b) / AARCH64_INSN_SIZE alternative_endif .endm #define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) #endif /* __ASSEMBLY__ */ /* * Usage: asm(ALTERNATIVE(oldinstr, newinstr, cpucap)); * * Usage: asm(ALTERNATIVE(oldinstr, newinstr, cpucap, CONFIG_FOO)); * N.B. If CONFIG_FOO is specified, but not selected, the whole block * will be omitted, including oldinstr. */ #define ALTERNATIVE(oldinstr, newinstr, ...) \ _ALTERNATIVE_CFG(oldinstr, newinstr, __VA_ARGS__, 1) #ifndef __ASSEMBLY__ #include <linux/types.h> static __always_inline bool alternative_has_cap_likely(const unsigned long cpucap) { if (!cpucap_is_possible(cpucap)) return false; asm goto( #ifdef BUILD_VDSO ALTERNATIVE("b %l[l_no]", "nop", %[cpucap]) #else ALTERNATIVE_CB("b %l[l_no]", %[cpucap], alt_cb_patch_nops) #endif : : [cpucap] "i" (cpucap) : : l_no); return true; l_no: return false; } static __always_inline bool alternative_has_cap_unlikely(const unsigned long cpucap) { if (!cpucap_is_possible(cpucap)) return false; asm goto( ALTERNATIVE("nop", "b %l[l_yes]", %[cpucap]) : : [cpucap] "i" (cpucap) : : l_yes); return false; l_yes: return true; } #endif /* __ASSEMBLY__ */ #endif /* __ASM_ALTERNATIVE_MACROS_H */
6 6 6 6 184 181 184 182 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 // SPDX-License-Identifier: GPL-2.0 /* * Implementation of the multi-level security (MLS) policy. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * Support for enhanced MLS infrastructure. * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. * * Updated: Hewlett-Packard <paul@paul-moore.com> * Added support to import/export the MLS label from NetLabel * Copyright (C) Hewlett-Packard Development Company, L.P., 2006 */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/errno.h> #include <net/netlabel.h> #include "sidtab.h" #include "mls.h" #include "policydb.h" #include "services.h" /* * Return the length in bytes for the MLS fields of the * security context string representation of `context'. */ int mls_compute_context_len(struct policydb *p, struct context *context) { int i, l, len, head, prev; char *nm; struct ebitmap *e; struct ebitmap_node *node; if (!p->mls_enabled) return 0; len = 1; /* for the beginning ":" */ for (l = 0; l < 2; l++) { u32 index_sens = context->range.level[l].sens; len += strlen(sym_name(p, SYM_LEVELS, index_sens - 1)); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if (i - prev > 1) { /* one or more negative bits are skipped */ if (head != prev) { nm = sym_name(p, SYM_CATS, prev); len += strlen(nm) + 1; } nm = sym_name(p, SYM_CATS, i); len += strlen(nm) + 1; head = i; } prev = i; } if (prev != head) { nm = sym_name(p, SYM_CATS, prev); len += strlen(nm) + 1; } if (l == 0) { if (mls_level_eq(&context->range.level[0], &context->range.level[1])) break; else len++; } } return len; } /* * Write the security context string representation of * the MLS fields of `context' into the string `*scontext'. * Update `*scontext' to point to the end of the MLS fields. */ void mls_sid_to_context(struct policydb *p, struct context *context, char **scontext) { char *scontextp, *nm; int i, l, head, prev; struct ebitmap *e; struct ebitmap_node *node; if (!p->mls_enabled) return; scontextp = *scontext; *scontextp = ':'; scontextp++; for (l = 0; l < 2; l++) { strcpy(scontextp, sym_name(p, SYM_LEVELS, context->range.level[l].sens - 1)); scontextp += strlen(scontextp); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if (i - prev > 1) { /* one or more negative bits are skipped */ if (prev != head) { if (prev - head > 1) *scontextp++ = '.'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, prev); strcpy(scontextp, nm); scontextp += strlen(nm); } if (prev < 0) *scontextp++ = ':'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, i); strcpy(scontextp, nm); scontextp += strlen(nm); head = i; } prev = i; } if (prev != head) { if (prev - head > 1) *scontextp++ = '.'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, prev); strcpy(scontextp, nm); scontextp += strlen(nm); } if (l == 0) { if (mls_level_eq(&context->range.level[0], &context->range.level[1])) break; else *scontextp++ = '-'; } } *scontext = scontextp; } int mls_level_isvalid(struct policydb *p, struct mls_level *l) { struct level_datum *levdatum; if (!l->sens || l->sens > p->p_levels.nprim) return 0; levdatum = symtab_search(&p->p_levels, sym_name(p, SYM_LEVELS, l->sens - 1)); if (!levdatum) return 0; /* * Return 1 iff all the bits set in l->cat are also be set in * levdatum->level->cat and no bit in l->cat is larger than * p->p_cats.nprim. */ return ebitmap_contains(&levdatum->level.cat, &l->cat, p->p_cats.nprim); } int mls_range_isvalid(struct policydb *p, struct mls_range *r) { return (mls_level_isvalid(p, &r->level[0]) && mls_level_isvalid(p, &r->level[1]) && mls_level_dom(&r->level[1], &r->level[0])); } /* * Return 1 if the MLS fields in the security context * structure `c' are valid. Return 0 otherwise. */ int mls_context_isvalid(struct policydb *p, struct context *c) { struct user_datum *usrdatum; if (!p->mls_enabled) return 1; if (!mls_range_isvalid(p, &c->range)) return 0; if (c->role == OBJECT_R_VAL) return 1; /* * User must be authorized for the MLS range. */ if (!c->user || c->user > p->p_users.nprim) return 0; usrdatum = p->user_val_to_struct[c->user - 1]; if (!mls_range_contains(usrdatum->range, c->range)) return 0; /* user may not be associated with range */ return 1; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `scontext'. * * This function modifies the string in place, inserting * NULL characters to terminate the MLS fields. * * If a def_sid is provided and no MLS field is present, * copy the MLS field of the associated default context. * Used for upgraded to MLS systems where objects may lack * MLS fields. * * Policy read-lock must be held for sidtab lookup. * */ int mls_context_to_sid(struct policydb *pol, char oldc, char *scontext, struct context *context, struct sidtab *s, u32 def_sid) { char *sensitivity, *cur_cat, *next_cat, *rngptr; struct level_datum *levdatum; struct cat_datum *catdatum, *rngdatum; u32 i; int l, rc; char *rangep[2]; if (!pol->mls_enabled) { /* * With no MLS, only return -EINVAL if there is a MLS field * and it did not come from an xattr. */ if (oldc && def_sid == SECSID_NULL) return -EINVAL; return 0; } /* * No MLS component to the security context, try and map to * default if provided. */ if (!oldc) { struct context *defcon; if (def_sid == SECSID_NULL) return -EINVAL; defcon = sidtab_search(s, def_sid); if (!defcon) return -EINVAL; return mls_context_cpy(context, defcon); } /* * If we're dealing with a range, figure out where the two parts * of the range begin. */ rangep[0] = scontext; rangep[1] = strchr(scontext, '-'); if (rangep[1]) { rangep[1][0] = '\0'; rangep[1]++; } /* For each part of the range: */ for (l = 0; l < 2; l++) { /* Split sensitivity and category set. */ sensitivity = rangep[l]; if (sensitivity == NULL) break; next_cat = strchr(sensitivity, ':'); if (next_cat) *(next_cat++) = '\0'; /* Parse sensitivity. */ levdatum = symtab_search(&pol->p_levels, sensitivity); if (!levdatum) return -EINVAL; context->range.level[l].sens = levdatum->level.sens; /* Extract category set. */ while (next_cat != NULL) { cur_cat = next_cat; next_cat = strchr(next_cat, ','); if (next_cat != NULL) *(next_cat++) = '\0'; /* Separate into range if exists */ rngptr = strchr(cur_cat, '.'); if (rngptr != NULL) { /* Remove '.' */ *rngptr++ = '\0'; } catdatum = symtab_search(&pol->p_cats, cur_cat); if (!catdatum) return -EINVAL; rc = ebitmap_set_bit(&context->range.level[l].cat, catdatum->value - 1, 1); if (rc) return rc; /* If range, set all categories in range */ if (rngptr == NULL) continue; rngdatum = symtab_search(&pol->p_cats, rngptr); if (!rngdatum) return -EINVAL; if (catdatum->value >= rngdatum->value) return -EINVAL; for (i = catdatum->value; i < rngdatum->value; i++) { rc = ebitmap_set_bit( &context->range.level[l].cat, i, 1); if (rc) return rc; } } } /* If we didn't see a '-', the range start is also the range end. */ if (rangep[1] == NULL) { context->range.level[1].sens = context->range.level[0].sens; rc = ebitmap_cpy(&context->range.level[1].cat, &context->range.level[0].cat); if (rc) return rc; } return 0; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `str'. This function will allocate temporary memory with the * given constraints of gfp_mask. */ int mls_from_string(struct policydb *p, char *str, struct context *context, gfp_t gfp_mask) { char *tmpstr; int rc; if (!p->mls_enabled) return -EINVAL; tmpstr = kstrdup(str, gfp_mask); if (!tmpstr) { rc = -ENOMEM; } else { rc = mls_context_to_sid(p, ':', tmpstr, context, NULL, SECSID_NULL); kfree(tmpstr); } return rc; } /* * Copies the MLS range `range' into `context'. */ int mls_range_set(struct context *context, struct mls_range *range) { int l, rc = 0; /* Copy the MLS range into the context */ for (l = 0; l < 2; l++) { context->range.level[l].sens = range->level[l].sens; rc = ebitmap_cpy(&context->range.level[l].cat, &range->level[l].cat); if (rc) break; } return rc; } int mls_setup_user_range(struct policydb *p, struct context *fromcon, struct user_datum *user, struct context *usercon) { if (p->mls_enabled) { struct mls_level *fromcon_sen = &(fromcon->range.level[0]); struct mls_level *fromcon_clr = &(fromcon->range.level[1]); struct mls_level *user_low = &(user->range.level[0]); struct mls_level *user_clr = &(user->range.level[1]); struct mls_level *user_def = &(user->dfltlevel); struct mls_level *usercon_sen = &(usercon->range.level[0]); struct mls_level *usercon_clr = &(usercon->range.level[1]); /* Honor the user's default level if we can */ if (mls_level_between(user_def, fromcon_sen, fromcon_clr)) *usercon_sen = *user_def; else if (mls_level_between(fromcon_sen, user_def, user_clr)) *usercon_sen = *fromcon_sen; else if (mls_level_between(fromcon_clr, user_low, user_def)) *usercon_sen = *user_low; else return -EINVAL; /* Lower the clearance of available contexts if the clearance of "fromcon" is lower than that of the user's default clearance (but only if the "fromcon" clearance dominates the user's computed sensitivity level) */ if (mls_level_dom(user_clr, fromcon_clr)) *usercon_clr = *fromcon_clr; else if (mls_level_dom(fromcon_clr, user_clr)) *usercon_clr = *user_clr; else return -EINVAL; } return 0; } /* * Convert the MLS fields in the security context * structure `oldc' from the values specified in the * policy `oldp' to the values specified in the policy `newp', * storing the resulting context in `newc'. */ int mls_convert_context(struct policydb *oldp, struct policydb *newp, struct context *oldc, struct context *newc) { struct level_datum *levdatum; struct cat_datum *catdatum; struct ebitmap_node *node; u32 i; int l; if (!oldp->mls_enabled || !newp->mls_enabled) return 0; for (l = 0; l < 2; l++) { char *name = sym_name(oldp, SYM_LEVELS, oldc->range.level[l].sens - 1); levdatum = symtab_search(&newp->p_levels, name); if (!levdatum) return -EINVAL; newc->range.level[l].sens = levdatum->level.sens; ebitmap_for_each_positive_bit(&oldc->range.level[l].cat, node, i) { int rc; catdatum = symtab_search(&newp->p_cats, sym_name(oldp, SYM_CATS, i)); if (!catdatum) return -EINVAL; rc = ebitmap_set_bit(&newc->range.level[l].cat, catdatum->value - 1, 1); if (rc) return rc; } } return 0; } int mls_compute_sid(struct policydb *p, struct context *scontext, struct context *tcontext, u16 tclass, u32 specified, struct context *newcontext, bool sock) { struct range_trans rtr; struct mls_range *r; struct class_datum *cladatum; char default_range = 0; if (!p->mls_enabled) return 0; switch (specified) { case AVTAB_TRANSITION: /* Look for a range transition rule. */ rtr.source_type = scontext->type; rtr.target_type = tcontext->type; rtr.target_class = tclass; r = policydb_rangetr_search(p, &rtr); if (r) return mls_range_set(newcontext, r); if (tclass && tclass <= p->p_classes.nprim) { cladatum = p->class_val_to_struct[tclass - 1]; if (cladatum) default_range = cladatum->default_range; } switch (default_range) { case DEFAULT_SOURCE_LOW: return mls_context_cpy_low(newcontext, scontext); case DEFAULT_SOURCE_HIGH: return mls_context_cpy_high(newcontext, scontext); case DEFAULT_SOURCE_LOW_HIGH: return mls_context_cpy(newcontext, scontext); case DEFAULT_TARGET_LOW: return mls_context_cpy_low(newcontext, tcontext); case DEFAULT_TARGET_HIGH: return mls_context_cpy_high(newcontext, tcontext); case DEFAULT_TARGET_LOW_HIGH: return mls_context_cpy(newcontext, tcontext); case DEFAULT_GLBLUB: return mls_context_glblub(newcontext, scontext, tcontext); } fallthrough; case AVTAB_CHANGE: if ((tclass == p->process_class) || sock) /* Use the process MLS attributes. */ return mls_context_cpy(newcontext, scontext); else /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); case AVTAB_MEMBER: /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); } return -EINVAL; } #ifdef CONFIG_NETLABEL /** * mls_export_netlbl_lvl - Export the MLS sensitivity levels to NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context copy the low MLS sensitivity level into the * NetLabel MLS sensitivity level field. * */ void mls_export_netlbl_lvl(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { if (!p->mls_enabled) return; secattr->attr.mls.lvl = context->range.level[0].sens - 1; secattr->flags |= NETLBL_SECATTR_MLS_LVL; } /** * mls_import_netlbl_lvl - Import the NetLabel MLS sensitivity levels * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context and the NetLabel security attributes, copy the * NetLabel MLS sensitivity level into the context. * */ void mls_import_netlbl_lvl(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { if (!p->mls_enabled) return; context->range.level[0].sens = secattr->attr.mls.lvl + 1; context->range.level[1].sens = context->range.level[0].sens; } /** * mls_export_netlbl_cat - Export the MLS categories to NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context copy the low MLS categories into the NetLabel * MLS category field. Returns zero on success, negative values on failure. * */ int mls_export_netlbl_cat(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { int rc; if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_export(&context->range.level[0].cat, &secattr->attr.mls.cat); if (rc == 0 && secattr->attr.mls.cat != NULL) secattr->flags |= NETLBL_SECATTR_MLS_CAT; return rc; } /** * mls_import_netlbl_cat - Import the MLS categories from NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Copy the NetLabel security attributes into the SELinux context; since the * NetLabel security attribute only contains a single MLS category use it for * both the low and high categories of the context. Returns zero on success, * negative values on failure. * */ int mls_import_netlbl_cat(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { int rc; if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_import(&context->range.level[0].cat, secattr->attr.mls.cat); if (rc) goto import_netlbl_cat_failure; memcpy(&context->range.level[1].cat, &context->range.level[0].cat, sizeof(context->range.level[0].cat)); return 0; import_netlbl_cat_failure: ebitmap_destroy(&context->range.level[0].cat); return rc; } #endif /* CONFIG_NETLABEL */
1 1 1 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/netdevice.h> #include <net/netdev_lock.h> #include "dev.h" /** * dev_change_name() - change name of a device * @dev: device * @newname: name (or format string) must be at least IFNAMSIZ * * Change name of a device, can pass format strings "eth%d". * for wildcarding. * * Return: 0 on success, -errno on failure. */ int dev_change_name(struct net_device *dev, const char *newname) { int ret; netdev_lock_ops(dev); ret = netif_change_name(dev, newname); netdev_unlock_ops(dev); return ret; } /** * dev_set_alias() - change ifalias of a device * @dev: device * @alias: name up to IFALIASZ * @len: limit of bytes to copy from info * * Set ifalias for a device. * * Return: 0 on success, -errno on failure. */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { int ret; netdev_lock_ops(dev); ret = netif_set_alias(dev, alias, len); netdev_unlock_ops(dev); return ret; } EXPORT_SYMBOL(dev_set_alias); /** * dev_change_flags() - change device settings * @dev: device * @flags: device state flags * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. * * Return: 0 on success, -errno on failure. */ int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { int ret; netdev_lock_ops(dev); ret = netif_change_flags(dev, flags, extack); netdev_unlock_ops(dev); return ret; } EXPORT_SYMBOL(dev_change_flags); /** * dev_set_group() - change group this device belongs to * @dev: device * @new_group: group this device should belong to */ void dev_set_group(struct net_device *dev, int new_group) { netdev_lock_ops(dev); netif_set_group(dev, new_group); netdev_unlock_ops(dev); } int dev_set_mac_address_user(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack) { int ret; down_write(&dev_addr_sem); netdev_lock_ops(dev); ret = netif_set_mac_address(dev, ss, extack); netdev_unlock_ops(dev); up_write(&dev_addr_sem); return ret; } EXPORT_SYMBOL(dev_set_mac_address_user); /** * dev_change_net_namespace() - move device to different nethost namespace * @dev: device * @net: network namespace * @pat: If not NULL name pattern to try if the current device name * is already taken in the destination network namespace. * * This function shuts down a device interface and moves it * to a new network namespace. On success 0 is returned, on * a failure a netagive errno code is returned. * * Callers must hold the rtnl semaphore. * * Return: 0 on success, -errno on failure. */ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { return __dev_change_net_namespace(dev, net, pat, 0, NULL); } EXPORT_SYMBOL_GPL(dev_change_net_namespace); /** * dev_change_carrier() - change device carrier * @dev: device * @new_carrier: new value * * Change device carrier * * Return: 0 on success, -errno on failure. */ int dev_change_carrier(struct net_device *dev, bool new_carrier) { int ret; netdev_lock_ops(dev); ret = netif_change_carrier(dev, new_carrier); netdev_unlock_ops(dev); return ret; } /** * dev_change_tx_queue_len() - change TX queue length of a netdevice * @dev: device * @new_len: new tx queue length * * Return: 0 on success, -errno on failure. */ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) { int ret; netdev_lock_ops(dev); ret = netif_change_tx_queue_len(dev, new_len); netdev_unlock_ops(dev); return ret; } /** * dev_change_proto_down() - set carrier according to proto_down * @dev: device * @proto_down: new value * * Return: 0 on success, -errno on failure. */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { int ret; netdev_lock_ops(dev); ret = netif_change_proto_down(dev, proto_down); netdev_unlock_ops(dev); return ret; } /** * dev_open() - prepare an interface for use * @dev: device to open * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally * the device is moved into the up state and a %NETDEV_UP message is * sent to the netdev notifier chain. * * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. * * Return: 0 on success, -errno on failure. */ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; netdev_lock_ops(dev); ret = netif_open(dev, extack); netdev_unlock_ops(dev); return ret; } EXPORT_SYMBOL(dev_open); /** * dev_close() - shutdown an interface * @dev: device to shutdown * * This function moves an active device into down state. A * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier * chain. */ void dev_close(struct net_device *dev) { netdev_lock_ops(dev); netif_close(dev); netdev_unlock_ops(dev); } EXPORT_SYMBOL(dev_close); int dev_eth_ioctl(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; int ret = -ENODEV; if (!ops->ndo_eth_ioctl) return -EOPNOTSUPP; netdev_lock_ops(dev); if (netif_device_present(dev)) ret = ops->ndo_eth_ioctl(dev, ifr, cmd); netdev_unlock_ops(dev); return ret; } EXPORT_SYMBOL(dev_eth_ioctl); int dev_set_mtu(struct net_device *dev, int new_mtu) { int ret; netdev_lock_ops(dev); ret = netif_set_mtu(dev, new_mtu); netdev_unlock_ops(dev); return ret; } EXPORT_SYMBOL(dev_set_mtu); /** * dev_disable_lro() - disable Large Receive Offload on a device * @dev: device * * Disable Large Receive Offload (LRO) on a net device. Must be * called under RTNL. This is needed if received packets may be * forwarded to another interface. */ void dev_disable_lro(struct net_device *dev) { netdev_lock_ops(dev); netif_disable_lro(dev); netdev_unlock_ops(dev); } EXPORT_SYMBOL(dev_disable_lro); /** * dev_set_promiscuity() - update promiscuity count on a device * @dev: device * @inc: modifier * * Add or remove promiscuity from a device. While the count in the device * remains above zero the interface remains promiscuous. Once it hits zero * the device reverts back to normal filtering operation. A negative inc * value is used to drop promiscuity on the device. * Return 0 if successful or a negative errno code on error. */ int dev_set_promiscuity(struct net_device *dev, int inc) { int ret; netdev_lock_ops(dev); ret = netif_set_promiscuity(dev, inc); netdev_unlock_ops(dev); return ret; } EXPORT_SYMBOL(dev_set_promiscuity); /** * dev_set_allmulti() - update allmulti count on a device * @dev: device * @inc: modifier * * Add or remove reception of all multicast frames to a device. While the * count in the device remains above zero the interface remains listening * to all interfaces. Once it hits zero the device reverts back to normal * filtering operation. A negative @inc value is used to drop the counter * when releasing a resource needing all multicasts. * * Return: 0 on success, -errno on failure. */ int dev_set_allmulti(struct net_device *dev, int inc) { int ret; netdev_lock_ops(dev); ret = netif_set_allmulti(dev, inc, true); netdev_unlock_ops(dev); return ret; } EXPORT_SYMBOL(dev_set_allmulti); /** * dev_set_mac_address() - change Media Access Control Address * @dev: device * @ss: new address * @extack: netlink extended ack * * Change the hardware (MAC) address of the device * * Return: 0 on success, -errno on failure. */ int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack) { int ret; netdev_lock_ops(dev); ret = netif_set_mac_address(dev, ss, extack); netdev_unlock_ops(dev); return ret; } EXPORT_SYMBOL(dev_set_mac_address); int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) { int ret; netdev_lock_ops(dev); ret = netif_xdp_propagate(dev, bpf); netdev_unlock_ops(dev); return ret; } EXPORT_SYMBOL_GPL(dev_xdp_propagate); /** * netdev_state_change() - device changes state * @dev: device to cause notification * * Called to indicate a device has changed state. This function calls * the notifier chains for netdev_chain and sends a NEWLINK message * to the routing socket. */ void netdev_state_change(struct net_device *dev) { netdev_lock_ops(dev); netif_state_change(dev); netdev_unlock_ops(dev); } EXPORT_SYMBOL(netdev_state_change);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMENS_H #define _LINUX_TIMENS_H #include <linux/sched.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/err.h> #include <linux/time64.h> struct user_namespace; extern struct user_namespace init_user_ns; struct seq_file; struct vm_area_struct; struct timens_offsets { struct timespec64 monotonic; struct timespec64 boottime; }; struct time_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; struct timens_offsets offsets; struct page *vvar_page; /* If set prevents changing offsets after any task joined namespace. */ bool frozen_offsets; } __randomize_layout; extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { refcount_inc(&ns->ns.count); return ns; } struct time_namespace *copy_time_ns(unsigned long flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct time_namespace *ns); void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); struct page *find_timens_vvar_page(struct vm_area_struct *vma); static inline void put_time_ns(struct time_namespace *ns) { if (refcount_dec_and_test(&ns->ns.count)) free_time_ns(ns); } void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m); struct proc_timens_offset { int clockid; struct timespec64 val; }; int proc_timens_set_offset(struct file *file, struct task_struct *p, struct proc_timens_offset *offsets, int n); static inline void timens_add_monotonic(struct timespec64 *ts) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; *ts = timespec64_add(*ts, ns_offsets->monotonic); } static inline void timens_add_boottime(struct timespec64 *ts) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; *ts = timespec64_add(*ts, ns_offsets->boottime); } static inline u64 timens_add_boottime_ns(u64 nsec) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; return nsec + timespec64_to_ns(&ns_offsets->boottime); } static inline void timens_sub_boottime(struct timespec64 *ts) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; *ts = timespec64_sub(*ts, ns_offsets->boottime); } ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *offsets); static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) { struct time_namespace *ns = current->nsproxy->time_ns; if (likely(ns == &init_time_ns)) return tim; return do_timens_ktime_to_host(clockid, tim, &ns->offsets); } #else static inline int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { return 0; } static inline void timens_commit(struct task_struct *tsk, struct time_namespace *ns) { } static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { return NULL; } static inline void put_time_ns(struct time_namespace *ns) { } static inline struct time_namespace *copy_time_ns(unsigned long flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (flags & CLONE_NEWTIME) return ERR_PTR(-EINVAL); return old_ns; } static inline void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { return; } static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) { return NULL; } static inline void timens_add_monotonic(struct timespec64 *ts) { } static inline void timens_add_boottime(struct timespec64 *ts) { } static inline u64 timens_add_boottime_ns(u64 nsec) { return nsec; } static inline void timens_sub_boottime(struct timespec64 *ts) { } static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) { return tim; } #endif #endif /* _LINUX_TIMENS_H */
309 307 311 311 139 139 307 309 309 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* include/asm-generic/tlb.h * * Generic TLB shootdown code * * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. * * Copyright 2011 Red Hat, Inc., Peter Zijlstra */ #ifndef _ASM_GENERIC__TLB_H #define _ASM_GENERIC__TLB_H #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/hugetlb_inline.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> /* * Blindly accessing user memory from NMI context can be dangerous * if we're in the middle of switching the current user task or switching * the loaded mm. */ #ifndef nmi_uaccess_okay # define nmi_uaccess_okay() true #endif #ifdef CONFIG_MMU /* * Generic MMU-gather implementation. * * The mmu_gather data structure is used by the mm code to implement the * correct and efficient ordering of freeing pages and TLB invalidations. * * This correct ordering is: * * 1) unhook page * 2) TLB invalidate page * 3) free page * * That is, we must never free a page before we have ensured there are no live * translations left to it. Otherwise it might be possible to observe (or * worse, change) the page content after it has been reused. * * The mmu_gather API consists of: * * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu() * * start and finish a mmu_gather * * Finish in particular will issue a (final) TLB invalidate and free * all (remaining) queued pages. * * - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA * * Defaults to flushing at tlb_end_vma() to reset the range; helps when * there's large holes between the VMAs. * * - tlb_free_vmas() * * tlb_free_vmas() marks the start of unlinking of one or more vmas * and freeing page-tables. * * - tlb_remove_table() * * tlb_remove_table() is the basic primitive to free page-table directories * (__p*_free_tlb()). In it's most primitive form it is an alias for * tlb_remove_page() below, for when page directories are pages and have no * additional constraints. * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * * - tlb_remove_page() / tlb_remove_page_size() * - __tlb_remove_folio_pages() / __tlb_remove_page_size() * - __tlb_remove_folio_pages_size() * * __tlb_remove_folio_pages_size() is the basic primitive that queues pages * for freeing. It will return a boolean indicating if the queue is (now) * full and a call to tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * * __tlb_remove_folio_pages() is similar to __tlb_remove_page_size(), * however, instead of removing a single page, assume PAGE_SIZE and remove * the given number of consecutive pages that are all part of the * same (large) folio. * * - tlb_change_page_size() * * call before __tlb_remove_page*() to set the current page-size; implies a * possible tlb_flush_mmu() call. * * - tlb_flush_mmu() / tlb_flush_mmu_tlbonly() * * tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets * related state, like the range) * * tlb_flush_mmu() - in addition to the above TLB invalidate, also frees * whatever pages are still batched. * * - mmu_gather::fullmm * * A flag set by tlb_gather_mmu_fullmm() to indicate we're going to free * the entire mm; this allows a number of optimizations. * * - We can ignore tlb_{start,end}_vma(); because we don't * care about ranges. Everything will be shot down. * * - (RISC) architectures that use ASIDs can cycle to a new ASID * and delay the invalidation until ASID space runs out. * * - mmu_gather::need_flush_all * * A flag that can be set by the arch code if it wants to force * flush the entire TLB irrespective of the range. For instance * x86-PAE needs this when changing top-level entries. * * And allows the architecture to provide and implement tlb_flush(): * * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make * use of: * * - mmu_gather::start / mmu_gather::end * * which provides the range that needs to be flushed to cover the pages to * be freed. * * - mmu_gather::freed_tables * * set when we freed page table pages * * - tlb_get_unmap_shift() / tlb_get_unmap_size() * * returns the smallest TLB entry size unmapped in this range. * * If an architecture does not provide tlb_flush() a default implementation * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is * specified, in which case we'll default to flush_tlb_mm(). * * Additionally there are a few opt-in features: * * MMU_GATHER_PAGE_SIZE * * This ensures we call tlb_flush() every time tlb_change_page_size() actually * changes the size and provides mmu_gather::page_size to tlb_flush(). * * This might be useful if your architecture has size specific TLB * invalidation instructions. * * MMU_GATHER_TABLE_FREE * * This provides tlb_remove_table(), to be used instead of tlb_remove_page() * for page directores (__p*_free_tlb()). * * Useful if your architecture has non-page page directories. * * When used, an architecture is expected to provide __tlb_remove_table() or * use the generic __tlb_remove_table(), which does the actual freeing of these * pages. * * MMU_GATHER_RCU_TABLE_FREE * * Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see * comment below). * * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * * MMU_GATHER_NO_FLUSH_CACHE * * Indicates the architecture has flush_cache_range() but it needs *NOT* be called * before unmapping a VMA. * * NOTE: strictly speaking we shouldn't have this knob and instead rely on * flush_cache_range() being a NOP, except Sparc64 seems to be * different here. * * MMU_GATHER_MERGE_VMAS * * Indicates the architecture wants to merge ranges over VMAs; typical when * multiple range invalidates are more expensive than a full invalidate. * * MMU_GATHER_NO_RANGE * * Use this if your architecture lacks an efficient flush_tlb_range(). This * option implies MMU_GATHER_MERGE_VMAS above. * * MMU_GATHER_NO_GATHER * * If the option is set the mmu_gather will not track individual pages for * delayed page free anymore. A platform that enables the option needs to * provide its own implementation of the __tlb_remove_page_size() function to * free pages. * * This is useful if your architecture already flushes TLB entries in the * various ptep_get_and_clear() functions. */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch { #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE struct rcu_head rcu; #endif unsigned int nr; void *tables[]; }; #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) #ifndef __HAVE_ARCH_TLB_REMOVE_TABLE static inline void __tlb_remove_table(void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor_free(ptdesc); } #endif extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. */ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor(ptdesc); tlb_remove_page(tlb, ptdesc_page(ptdesc)); } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * This allows an architecture that does not use the linux page-tables for * hardware to skip the TLBI when freeing page tables. */ #ifndef tlb_needs_table_invalidate #define tlb_needs_table_invalidate() (true) #endif void tlb_remove_table_sync_one(void); #else #ifdef tlb_needs_table_invalidate #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE #endif static inline void tlb_remove_table_sync_one(void) { } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ #ifndef CONFIG_MMU_GATHER_NO_GATHER /* * If we can't allocate a page to make a big batch of page pointers * to work on, then just handle a few from the on-stack structure. */ #define MMU_GATHER_BUNDLE 8 struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; struct encoded_page *encoded_pages[]; }; #define MAX_GATHER_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *)) /* * Limit the maximum number of mmu_gather batches to reduce a risk of soft * lockups for non-preemptible kernels on huge machines when a lot of memory * is zapped during unmapping. * 10K pages freed at once should be safe even without a preemption point. */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); #ifdef CONFIG_SMP /* * This both sets 'delayed_rmap', and returns true. It would be an inline * function, except we define it before the 'struct mmu_gather'. */ #define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true) extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma); #endif #endif /* * We have a no-op version of the rmap removal that doesn't * delay anything. That is used on S390, which flushes remote * TLBs synchronously, and on UP, which doesn't have any * remote TLBs to flush and is not preemptible due to this * all happening under the page table lock. */ #ifndef tlb_delay_rmap #define tlb_delay_rmap(tlb) (false) static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #endif /* * struct mmu_gather is an opaque type used by the mm code for passing around * any data needed by arch specific code for tlb_remove_page. */ struct mmu_gather { struct mm_struct *mm; #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch *batch; #endif unsigned long start; unsigned long end; /* * we are in the middle of an operation to clear * a full mm and can make some optimizations */ unsigned int fullmm : 1; /* * we have performed an operation which * requires a complete flush of the tlb */ unsigned int need_flush_all : 1; /* * we have removed page directories */ unsigned int freed_tables : 1; /* * Do we have pending delayed rmap removals? */ unsigned int delayed_rmap : 1; /* * at which levels have we cleared entries? */ unsigned int cleared_ptes : 1; unsigned int cleared_pmds : 1; unsigned int cleared_puds : 1; unsigned int cleared_p4ds : 1; /* * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma */ unsigned int vma_exec : 1; unsigned int vma_huge : 1; unsigned int vma_pfn : 1; unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER struct mmu_gather_batch *active; struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; #ifdef CONFIG_MMU_GATHER_PAGE_SIZE unsigned int page_size; #endif #endif }; void tlb_flush_mmu(struct mmu_gather *tlb); static inline void __tlb_adjust_range(struct mmu_gather *tlb, unsigned long address, unsigned int range_size) { tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + range_size); } static inline void __tlb_reset_range(struct mmu_gather *tlb) { if (tlb->fullmm) { tlb->start = tlb->end = ~0; } else { tlb->start = TASK_SIZE; tlb->end = 0; } tlb->freed_tables = 0; tlb->cleared_ptes = 0; tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an * intermediate flush. */ } #ifdef CONFIG_MMU_GATHER_NO_RANGE #if defined(tlb_flush) #error MMU_GATHER_NO_RANGE relies on default tlb_flush() #endif /* * When an architecture does not have efficient means of range flushing TLBs * there is no point in doing intermediate flushes on tlb_end_vma() to keep the * range small. We equally don't have to worry about page granularity or other * things. * * All we need to do is issue a full flush for any !0 range. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->end) flush_tlb_mm(tlb->mm); } #else /* CONFIG_MMU_GATHER_NO_RANGE */ #ifndef tlb_flush /* * When an architecture does not provide its own tlb_flush() implementation * but does have a reasonably efficient flush_vma_range() implementation * use that. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->fullmm || tlb->need_flush_all) { flush_tlb_mm(tlb->mm); } else if (tlb->end) { struct vm_area_struct vma = { .vm_mm = tlb->mm, .vm_flags = (tlb->vma_exec ? VM_EXEC : 0) | (tlb->vma_huge ? VM_HUGETLB : 0), }; flush_tlb_range(&vma, tlb->start, tlb->end); } } #endif #endif /* CONFIG_MMU_GATHER_NO_RANGE */ static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { /* * flush_tlb_range() implementations that look at VM_HUGETLB (tile, * mips-4k) flush only large pages. * * flush_tlb_range() implementations that flush I-TLB also flush D-TLB * (tile, xtensa, arm), so it's ok to just add VM_EXEC to an existing * range. * * We rely on tlb_end_vma() to issue a flush, such that when we reset * these values the batch is empty. */ tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); /* * Track if there's at least one VM_PFNMAP/VM_MIXEDMAP vma * in the tracked range, see tlb_free_vmas(). */ tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)); } static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { /* * Anything calling __tlb_adjust_range() also sets at least one of * these bits. */ if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || tlb->cleared_puds || tlb->cleared_p4ds)) return; tlb_flush(tlb); __tlb_reset_range(tlb); } static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { if (__tlb_remove_page_size(tlb, page, false, page_size)) tlb_flush_mmu(tlb); } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); } static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) { tlb_remove_table(tlb, pt); } static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { #ifdef CONFIG_MMU_GATHER_PAGE_SIZE if (tlb->page_size && tlb->page_size != page_size) { if (!tlb->fullmm && !tlb->need_flush_all) tlb_flush_mmu(tlb); } tlb->page_size = page_size; #endif } static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb) { if (tlb->cleared_ptes) return PAGE_SHIFT; if (tlb->cleared_pmds) return PMD_SHIFT; if (tlb->cleared_puds) return PUD_SHIFT; if (tlb->cleared_p4ds) return P4D_SHIFT; return PAGE_SHIFT; } static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb) { return 1UL << tlb_get_unmap_shift(tlb); } /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, * the vmas are adjusted to only cover the region to be torn down. */ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm) return; tlb_update_vma_flags(tlb, vma); #ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE flush_cache_range(vma, vma->vm_start, vma->vm_end); #endif } static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) return; /* * Do a TLB flush and reset the range at VMA boundaries; this avoids * the ranges growing with the unused space between consecutive VMAs, * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on * this. */ tlb_flush_mmu_tlbonly(tlb); } static inline void tlb_free_vmas(struct mmu_gather *tlb) { if (tlb->fullmm) return; /* * VM_PFNMAP is more fragile because the core mm will not track the * page mapcount -- there might not be page-frames for these PFNs * after all. * * Specifically() there is a race between munmap() and * unmap_mapping_range(), where munmap() will unlink the VMA, such * that unmap_mapping_range() will no longer observe the VMA and * no-op, without observing the TLBI, returning prematurely. * * So if we're about to unlink such a VMA, and we have pending * TLBI for such a vma, flush things now. */ if (tlb->vma_pfn) tlb_flush_mmu_tlbonly(tlb); } /* * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, * and set corresponding cleared_*. */ static inline void tlb_flush_pte_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_ptes = 1; } static inline void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_pmds = 1; } static inline void tlb_flush_pud_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_puds = 1; } static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { __tlb_adjust_range(tlb, address, size); tlb->cleared_p4ds = 1; } #ifndef __tlb_remove_tlb_entry static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) { } #endif /** * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. * * Record the fact that pte's were really unmapped by updating the range, * so we can later optimise away the tlb invalidate. This helps when * userspace is unmapping already-unmapped pages, which happens quite a lot. */ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ tlb_flush_pte_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for * later tlb invalidation. * * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple * consecutive ptes instead of only a single one. */ static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb, pte_t *ptep, unsigned int nr, unsigned long address) { tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr); for (;;) { __tlb_remove_tlb_entry(tlb, ptep, address); if (--nr == 0) break; ptep++; address += PAGE_SIZE; } } #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ if (_sz >= P4D_SIZE) \ tlb_flush_p4d_range(tlb, address, _sz); \ else if (_sz >= PUD_SIZE) \ tlb_flush_pud_range(tlb, address, _sz); \ else if (_sz >= PMD_SIZE) \ tlb_flush_pmd_range(tlb, address, _sz); \ else \ tlb_flush_pte_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) /** * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation * This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pmd_tlb_entry #define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) #endif #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ do { \ tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE); \ __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) /** * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb * invalidation. This is a nop so far, because only x86 needs it. */ #ifndef __tlb_remove_pud_tlb_entry #define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0) #endif #define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ do { \ tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE); \ __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ } while (0) /* * For things like page tables caches (ie caching addresses "inside" the * page tables, like x86 does), for legacy reasons, flushing an * individual page had better flush the page table caches behind it. This * is definitely how x86 works, for example. And if you have an * architected non-legacy page table cache (which I'm not aware of * anybody actually doing), you're going to have some architecturally * explicit flushing for that, likely *separate* from a regular TLB entry * flush, and thus you'd need more than just some range expansion.. * * So if we ever find an architecture * that would want something that odd, I think it is up to that * architecture to do its own odd thing, not cause pain for others * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com * * For now w.r.t page table cache, mark the range_size as PAGE_SIZE */ #ifndef pte_free_tlb #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb_flush_pmd_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #endif #ifndef pmd_free_tlb #define pmd_free_tlb(tlb, pmdp, address) \ do { \ tlb_flush_pud_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #endif #ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ tlb_flush_p4d_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef p4d_free_tlb #define p4d_free_tlb(tlb, pudp, address) \ do { \ __tlb_adjust_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ __p4d_free_tlb(tlb, pudp, address); \ } while (0) #endif #ifndef pte_needs_flush static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) { return true; } #endif #ifndef huge_pmd_needs_flush static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) { return true; } #endif #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */
4 245 55 75 835 258 714 94 63 65 18 33 93 245 127 296 68 57 25 25 4 76 76 5 5 5 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_XARRAY_H #define _LINUX_XARRAY_H /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation * Author: Matthew Wilcox <willy@infradead.org> * * See Documentation/core-api/xarray.rst for how to use the XArray. */ #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/err.h> #include <linux/gfp.h> #include <linux/kconfig.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/rcupdate.h> #include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/types.h> struct list_lru; /* * The bottom two bits of the entry determine how the XArray interprets * the contents: * * 00: Pointer entry * 10: Internal entry * x1: Value entry or tagged pointer * * Attempting to store internal entries in the XArray is a bug. * * Most internal entries are pointers to the next node in the tree. * The following internal entries have a special meaning: * * 0-62: Sibling entries * 256: Retry entry * 257: Zero entry * * Errors are also represented as internal entries, but use the negative * space (-4094 to -2). They're never stored in the slots array; only * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) /** * xa_mk_value() - Create an XArray entry from an integer. * @v: Value to store in XArray. * * Context: Any context. * Return: An entry suitable for storing in the XArray. */ static inline void *xa_mk_value(unsigned long v) { WARN_ON((long)v < 0); return (void *)((v << 1) | 1); } /** * xa_to_value() - Get value stored in an XArray entry. * @entry: XArray entry. * * Context: Any context. * Return: The value stored in the XArray entry. */ static inline unsigned long xa_to_value(const void *entry) { return (unsigned long)entry >> 1; } /** * xa_is_value() - Determine if an entry is a value. * @entry: XArray entry. * * Context: Any context. * Return: True if the entry is a value, false if it is a pointer. */ static inline bool xa_is_value(const void *entry) { return (unsigned long)entry & 1; } /** * xa_tag_pointer() - Create an XArray entry for a tagged pointer. * @p: Plain pointer. * @tag: Tag value (0, 1 or 3). * * If the user of the XArray prefers, they can tag their pointers instead * of storing value entries. Three tags are available (0, 1 and 3). * These are distinct from the xa_mark_t as they are not replicated up * through the array and cannot be searched for. * * Context: Any context. * Return: An XArray entry. */ static inline void *xa_tag_pointer(void *p, unsigned long tag) { return (void *)((unsigned long)p | tag); } /** * xa_untag_pointer() - Turn an XArray entry into a plain pointer. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the untagged version of the pointer. * * Context: Any context. * Return: A pointer. */ static inline void *xa_untag_pointer(void *entry) { return (void *)((unsigned long)entry & ~3UL); } /** * xa_pointer_tag() - Get the tag stored in an XArray entry. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the tag of that pointer. * * Context: Any context. * Return: A tag. */ static inline unsigned int xa_pointer_tag(void *entry) { return (unsigned long)entry & 3UL; } /* * xa_mk_internal() - Create an internal entry. * @v: Value to turn into an internal entry. * * Internal entries are used for a number of purposes. Entries 0-255 are * used for sibling entries (only 0-62 are used by the current code). 256 * is used for the retry entry. 257 is used for the reserved / zero entry. * Negative internal entries are used to represent errnos. Node pointers * are also tagged as internal entries in some situations. * * Context: Any context. * Return: An XArray internal entry corresponding to this value. */ static inline void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); } /* * xa_to_internal() - Extract the value from an internal entry. * @entry: XArray entry. * * Context: Any context. * Return: The value which was stored in the internal entry. */ static inline unsigned long xa_to_internal(const void *entry) { return (unsigned long)entry >> 2; } /* * xa_is_internal() - Is the entry an internal entry? * @entry: XArray entry. * * Context: Any context. * Return: %true if the entry is an internal entry. */ static inline bool xa_is_internal(const void *entry) { return ((unsigned long)entry & 3) == 2; } #define XA_ZERO_ENTRY xa_mk_internal(257) /** * xa_is_zero() - Is the entry a zero entry? * @entry: Entry retrieved from the XArray * * The normal API will return NULL as the contents of a slot containing * a zero entry. You can only see zero entries by using the advanced API. * * Return: %true if the entry is a zero entry. */ static inline bool xa_is_zero(const void *entry) { return unlikely(entry == XA_ZERO_ENTRY); } /** * xa_is_err() - Report whether an XArray operation returned an error * @entry: Result from calling an XArray function * * If an XArray operation cannot complete an operation, it will return * a special value indicating an error. This function tells you * whether an error occurred; xa_err() tells you which error occurred. * * Context: Any context. * Return: %true if the entry indicates an error. */ static inline bool xa_is_err(const void *entry) { return unlikely(xa_is_internal(entry) && entry >= xa_mk_internal(-MAX_ERRNO)); } /** * xa_err() - Turn an XArray result into an errno. * @entry: Result from calling an XArray function. * * If an XArray operation cannot complete an operation, it will return * a special pointer value which encodes an errno. This function extracts * the errno from the pointer value, or returns 0 if the pointer does not * represent an errno. * * Context: Any context. * Return: A negative errno or 0. */ static inline int xa_err(void *entry) { /* xa_to_internal() would not do sign extension. */ if (xa_is_err(entry)) return (long)entry >> 2; return 0; } /** * struct xa_limit - Represents a range of IDs. * @min: The lowest ID to allocate (inclusive). * @max: The maximum ID to allocate (inclusive). * * This structure is used either directly or via the XA_LIMIT() macro * to communicate the range of IDs that are valid for allocation. * Three common ranges are predefined for you: * * xa_limit_32b - [0 - UINT_MAX] * * xa_limit_31b - [0 - INT_MAX] * * xa_limit_16b - [0 - USHRT_MAX] */ struct xa_limit { u32 max; u32 min; }; #define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } #define xa_limit_32b XA_LIMIT(0, UINT_MAX) #define xa_limit_31b XA_LIMIT(0, INT_MAX) #define xa_limit_16b XA_LIMIT(0, USHRT_MAX) typedef unsigned __bitwise xa_mark_t; #define XA_MARK_0 ((__force xa_mark_t)0U) #define XA_MARK_1 ((__force xa_mark_t)1U) #define XA_MARK_2 ((__force xa_mark_t)2U) #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 #define XA_FREE_MARK XA_MARK_0 enum xa_lock_type { XA_LOCK_IRQ = 1, XA_LOCK_BH = 2, }; /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ #define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) #define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) #define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) #define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) /* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */ #define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) #define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. * * To use the xarray, define it statically or embed it in your data structure. * It is a very small data structure, so it does not usually make sense to * allocate it separately and keep a pointer to it in your data structure. * * You may use the xa_lock to protect your own data structures as well. */ /* * If all of the entries in the array are NULL, @xa_head is a NULL pointer. * If the only non-NULL entry in the array is at index 0, @xa_head is that * entry. If any other entry in the array is non-NULL, @xa_head points * to an @xa_node. */ struct xarray { spinlock_t xa_lock; /* private: The rest of the data structure is not to be used directly. */ gfp_t xa_flags; void __rcu * xa_head; }; #define XARRAY_INIT(name, flags) { \ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .xa_flags = flags, \ .xa_head = NULL, \ } /** * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. * @name: A string that names your XArray. * @flags: XA_FLAG values. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name and flags. It is * equivalent to calling xa_init_flags() on the array, but it does the * initialisation at compiletime instead of runtime. */ #define DEFINE_XARRAY_FLAGS(name, flags) \ struct xarray name = XARRAY_INIT(name, flags) /** * DEFINE_XARRAY() - Define an XArray. * @name: A string that names your XArray. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name. It is equivalent * to calling xa_init() on the array, but it does the initialisation at * compiletime instead of runtime. */ #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) /** * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) /** * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_erase(struct xarray *, unsigned long index); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); void *xa_find(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t); void xa_destroy(struct xarray *); /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. * @flags: XA_FLAG values. * * If you need to initialise an XArray with special flags (eg you need * to take the lock from interrupt context), use this function instead * of xa_init(). * * Context: Any context. */ static inline void xa_init_flags(struct xarray *xa, gfp_t flags) { spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; } /** * xa_init() - Initialise an empty XArray. * @xa: XArray. * * An empty XArray is full of NULL entries. * * Context: Any context. */ static inline void xa_init(struct xarray *xa) { xa_init_flags(xa, 0); } /** * xa_empty() - Determine if an array has any present entries. * @xa: XArray. * * Context: Any context. * Return: %true if the array contains only NULL pointers. */ static inline bool xa_empty(const struct xarray *xa) { return xa->xa_head == NULL; } /** * xa_marked() - Inquire whether any entry in this array has a mark set * @xa: Array * @mark: Mark value * * Context: Any context. * Return: %true if any entry has this mark set. */ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) { return xa->xa_flags & XA_FLAGS_MARK(mark); } /** * xa_for_each_range() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * @last: Last index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_range() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_range(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_range(xa, index, entry, start, last) \ for (index = start, \ entry = xa_find(xa, &index, last, XA_PRESENT); \ entry; \ entry = xa_find_after(xa, &index, last, XA_PRESENT)) /** * xa_for_each_start() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_start() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_start(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_start(xa, index, entry, start) \ xa_for_each_range(xa, index, entry, start, ULONG_MAX) /** * xa_for_each() - Iterate over present entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you want * to skip or reprocess indices. It is safe to modify the array during the * iteration. At the end of the iteration, @entry will be set to NULL and * @index will have a value less than or equal to max. * * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). xa_for_each() * will spin if it hits a retry entry; if you intend to see retry entries, * you should use the xas_for_each() iterator instead. The xas_for_each() * iterator will expand into more inline code than xa_for_each(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each(xa, index, entry) \ xa_for_each_start(xa, index, entry, 0) /** * xa_for_each_marked() - Iterate over marked entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @filter: Selection criterion. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. The iteration will skip all entries in the array * which do not match @filter. You may modify @index during the iteration * if you want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set to * NULL and @index will have a value less than or equal to max. * * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). * You have to handle your own locking with xas_for_each(), and if you have * to unlock after each iteration, it will also end up being O(n.log(n)). * xa_for_each_marked() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each_marked() iterator * instead. The xas_for_each_marked() iterator will expand into more inline * code than xa_for_each_marked(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) #define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) #define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) #define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) #define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) #define xa_lock_irqsave(xa, flags) \ spin_lock_irqsave(&(xa)->xa_lock, flags) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) #define xa_lock_nested(xa, subclass) \ spin_lock_nested(&(xa)->xa_lock, subclass) #define xa_lock_bh_nested(xa, subclass) \ spin_lock_bh_nested(&(xa)->xa_lock, subclass) #define xa_lock_irq_nested(xa, subclass) \ spin_lock_irq_nested(&(xa)->xa_lock, subclass) #define xa_lock_irqsave_nested(xa, flags, subclass) \ spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass) /* * Versions of the normal API which require the caller to hold the * xa_lock. If the GFP flags allow it, they will drop the lock to * allocate memory, then reacquire it afterwards. These functions * may also re-enable interrupts if the XArray flags indicate the * locking should be interrupt safe. */ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int __must_check __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t); int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, struct xa_limit, gfp_t); int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, struct xa_limit, u32 *next, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_store_bh() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_store_irq() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The entry which used to be at this index. */ static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) { void *entry; xa_lock_bh(xa); entry = __xa_erase(xa, index); xa_unlock_bh(xa); return entry; } /** * xa_erase_irq() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The entry which used to be at this index. */ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) { void *entry; xa_lock_irq(xa); entry = __xa_erase(xa, index); xa_unlock_irq(xa); return entry; } /** * xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock(xa); return curr; } /** * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_insert() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock(xa); return err; } /** * xa_insert_bh() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_bh(xa); return err; } /** * xa_insert_irq() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock(xa); return err; } /** * xa_alloc_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Note that callers interested in whether wrapping has occurred should * use __xa_alloc_cyclic() instead. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); return err < 0 ? err : 0; } /** * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Note that callers interested in whether wrapping has occurred should * use __xa_alloc_cyclic() instead. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); return err < 0 ? err : 0; } /** * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Note that callers interested in whether wrapping has occurred should * use __xa_alloc_cyclic() instead. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); return err < 0 ? err : 0; } /** * xa_reserve() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * Ensures there is somewhere to store an entry at @index in the array. * If there is already something stored at @index, this function does * nothing. If there was nothing there, the entry is marked as reserved. * Loading from a reserved entry returns a %NULL pointer. * * If you do not use the entry that you have reserved, call xa_release() * or xa_erase() to free any unnecessary memory. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_bh() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * A softirq-disabling version of xa_reserve(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_irq() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * An interrupt-disabling version of xa_reserve(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_release() - Release a reserved entry. * @xa: XArray. * @index: Index of entry. * * After calling xa_reserve(), you can call this function to release the * reservation. If the entry at @index has been stored to, this function * will do nothing. */ static inline void xa_release(struct xarray *xa, unsigned long index) { xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); } /* Everything below here is the Advanced API. Proceed with caution. */ /* * The xarray is constructed out of a set of 'chunks' of pointers. Choosing * the best chunk size requires some tradeoffs. A power of two recommends * itself so that we can walk the tree based purely on shifts and masks. * Generally, the larger the better; as the number of slots per level of the * tree increases, the less tall the tree needs to be. But that needs to be * balanced against the memory consumption of each node. On a 64-bit system, * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. */ #ifndef XA_CHUNK_SHIFT #define XA_CHUNK_SHIFT (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) #define XA_MAX_MARKS 3 #define XA_MARK_LONGS BITS_TO_LONGS(XA_CHUNK_SIZE) /* * @count is the count of every non-NULL element in the ->slots array * whether that is a value entry, a retry entry, a user pointer, * a sibling entry or a pointer to the next level of the tree. * @nr_values is the count of every element in ->slots which is * either a value entry or a sibling of a value entry. */ struct xa_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char nr_values; /* Value entry count */ struct xa_node __rcu *parent; /* NULL at top of tree */ struct xarray *array; /* The array we belong to */ union { struct list_head private_list; /* For tree user */ struct rcu_head rcu_head; /* Used when freeing node */ }; void __rcu *slots[XA_CHUNK_SIZE]; union { unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; }; }; void xa_dump(const struct xarray *); void xa_dump_node(const struct xa_node *); #ifdef XA_DEBUG #define XA_BUG_ON(xa, x) do { \ if (x) { \ xa_dump(xa); \ BUG(); \ } \ } while (0) #define XA_NODE_BUG_ON(node, x) do { \ if (x) { \ if (node) xa_dump_node(node); \ BUG(); \ } \ } while (0) #else #define XA_BUG_ON(xa, x) do { } while (0) #define XA_NODE_BUG_ON(node, x) do { } while (0) #endif /* Private */ static inline void *xa_head(const struct xarray *xa) { return rcu_dereference_check(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_head_locked(const struct xarray *xa) { return rcu_dereference_protected(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_check(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry_locked(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_protected(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_check(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_protected(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_mk_node(const struct xa_node *node) { return (void *)((unsigned long)node | 2); } /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { return (struct xa_node *)((unsigned long)entry - 2); } /* Private */ static inline bool xa_is_node(const void *entry) { return xa_is_internal(entry) && (unsigned long)entry > 4096; } /* Private */ static inline void *xa_mk_sibling(unsigned int offset) { return xa_mk_internal(offset); } /* Private */ static inline unsigned long xa_to_sibling(const void *entry) { return xa_to_internal(entry); } /** * xa_is_sibling() - Is the entry a sibling entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a sibling entry. */ static inline bool xa_is_sibling(const void *entry) { return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); } #define XA_RETRY_ENTRY xa_mk_internal(256) /** * xa_is_retry() - Is the entry a retry entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a retry entry. */ static inline bool xa_is_retry(const void *entry) { return unlikely(entry == XA_RETRY_ENTRY); } /** * xa_is_advanced() - Is the entry only permitted for the advanced API? * @entry: Entry to be stored in the XArray. * * Return: %true if the entry cannot be stored by the normal API. */ static inline bool xa_is_advanced(const void *entry) { return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); } /** * typedef xa_update_node_t - A callback function from the XArray. * @node: The node which is being processed * * This function is called every time the XArray updates the count of * present and value entries in a node. It allows advanced users to * maintain the private_list in the node. * * Context: The xa_lock is held and interrupts may be disabled. * Implementations should not drop the xa_lock, nor re-enable * interrupts. */ typedef void (*xa_update_node_t)(struct xa_node *node); void xa_delete_node(struct xa_node *, xa_update_node_t); /* * The xa_state is opaque to its users. It contains various different pieces * of state involved in the current operation on the XArray. It should be * declared on the stack and passed between the various internal routines. * The various elements in it should not be accessed directly, but only * through the provided accessor functions. The below documentation is for * the benefit of those working on the code, not for users of the XArray. * * @xa_node usually points to the xa_node containing the slot we're operating * on (and @xa_offset is the offset in the slots array). If there is a * single entry in the array at index 0, there are no allocated xa_nodes to * point to, and so we store %NULL in @xa_node. @xa_node is set to * the value %XAS_RESTART if the xa_state is not walked to the correct * position in the tree of nodes for this operation. If an error occurs * during an operation, it is set to an %XAS_ERROR value. If we run off the * end of the allocated nodes, it is set to %XAS_BOUNDS. */ struct xa_state { struct xarray *xa; unsigned long xa_index; unsigned char xa_shift; unsigned char xa_sibs; unsigned char xa_offset; unsigned char xa_pad; /* Helps gcc generate better code */ struct xa_node *xa_node; struct xa_node *xa_alloc; xa_update_node_t xa_update; struct list_lru *xa_lru; }; /* * We encode errnos in the xas->xa_node. If an error has happened, we need to * drop the lock to fix it, and once we've done so the xa_state is invalid. */ #define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) #define XAS_BOUNDS ((struct xa_node *)1UL) #define XAS_RESTART ((struct xa_node *)3UL) #define __XA_STATE(array, index, shift, sibs) { \ .xa = array, \ .xa_index = index, \ .xa_shift = shift, \ .xa_sibs = sibs, \ .xa_offset = 0, \ .xa_pad = 0, \ .xa_node = XAS_RESTART, \ .xa_alloc = NULL, \ .xa_update = NULL, \ .xa_lru = NULL, \ } /** * XA_STATE() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * * Declare and initialise an xa_state on the stack. */ #define XA_STATE(name, array, index) \ struct xa_state name = __XA_STATE(array, index, 0, 0) /** * XA_STATE_ORDER() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * @order: Order of entry. * * Declare and initialise an xa_state on the stack. This variant of * XA_STATE() allows you to specify the 'order' of the element you * want to operate on.` */ #define XA_STATE_ORDER(name, array, index, order) \ struct xa_state name = __XA_STATE(array, \ (index >> order) << order, \ order - (order % XA_CHUNK_SHIFT), \ (1U << (order % XA_CHUNK_SHIFT)) - 1) #define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) #define xas_trylock(xas) xa_trylock((xas)->xa) #define xas_lock(xas) xa_lock((xas)->xa) #define xas_unlock(xas) xa_unlock((xas)->xa) #define xas_lock_bh(xas) xa_lock_bh((xas)->xa) #define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) #define xas_lock_irq(xas) xa_lock_irq((xas)->xa) #define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) #define xas_lock_irqsave(xas, flags) \ xa_lock_irqsave((xas)->xa, flags) #define xas_unlock_irqrestore(xas, flags) \ xa_unlock_irqrestore((xas)->xa, flags) /** * xas_error() - Return an errno stored in the xa_state. * @xas: XArray operation state. * * Return: 0 if no error has been noted. A negative errno if one has. */ static inline int xas_error(const struct xa_state *xas) { return xa_err(xas->xa_node); } /** * xas_set_err() - Note an error in the xa_state. * @xas: XArray operation state. * @err: Negative error number. * * Only call this function with a negative @err; zero or positive errors * will probably not behave the way you think they should. If you want * to clear the error from an xa_state, use xas_reset(). */ static inline void xas_set_err(struct xa_state *xas, long err) { xas->xa_node = XA_ERROR(err); } /** * xas_invalid() - Is the xas in a retry or error state? * @xas: XArray operation state. * * Return: %true if the xas cannot be used for operations. */ static inline bool xas_invalid(const struct xa_state *xas) { return (unsigned long)xas->xa_node & 3; } /** * xas_valid() - Is the xas a valid cursor into the array? * @xas: XArray operation state. * * Return: %true if the xas can be used for operations. */ static inline bool xas_valid(const struct xa_state *xas) { return !xas_invalid(xas); } /** * xas_is_node() - Does the xas point to a node? * @xas: XArray operation state. * * Return: %true if the xas currently references a node. */ static inline bool xas_is_node(const struct xa_state *xas) { return xas_valid(xas) && xas->xa_node; } /* True if the pointer is something other than a node */ static inline bool xas_not_node(struct xa_node *node) { return ((unsigned long)node & 3) || !node; } /* True if the node represents RESTART or an error */ static inline bool xas_frozen(struct xa_node *node) { return (unsigned long)node & 2; } /* True if the node represents head-of-tree, RESTART or BOUNDS */ static inline bool xas_top(struct xa_node *node) { return node <= XAS_RESTART; } /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. * * Resets the error or walk state of the @xas so future walks of the * array will start from the root. Use this if you have dropped the * xarray lock and want to reuse the xa_state. * * Context: Any context. */ static inline void xas_reset(struct xa_state *xas) { xas->xa_node = XAS_RESTART; } /** * xas_retry() - Retry the operation if appropriate. * @xas: XArray operation state. * @entry: Entry from xarray. * * The advanced functions may sometimes return an internal entry, such as * a retry entry or a zero entry. This function sets up the @xas to restart * the walk from the head of the array if needed. * * Context: Any context. * Return: true if the operation needs to be retried. */ static inline bool xas_retry(struct xa_state *xas, const void *entry) { if (xa_is_zero(entry)) return true; if (!xa_is_retry(entry)) return false; xas_reset(xas); return true; } void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); void *xas_find(struct xa_state *, unsigned long max); void *xas_find_conflict(struct xa_state *); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); void xas_destroy(struct xa_state *); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); void xas_try_split(struct xa_state *xas, void *entry, unsigned int order); unsigned int xas_try_split_min_order(unsigned int order); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } static inline int xas_get_order(struct xa_state *xas) { return 0; } static inline void xas_split(struct xa_state *xas, void *entry, unsigned int order) { xas_store(xas, entry); } static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } static inline void xas_try_split(struct xa_state *xas, void *entry, unsigned int order) { } static inline unsigned int xas_try_split_min_order(unsigned int order) { return 0; } #endif /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. * * Use this function to check that a previously loaded entry still has * the same value. This is useful for the lockless pagecache lookup where * we walk the array with only the RCU lock to protect us, lock the page, * then check that the page hasn't moved since we looked it up. * * The caller guarantees that @xas is still valid. If it may be in an * error or restart state, call xas_load() instead. * * Return: The entry at this location in the xarray. */ static inline void *xas_reload(struct xa_state *xas) { struct xa_node *node = xas->xa_node; void *entry; char offset; if (!node) return xa_head(xas->xa); if (IS_ENABLED(CONFIG_XARRAY_MULTI)) { offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK; entry = xa_entry(xas->xa, node, offset); if (!xa_is_sibling(entry)) return entry; offset = xa_to_sibling(entry); } else { offset = xas->xa_offset; } return xa_entry(xas->xa, node, offset); } /** * xas_set() - Set up XArray operation state for a different index. * @xas: XArray operation state. * @index: New index into the XArray. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see xas_next() * to move to an adjacent index. */ static inline void xas_set(struct xa_state *xas, unsigned long index) { xas->xa_index = index; xas->xa_node = XAS_RESTART; } /** * xas_advance() - Skip over sibling entries. * @xas: XArray operation state. * @index: Index of last sibling entry. * * Move the operation state to refer to the last sibling entry. * This is useful for loops that normally want to see sibling * entries but sometimes want to skip them. Use xas_set() if you * want to move to an index which is not part of this entry. */ static inline void xas_advance(struct xa_state *xas, unsigned long index) { unsigned char shift = xas_is_node(xas) ? xas->xa_node->shift : 0; xas->xa_index = index; xas->xa_offset = (index >> shift) & XA_CHUNK_MASK; } /** * xas_set_order() - Set up XArray operation state for a multislot entry. * @xas: XArray operation state. * @index: Target of the operation. * @order: Entry occupies 2^@order indices. */ static inline void xas_set_order(struct xa_state *xas, unsigned long index, unsigned int order) { #ifdef CONFIG_XARRAY_MULTI xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; xas->xa_shift = order - (order % XA_CHUNK_SHIFT); xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; xas->xa_node = XAS_RESTART; #else BUG_ON(order > 0); xas_set(xas, index); #endif } /** * xas_set_update() - Set up XArray operation state for a callback. * @xas: XArray operation state. * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. * This is advanced functionality and is only needed by the page * cache and swap cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { xas->xa_update = update; } static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru) { xas->xa_lru = lru; } /** * xas_next_entry() - Advance iterator to next present entry. * @xas: XArray operation state. * @max: Highest index to return. * * xas_next_entry() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find(), and will call xas_find() * for all the hard cases. * * Return: The next present entry after the one currently referred to by @xas. */ static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) { struct xa_node *node = xas->xa_node; void *entry; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) return xas_find(xas, max); do { if (unlikely(xas->xa_index >= max)) return xas_find(xas, max); if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) return xas_find(xas, max); entry = xa_entry(xas->xa, node, xas->xa_offset + 1); if (unlikely(xa_is_internal(entry))) return xas_find(xas, max); xas->xa_offset++; xas->xa_index++; } while (!entry); return entry; } /* Private */ static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, xa_mark_t mark) { unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; unsigned int offset = xas->xa_offset; if (advance) offset++; if (XA_CHUNK_SIZE == BITS_PER_LONG) { if (offset < XA_CHUNK_SIZE) { unsigned long data = *addr & (~0UL << offset); if (data) return __ffs(data); } return XA_CHUNK_SIZE; } return find_next_bit(addr, XA_CHUNK_SIZE, offset); } /** * xas_next_marked() - Advance iterator to next marked entry. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark to search for. * * xas_next_marked() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find_marked(), and will call * xas_find_marked() for all the hard cases. * * Return: The next marked entry after the one currently referred to by @xas. */ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { struct xa_node *node = xas->xa_node; void *entry; unsigned int offset; if (unlikely(xas_not_node(node) || node->shift)) return xas_find_marked(xas, max, mark); offset = xas_find_chunk(xas, true, mark); xas->xa_offset = offset; xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; if (xas->xa_index > max) return NULL; if (offset == XA_CHUNK_SIZE) return xas_find_marked(xas, max, mark); entry = xa_entry(xas->xa, node, offset); if (!entry) return xas_find_marked(xas, max, mark); return entry; } /* * If iterating while holding a lock, drop the lock and reschedule * every %XA_CHECK_SCHED loops. */ enum { XA_CHECK_SCHED = 4096, }; /** * xas_for_each() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * * The loop body will be executed for each entry present in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each(xas, entry, max) \ for (entry = xas_find(xas, max); entry; \ entry = xas_next_entry(xas, max)) /** * xas_for_each_marked() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * @mark: Mark to search for. * * The loop body will be executed for each marked entry in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each_marked(xas, entry, max, mark) \ for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) /** * xas_for_each_conflict() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * * The loop body will be executed for each entry in the XArray that * lies within the range specified by @xas. If the loop terminates * normally, @entry will be %NULL. The user may break out of the loop, * which will leave @entry set to the conflicting entry. The caller * may also call xa_set_err() to exit the loop while setting an error * to record the reason. */ #define xas_for_each_conflict(xas, entry) \ while ((entry = xas_find_conflict(xas))) void *__xas_next(struct xa_state *); void *__xas_prev(struct xa_state *); /** * xas_prev() - Move iterator to previous index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * subtracted from the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index 0, this function wraps * around to %ULONG_MAX. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_prev(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == 0)) return __xas_prev(xas); xas->xa_index--; xas->xa_offset--; return xa_entry(xas->xa, node, xas->xa_offset); } /** * xas_next() - Move state to next index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * added to the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index %ULONG_MAX, this function wraps * around to 0. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_next(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == XA_CHUNK_MASK)) return __xas_next(xas); xas->xa_index++; xas->xa_offset++; return xa_entry(xas->xa, node, xas->xa_offset); } #endif /* _LINUX_XARRAY_H */
207 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 /* SPDX-License-Identifier: GPL-2.0 */ /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some * particular ordering. One way to make the compiler aware of ordering is to * put the two invocations of READ_ONCE or WRITE_ONCE in different C * statements. * * These two macros will also work on aggregate data types like structs or * unions. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, * and (2) Ensuring that the compiler does not fold, spindle, or otherwise * mutilate accesses that either do not require ordering or that interact * with an explicit memory barrier or atomic instruction that provides the * required ordering. */ #ifndef __ASM_GENERIC_RWONCE_H #define __ASM_GENERIC_RWONCE_H #ifndef __ASSEMBLY__ #include <linux/compiler_types.h> #include <linux/kasan-checks.h> #include <linux/kcsan-checks.h> /* * Yes, this permits 64-bit accesses on 32-bit architectures. These will * actually be atomic in some cases (namely Armv7 + LPAE), but for others we * rely on the access being split into 2x32-bit accesses for a 32-bit quantity * (e.g. a virtual address) and a strong prevailing wind. */ #define compiletime_assert_rwonce_type(t) \ compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ "Unsupported access size for {READ,WRITE}_ONCE().") /* * Use __READ_ONCE() instead of READ_ONCE() if you do not require any * atomicity. Note that this may result in tears! */ #ifndef __READ_ONCE #define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) #endif #define READ_ONCE(x) \ ({ \ compiletime_assert_rwonce_type(x); \ __READ_ONCE(x); \ }) #define __WRITE_ONCE(x, val) \ do { \ *(volatile typeof(x) *)&(x) = (val); \ } while (0) #define WRITE_ONCE(x, val) \ do { \ compiletime_assert_rwonce_type(x); \ __WRITE_ONCE(x, val); \ } while (0) static __no_sanitize_or_inline unsigned long __read_once_word_nocheck(const void *addr) { return __READ_ONCE(*(unsigned long *)addr); } /* * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a * word from memory atomically but without telling KASAN/KCSAN. This is * usually used by unwinding code when walking the stack of a running process. */ #define READ_ONCE_NOCHECK(x) \ ({ \ compiletime_assert(sizeof(x) == sizeof(unsigned long), \ "Unsupported access size for READ_ONCE_NOCHECK()."); \ (typeof(x))__read_once_word_nocheck(&(x)); \ }) static __no_sanitize_or_inline unsigned long read_word_at_a_time(const void *addr) { /* open-coded instrument_read(addr, 1) */ kasan_check_read(addr, 1); kcsan_check_read(addr, 1); /* * This load can race with concurrent stores to out-of-bounds memory, * but READ_ONCE() can't be used because it requires higher alignment * than plain loads in arm64 builds with LTO. */ return *(unsigned long *)addr; } #endif /* __ASSEMBLY__ */ #endif /* __ASM_GENERIC_RWONCE_H */
245 243 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/madvise.c * * Copyright (C) 1999 Linus Torvalds * Copyright (C) 2002 Christoph Hellwig */ #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/mempolicy.h> #include <linux/page-isolation.h> #include <linux/page_idle.h> #include <linux/userfaultfd_k.h> #include <linux/hugetlb.h> #include <linux/falloc.h> #include <linux/fadvise.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mm_inline.h> #include <linux/string.h> #include <linux/uio.h> #include <linux/ksm.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/pagewalk.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> #include <asm/tlb.h> #include "internal.h" #include "swap.h" /* * Maximum number of attempts we make to install guard pages before we give up * and return -ERESTARTNOINTR to have userspace try again. */ #define MAX_MADVISE_GUARD_RETRIES 3 struct madvise_walk_private { struct mmu_gather *tlb; bool pageout; }; struct madvise_behavior { int behavior; struct mmu_gather *tlb; }; /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. Others, which simply traverse vmas, need * to only take it for reading. */ static int madvise_need_mmap_write(int behavior) { switch (behavior) { case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_COLD: case MADV_PAGEOUT: case MADV_FREE: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: case MADV_COLLAPSE: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ return 1; } } #ifdef CONFIG_ANON_VMA_NAME struct anon_vma_name *anon_vma_name_alloc(const char *name) { struct anon_vma_name *anon_name; size_t count; /* Add 1 for NUL terminator at the end of the anon_name->name */ count = strlen(name) + 1; anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); if (anon_name) { kref_init(&anon_name->kref); memcpy(anon_name->name, name, count); } return anon_name; } void anon_vma_name_free(struct kref *kref) { struct anon_vma_name *anon_name = container_of(kref, struct anon_vma_name, kref); kfree(anon_name); } struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); return vma->anon_name; } /* mmap_lock should be write-locked */ static int replace_anon_vma_name(struct vm_area_struct *vma, struct anon_vma_name *anon_name) { struct anon_vma_name *orig_name = anon_vma_name(vma); if (!anon_name) { vma->anon_name = NULL; anon_vma_name_put(orig_name); return 0; } if (anon_vma_name_eq(orig_name, anon_name)) return 0; vma->anon_name = anon_vma_name_reuse(anon_name); anon_vma_name_put(orig_name); return 0; } #else /* CONFIG_ANON_VMA_NAME */ static int replace_anon_vma_name(struct vm_area_struct *vma, struct anon_vma_name *anon_name) { if (anon_name) return -EINVAL; return 0; } #endif /* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_lock held for writing; * Caller should ensure anon_name stability by raising its refcount even when * anon_name belongs to a valid vma because this function might free that vma. */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long new_flags, struct anon_vma_name *anon_name) { struct mm_struct *mm = vma->vm_mm; int error; VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; return 0; } vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags, anon_name); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); vm_flags_reset(vma, new_flags); if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) return error; } return 0; } #ifdef CONFIG_SWAP static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->private; struct swap_iocb *splug = NULL; pte_t *ptep = NULL; spinlock_t *ptl; unsigned long addr; for (addr = start; addr < end; addr += PAGE_SIZE) { pte_t pte; swp_entry_t entry; struct folio *folio; if (!ptep++) { ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!ptep) break; } pte = ptep_get(ptep); if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) continue; pte_unmap_unlock(ptep, ptl); ptep = NULL; folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma, addr, &splug); if (folio) folio_put(folio); } if (ptep) pte_unmap_unlock(ptep, ptl); swap_read_unplug(splug); cond_resched(); return 0; } static const struct mm_walk_ops swapin_walk_ops = { .pmd_entry = swapin_walk_pmd_entry, .walk_lock = PGWALK_RDLOCK, }; static void shmem_swapin_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); pgoff_t end_index = linear_page_index(vma, end) - 1; struct folio *folio; struct swap_iocb *splug = NULL; rcu_read_lock(); xas_for_each(&xas, folio, end_index) { unsigned long addr; swp_entry_t entry; if (!xa_is_value(folio)) continue; entry = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(entry)) continue; addr = vma->vm_start + ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT); xas_pause(&xas); rcu_read_unlock(); folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping), vma, addr, &splug); if (folio) folio_put(folio); rcu_read_lock(); } rcu_read_unlock(); swap_read_unplug(splug); } #endif /* CONFIG_SWAP */ /* * Schedule all required I/O operations. Do not wait for completion. */ static long madvise_willneed(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; loff_t offset; *prev = vma; #ifdef CONFIG_SWAP if (!file) { walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } if (shmem_mapping(file->f_mapping)) { shmem_swapin_range(vma, start, end, file->f_mapping); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } #else if (!file) return -EBADF; #endif if (IS_DAX(file_inode(file))) { /* no bad return value, but ignore advice */ return 0; } /* * Filesystem's fadvise may need to take various locks. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ *prev = NULL; /* tell sys_madvise we drop mmap_lock */ get_file(file); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); mmap_read_unlock(mm); vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); fput(file); mmap_read_lock(mm); return 0; } static inline bool can_do_file_pageout(struct vm_area_struct *vma) { if (!vma->vm_file) return false; /* * paging out pagecache only for non-anonymous mappings that correspond * to the files the calling process could (if tried) open for writing; * otherwise we'd be including shared non-exclusive mappings, which * opens a side channel. */ return inode_owner_or_capable(&nop_mnt_idmap, file_inode(vma->vm_file)) || file_permission(vma->vm_file, MAY_WRITE) == 0; } static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, struct folio *folio, pte_t *ptep, pte_t pte, bool *any_young, bool *any_dirty) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; int max_nr = (end - addr) / PAGE_SIZE; return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, any_young, any_dirty); } static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct madvise_walk_private *private = walk->private; struct mmu_gather *tlb = private->tlb; bool pageout = private->pageout; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; pte_t *start_pte, *pte, ptent; spinlock_t *ptl; struct folio *folio = NULL; LIST_HEAD(folio_list); bool pageout_anon_only_filter; unsigned int batch_count = 0; int nr; if (fatal_signal_pending(current)) return -EINTR; pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && !can_do_file_pageout(vma); #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { pmd_t orig_pmd; unsigned long next = pmd_addr_end(addr, end); tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; orig_pmd = *pmd; if (is_huge_zero_pmd(orig_pmd)) goto huge_unlock; if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(orig_pmd)); goto huge_unlock; } folio = pmd_folio(orig_pmd); /* Do not interfere with other mappings of this folio */ if (folio_maybe_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) goto huge_unlock; if (next - addr != HPAGE_PMD_SIZE) { int err; folio_get(folio); spin_unlock(ptl); folio_lock(folio); err = split_folio(folio); folio_unlock(folio); folio_put(folio); if (!err) goto regular_folio; return 0; } if (!pageout && pmd_young(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } folio_clear_referenced(folio); folio_test_clear_young(folio); if (folio_test_active(folio)) folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else list_add(&folio->lru, &folio_list); } } else folio_deactivate(folio); huge_unlock: spin_unlock(ptl); if (pageout) reclaim_pages(&folio_list); return 0; } regular_folio: #endif tlb_change_page_size(tlb, PAGE_SIZE); restart: start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { nr = 1; ptent = ptep_get(pte); if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; } } if (pte_none(ptent)) continue; if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be swapped out whole. If we * fail to split a folio, leave it in place and advance to the * next pte in the range. */ if (folio_test_large(folio)) { bool any_young; nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent, &any_young, NULL); if (any_young) ptent = pte_mkyoung(ptent); if (nr < folio_nr_pages(folio)) { int err; if (folio_maybe_mapped_shared(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!folio_trylock(folio)) continue; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } } /* * Do not interfere with other mappings of this folio and * non-LRU folio. If we have a large folio at this point, we * know it is fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive. */ if (!folio_test_lru(folio) || folio_mapcount(folio) != folio_nr_pages(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!pageout && pte_young(ptent)) { clear_young_dirty_ptes(vma, addr, pte, nr, CYDP_CLEAR_YOUNG); tlb_remove_tlb_entries(tlb, pte, nr, addr); } /* * We are deactivating a folio for accelerating reclaiming. * VM couldn't reclaim the folio unless we clear PG_young. * As a side effect, it makes confuse idle-page tracking * because they will miss recent referenced history. */ folio_clear_referenced(folio); folio_test_clear_young(folio); if (folio_test_active(folio)) folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else list_add(&folio->lru, &folio_list); } } else folio_deactivate(folio); } if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); } if (pageout) reclaim_pages(&folio_list); cond_resched(); return 0; } static const struct mm_walk_ops cold_walk_ops = { .pmd_entry = madvise_cold_or_pageout_pte_range, .walk_lock = PGWALK_RDLOCK, }; static void madvise_cold_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct madvise_walk_private walk_private = { .pageout = false, .tlb = tlb, }; tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); } static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; lru_add_drain(); tlb_gather_mmu(&tlb, mm); madvise_cold_page_range(&tlb, vma, start_addr, end_addr); tlb_finish_mmu(&tlb); return 0; } static void madvise_pageout_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct madvise_walk_private walk_private = { .pageout = true, .tlb = tlb, }; tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } static long madvise_pageout(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; /* * If the VMA belongs to a private file mapping, there can be private * dirty pages which can be paged out if even this process is neither * owner nor write capable of the file. We allow private file mappings * further to pageout dirty anon pages. */ if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && (vma->vm_flags & VM_MAYSHARE))) return 0; lru_add_drain(); tlb_gather_mmu(&tlb, mm); madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); tlb_finish_mmu(&tlb); return 0; } static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY; struct mmu_gather *tlb = walk->private; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte, ptent; struct folio *folio; int nr_swap = 0; unsigned long next; int nr, max_nr; next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) return 0; tlb_change_page_size(tlb, PAGE_SIZE); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) continue; /* * If the pte has swp_entry, just clear page table to * prevent swap-in which is more expensive rather than * (page allocation + zeroing). */ if (!pte_present(ptent)) { swp_entry_t entry; entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; free_swap_and_cache_nr(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be marked as lazyfree. If we * fail to split a folio, leave it in place and advance to the * next pte in the range. */ if (folio_test_large(folio)) { bool any_young, any_dirty; nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent, &any_young, &any_dirty); if (nr < folio_nr_pages(folio)) { int err; if (folio_maybe_mapped_shared(folio)) continue; if (!folio_trylock(folio)) continue; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); start_pte = pte; if (!start_pte) break; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } if (any_young) ptent = pte_mkyoung(ptent); if (any_dirty) ptent = pte_mkdirty(ptent); } if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { if (!folio_trylock(folio)) continue; /* * If we have a large folio at this point, we know it is * fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive. */ if (folio_mapcount(folio) != folio_nr_pages(folio)) { folio_unlock(folio); continue; } if (folio_test_swapcache(folio) && !folio_free_swap(folio)) { folio_unlock(folio); continue; } folio_clear_dirty(folio); folio_unlock(folio); } if (pte_young(ptent) || pte_dirty(ptent)) { clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); tlb_remove_tlb_entries(tlb, pte, nr, addr); } folio_mark_lazyfree(folio); } if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); } cond_resched(); return 0; } static const struct mm_walk_ops madvise_free_walk_ops = { .pmd_entry = madvise_free_pte_range, .walk_lock = PGWALK_RDLOCK, }; static int madvise_free_single_vma(struct madvise_behavior *madv_behavior, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; struct mmu_gather *tlb = madv_behavior->tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) return -EINVAL; range.start = max(vma->vm_start, start_addr); if (range.start >= vma->vm_end) return -EINVAL; range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, range.start, range.end); lru_add_drain(); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, range.start, range.end, &madvise_free_walk_ops, tlb); tlb_end_vma(tlb, vma); mmu_notifier_invalidate_range_end(&range); return 0; } /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The * zap_page_range_single call sets things up for shrink_active_list to actually * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for * applications like large transactional databases which want to discard * pages in anonymous maps after committing to backing store the data * that was kept in them. There is no reason to write this data out to * the swap area if the application is discarding it. * * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior, struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct zap_details details = { .reclaim_pt = true, .even_cows = true, }; zap_page_range_single_batched( madv_behavior->tlb, vma, start, end - start, &details); return 0; } static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, unsigned long start, unsigned long *end, int behavior) { if (!is_vm_hugetlb_page(vma)) { unsigned int forbidden = VM_PFNMAP; if (behavior != MADV_DONTNEED_LOCKED) forbidden |= VM_LOCKED; return !(vma->vm_flags & forbidden); } if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) return false; if (start & ~huge_page_mask(hstate_vma(vma))) return false; /* * Madvise callers expect the length to be rounded up to PAGE_SIZE * boundaries, and may be unaware that this VMA uses huge pages. * Avoid unexpected data loss by rounding down the number of * huge pages freed. */ *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); return true; } static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct madvise_behavior *madv_behavior) { int behavior = madv_behavior->behavior; struct mm_struct *mm = vma->vm_mm; *prev = vma; if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; if (start == end) return 0; if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ mmap_read_lock(mm); vma = vma_lookup(mm, start); if (!vma) return -ENOMEM; /* * Potential end adjustment for hugetlb vma is OK as * the check below keeps end within vma. */ if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; if (end > vma->vm_end) { /* * Don't fail if end > vma->vm_end. If the old * vma was split while the mmap_lock was * released the effect of the concurrent * operation may not cause madvise() to * have an undefined result. There may be an * adjacent next vma that we'll walk * next. userfaultfd_remove() will generate an * UFFD_EVENT_REMOVE repetition on the * end-vma->vm_end range, but the manager can * handle a repetition fine. */ end = vma->vm_end; } /* * If the memory region between start and end was * originally backed by 4kB pages and then remapped to * be backed by hugepages while mmap_lock was dropped, * the adjustment for hugetlb vma above may have rounded * end down to the start address. */ if (start == end) return 0; VM_WARN_ON(start > end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) return madvise_dontneed_single_vma( madv_behavior, vma, start, end); else if (behavior == MADV_FREE) return madvise_free_single_vma(madv_behavior, vma, start, end); else return -EINVAL; } static long madvise_populate(struct mm_struct *mm, unsigned long start, unsigned long end, int behavior) { const bool write = behavior == MADV_POPULATE_WRITE; int locked = 1; long pages; while (start < end) { /* Populate (prefault) page tables readable/writable. */ pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; } if (pages < 0) { switch (pages) { case -EINTR: return -EINTR; case -EINVAL: /* Incompatible mappings / permissions. */ return -EINVAL; case -EHWPOISON: return -EHWPOISON; case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ return -EFAULT; default: pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); fallthrough; case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM; } } start += pages * PAGE_SIZE; } return 0; } /* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. */ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { loff_t offset; int error; struct file *f; struct mm_struct *mm = vma->vm_mm; *prev = NULL; /* tell sys_madvise we drop mmap_lock */ if (vma->vm_flags & VM_LOCKED) return -EINVAL; f = vma->vm_file; if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL; } if (!vma_is_shared_maywrite(vma)) return -EACCES; offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* * Filesystem's fallocate may need to take i_rwsem. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ get_file(f); if (userfaultfd_remove(vma, start, end)) { /* mmap_lock was not released by userfaultfd_remove() */ mmap_read_unlock(mm); } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); mmap_read_lock(mm); return error; } static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) { vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; /* * A user could lock after setting a guard range but that's fine, as * they'd not be able to fault in. The issue arises when we try to zap * existing locked VMAs. We don't want to do that. */ if (!allow_locked) disallowed |= VM_LOCKED; return !(vma->vm_flags & disallowed); } static bool is_guard_pte_marker(pte_t ptent) { return is_pte_marker(ptent) && is_guard_swp_entry(pte_to_swp_entry(ptent)); } static int guard_install_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge return >0 so we abort the operation + zap. */ return pud_trans_huge(pudval) || pud_devmap(pudval); } static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge return >0 so we abort the operation + zap. */ return pmd_trans_huge(pmdval) || pmd_devmap(pmdval); } static int guard_install_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t pteval = ptep_get(pte); unsigned long *nr_pages = (unsigned long *)walk->private; /* If there is already a guard page marker, we have nothing to do. */ if (is_guard_pte_marker(pteval)) { (*nr_pages)++; return 0; } /* If populated return >0 so we abort the operation + zap. */ return 1; } static int guard_install_set_pte(unsigned long addr, unsigned long next, pte_t *ptep, struct mm_walk *walk) { unsigned long *nr_pages = (unsigned long *)walk->private; /* Simply install a PTE marker, this causes segfault on access. */ *ptep = make_pte_marker(PTE_MARKER_GUARD); (*nr_pages)++; return 0; } static const struct mm_walk_ops guard_install_walk_ops = { .pud_entry = guard_install_pud_entry, .pmd_entry = guard_install_pmd_entry, .pte_entry = guard_install_pte_entry, .install_pte = guard_install_set_pte, .walk_lock = PGWALK_RDLOCK, }; static long madvise_guard_install(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { long err; int i; *prev = vma; if (!is_valid_guard_vma(vma, /* allow_locked = */false)) return -EINVAL; /* * If we install guard markers, then the range is no longer * empty from a page table perspective and therefore it's * appropriate to have an anon_vma. * * This ensures that on fork, we copy page tables correctly. */ err = anon_vma_prepare(vma); if (err) return err; /* * Optimistically try to install the guard marker pages first. If any * non-guard pages are encountered, give up and zap the range before * trying again. * * We try a few times before giving up and releasing back to userland to * loop around, releasing locks in the process to avoid contention. This * would only happen if there was a great many racing page faults. * * In most cases we should simply install the guard markers immediately * with no zap or looping. */ for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { unsigned long nr_pages = 0; /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ err = walk_page_range_mm(vma->vm_mm, start, end, &guard_install_walk_ops, &nr_pages); if (err < 0) return err; if (err == 0) { unsigned long nr_expected_pages = PHYS_PFN(end - start); VM_WARN_ON(nr_pages != nr_expected_pages); return 0; } /* * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ zap_page_range_single(vma, start, end - start, NULL); } /* * We were unable to install the guard pages due to being raced by page * faults. This should not happen ordinarily. We return to userspace and * immediately retry, relieving lock contention. */ return restart_syscall(); } static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge, cannot have guard pages present, so no-op - skip. */ if (pud_trans_huge(pudval) || pud_devmap(pudval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge, cannot have guard pages present, so no-op - skip. */ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t ptent = ptep_get(pte); if (is_guard_pte_marker(ptent)) { /* Simply clear the PTE marker. */ pte_clear_not_present_full(walk->mm, addr, pte, false); update_mmu_cache(walk->vma, addr, pte); } return 0; } static const struct mm_walk_ops guard_remove_walk_ops = { .pud_entry = guard_remove_pud_entry, .pmd_entry = guard_remove_pmd_entry, .pte_entry = guard_remove_pte_entry, .walk_lock = PGWALK_RDLOCK, }; static long madvise_guard_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { *prev = vma; /* * We're ok with removing guards in mlock()'d ranges, as this is a * non-destructive action. */ if (!is_valid_guard_vma(vma, /* allow_locked = */true)) return -EINVAL; return walk_page_range(vma->vm_mm, start, end, &guard_remove_walk_ops, NULL); } /* * Apply an madvise behavior to a region of a vma. madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own * behavior. */ static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, void *behavior_arg) { struct madvise_behavior *arg = behavior_arg; int behavior = arg->behavior; int error; struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; if (unlikely(!can_modify_vma_madv(vma, behavior))) return -EPERM; switch (behavior) { case MADV_REMOVE: return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_COLD: return madvise_cold(vma, prev, start, end); case MADV_PAGEOUT: return madvise_pageout(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(vma, prev, start, end, arg); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; case MADV_SEQUENTIAL: new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; break; case MADV_RANDOM: new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; break; case MADV_DONTFORK: new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: if (vma->vm_flags & VM_IO) return -EINVAL; new_flags &= ~VM_DONTCOPY; break; case MADV_WIPEONFORK: /* MADV_WIPEONFORK is only supported on anonymous memory. */ if (vma->vm_file || vma->vm_flags & VM_SHARED) return -EINVAL; new_flags |= VM_WIPEONFORK; break; case MADV_KEEPONFORK: if (vma->vm_flags & VM_DROPPABLE) return -EINVAL; new_flags &= ~VM_WIPEONFORK; break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) || (vma->vm_flags & VM_DROPPABLE)) return -EINVAL; new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); if (error) goto out; break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); if (error) goto out; break; case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); case MADV_GUARD_INSTALL: return madvise_guard_install(vma, prev, start, end); case MADV_GUARD_REMOVE: return madvise_guard_remove(vma, prev, start, end); } anon_name = anon_vma_name(vma); anon_vma_name_get(anon_name); error = madvise_update_vma(vma, prev, start, end, new_flags, anon_name); anon_vma_name_put(anon_name); out: /* * madvise() returns EAGAIN if kernel resources, such as * slab, are temporarily unavailable. */ if (error == -ENOMEM) error = -EAGAIN; return error; } #ifdef CONFIG_MEMORY_FAILURE /* * Error injection support for memory error handling. */ static int madvise_inject_error(int behavior, unsigned long start, unsigned long end) { unsigned long size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; for (; start < end; start += size) { unsigned long pfn; struct page *page; int ret; ret = get_user_pages_fast(start, 1, 0, &page); if (ret != 1) return ret; pfn = page_to_pfn(page); /* * When soft offlining hugepages, after migrating the page * we dissolve it, therefore in the second loop "page" will * no longer be a compound page. */ size = page_size(compound_head(page)); if (behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); ret = soft_offline_page(pfn, MF_COUNT_INCREASED); } else { pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", pfn, start); ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED); if (ret == -EOPNOTSUPP) ret = 0; } if (ret) return ret; } return 0; } static bool is_memory_failure(int behavior) { switch (behavior) { case MADV_HWPOISON: case MADV_SOFT_OFFLINE: return true; default: return false; } } #else static int madvise_inject_error(int behavior, unsigned long start, unsigned long end) { return 0; } static bool is_memory_failure(int behavior) { return false; } #endif /* CONFIG_MEMORY_FAILURE */ static bool madvise_behavior_valid(int behavior) { switch (behavior) { case MADV_DOFORK: case MADV_DONTFORK: case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: case MADV_COLD: case MADV_PAGEOUT: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: case MADV_COLLAPSE: #endif case MADV_DONTDUMP: case MADV_DODUMP: case MADV_WIPEONFORK: case MADV_KEEPONFORK: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: #endif return true; default: return false; } } /* Can we invoke process_madvise() on a remote mm for the specified behavior? */ static bool process_madvise_remote_valid(int behavior) { switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: case MADV_COLLAPSE: return true; default: return false; } } /* * Walk the vmas in range [start,end), and call the visit function on each one. * The visit function will get start and end parameters that cover the overlap * between the current vma and the original range. Any unmapped regions in the * original range will result in this function returning -ENOMEM while still * calling the visit function on all of the existing vmas in the range. * Must be called with the mmap_lock held for reading or writing. */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, unsigned long end, void *arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, void *arg)) { struct vm_area_struct *vma; struct vm_area_struct *prev; unsigned long tmp; int unmapped_error = 0; /* * If the interval [start,end) covers some unmapped address * ranges, just ignore them, but return -ENOMEM at the end. * - different from the way of handling in mlock etc. */ vma = find_vma_prev(mm, start, &prev); if (vma && start > vma->vm_start) prev = vma; for (;;) { int error; /* Still start < end. */ if (!vma) return -ENOMEM; /* Here start < (end|vma->vm_end). */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; start = vma->vm_start; if (start >= end) break; } /* Here vma->vm_start <= start < (end|vma->vm_end) */ tmp = vma->vm_end; if (end < tmp) tmp = end; /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ error = visit(vma, &prev, start, tmp, arg); if (error) return error; start = tmp; if (prev && start < prev->vm_end) start = prev->vm_end; if (start >= end) break; if (prev) vma = find_vma(mm, prev->vm_end); else /* madvise_remove dropped mmap_lock */ vma = find_vma(mm, start); } return unmapped_error; } #ifdef CONFIG_ANON_VMA_NAME static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, void *anon_name) { int error; /* Only anonymous mappings can be named */ if (vma->vm_file && !vma_is_anon_shmem(vma)) return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, anon_name); /* * madvise() returns EAGAIN if kernel resources, such as * slab, are temporarily unavailable. */ if (error == -ENOMEM) error = -EAGAIN; return error; } int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name) { unsigned long end; unsigned long len; if (start & ~PAGE_MASK) return -EINVAL; len = (len_in + ~PAGE_MASK) & PAGE_MASK; /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; return madvise_walk_vmas(mm, start, end, anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ static int madvise_lock(struct mm_struct *mm, int behavior) { if (is_memory_failure(behavior)) return 0; if (madvise_need_mmap_write(behavior)) { if (mmap_write_lock_killable(mm)) return -EINTR; } else { mmap_read_lock(mm); } return 0; } static void madvise_unlock(struct mm_struct *mm, int behavior) { if (is_memory_failure(behavior)) return; if (madvise_need_mmap_write(behavior)) mmap_write_unlock(mm); else mmap_read_unlock(mm); } static bool madvise_batch_tlb_flush(int behavior) { switch (behavior) { case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: return true; default: return false; } } static void madvise_init_tlb(struct madvise_behavior *madv_behavior, struct mm_struct *mm) { if (madvise_batch_tlb_flush(madv_behavior->behavior)) tlb_gather_mmu(madv_behavior->tlb, mm); } static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) { if (madvise_batch_tlb_flush(madv_behavior->behavior)) tlb_finish_mmu(madv_behavior->tlb); } static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) { size_t len; if (!madvise_behavior_valid(behavior)) return false; if (!PAGE_ALIGNED(start)) return false; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) return false; if (start + len < start) return false; return true; } /* * madvise_should_skip() - Return if the request is invalid or nothing. * @start: Start address of madvise-requested address range. * @len_in: Length of madvise-requested address range. * @behavior: Requested madvise behavor. * @err: Pointer to store an error code from the check. * * If the specified behaviour is invalid or nothing would occur, we skip the * operation. This function returns true in the cases, otherwise false. In * the former case we store an error on @err. */ static bool madvise_should_skip(unsigned long start, size_t len_in, int behavior, int *err) { if (!is_valid_madvise(start, len_in, behavior)) { *err = -EINVAL; return true; } if (start + PAGE_ALIGN(len_in) == start) { *err = 0; return true; } return false; } static bool is_madvise_populate(int behavior) { switch (behavior) { case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: return true; default: return false; } } static int madvise_do_behavior(struct mm_struct *mm, unsigned long start, size_t len_in, struct madvise_behavior *madv_behavior) { int behavior = madv_behavior->behavior; struct blk_plug plug; unsigned long end; int error; if (is_memory_failure(behavior)) return madvise_inject_error(behavior, start, start + len_in); start = untagged_addr_remote(mm, start); end = start + PAGE_ALIGN(len_in); blk_start_plug(&plug); if (is_madvise_populate(behavior)) error = madvise_populate(mm, start, end, behavior); else error = madvise_walk_vmas(mm, start, end, madv_behavior, madvise_vma_behavior); blk_finish_plug(&plug); return error; } /* * The madvise(2) system call. * * Applications can use madvise() to advise the kernel how it should * handle paging I/O in this VM area. The idea is to help the kernel * use appropriate read-ahead and caching techniques. The information * provided is advisory only, and can be safely disregarded by the * kernel without affecting the correct operation of the application. * * behavior values: * MADV_NORMAL - the default behavior is to read clusters. This * results in some read-ahead and read-behind. * MADV_RANDOM - the system should read the minimum amount of data * on any access, since it is unlikely that the appli- * cation will need more than what it asks for. * MADV_SEQUENTIAL - pages in the given range will probably be accessed * once, so they can be aggressively read ahead, and * can be freed soon after they are accessed. * MADV_WILLNEED - the application is notifying the system to read * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. * MADV_FREE - the application marks pages in the given range as lazy free, * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. * MADV_WIPEONFORK - present the child process with zero-filled memory in this * range after a fork. * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK * MADV_HWPOISON - trigger memory error handler as if the given memory range * were corrupted by unrecoverable hardware memory failure. * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * MADV_HUGEPAGE - the application wants to back the given range by transparent * huge pages in the future. Existing pages might be coalesced and * new pages might be allocated as THP. * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * MADV_COLD - the application is not expected to use this memory soon, * deactivate pages in this range so that they can be reclaimed * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. * MADV_POPULATE_READ - populate (prefault) page tables readable by * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by * triggering write faults if required * * return values: * zero - success * -EINVAL - start + len < 0, start is not page-aligned, * "behavior" is not a valid value, or application * is attempting to release locked or shared pages, * or the specified address range includes file, Huge TLB, * MAP_SHARED or VMPFNMAP range. * -ENOMEM - addresses in the specified range are not currently * mapped, or are outside the AS of the process. * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. * -EPERM - memory is sealed. */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { int error; struct mmu_gather tlb; struct madvise_behavior madv_behavior = { .behavior = behavior, .tlb = &tlb, }; if (madvise_should_skip(start, len_in, behavior, &error)) return error; error = madvise_lock(mm, behavior); if (error) return error; madvise_init_tlb(&madv_behavior, mm); error = madvise_do_behavior(mm, start, len_in, &madv_behavior); madvise_finish_tlb(&madv_behavior); madvise_unlock(mm, behavior); return error; } SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { return do_madvise(current->mm, start, len_in, behavior); } /* Perform an madvise operation over a vector of addresses and lengths. */ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, int behavior) { ssize_t ret = 0; size_t total_len; struct mmu_gather tlb; struct madvise_behavior madv_behavior = { .behavior = behavior, .tlb = &tlb, }; total_len = iov_iter_count(iter); ret = madvise_lock(mm, behavior); if (ret) return ret; madvise_init_tlb(&madv_behavior, mm); while (iov_iter_count(iter)) { unsigned long start = (unsigned long)iter_iov_addr(iter); size_t len_in = iter_iov_len(iter); int error; if (madvise_should_skip(start, len_in, behavior, &error)) ret = error; else ret = madvise_do_behavior(mm, start, len_in, &madv_behavior); /* * An madvise operation is attempting to restart the syscall, * but we cannot proceed as it would not be correct to repeat * the operation in aggregate, and would be surprising to the * user. * * We drop and reacquire locks so it is safe to just loop and * try again. We check for fatal signals in case we need exit * early anyway. */ if (ret == -ERESTARTNOINTR) { if (fatal_signal_pending(current)) { ret = -EINTR; break; } /* Drop and reacquire lock to unwind race. */ madvise_finish_tlb(&madv_behavior); madvise_unlock(mm, behavior); ret = madvise_lock(mm, behavior); if (ret) goto out; madvise_init_tlb(&madv_behavior, mm); continue; } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); } madvise_finish_tlb(&madv_behavior); madvise_unlock(mm, behavior); out: ret = (total_len - iov_iter_count(iter)) ? : ret; return ret; } SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { ssize_t ret; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; struct task_struct *task; struct mm_struct *mm; unsigned int f_flags; if (flags != 0) { ret = -EINVAL; goto out; } ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out; task = pidfd_get_task(pidfd, &f_flags); if (IS_ERR(task)) { ret = PTR_ERR(task); goto free_iov; } /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) { ret = PTR_ERR(mm); goto release_task; } /* * We need only perform this check if we are attempting to manipulate a * remote process's address space. */ if (mm != current->mm && !process_madvise_remote_valid(behavior)) { ret = -EINVAL; goto release_mm; } /* * Require CAP_SYS_NICE for influencing process performance. Note that * only non-destructive hints are currently supported for remote * processes. */ if (mm != current->mm && !capable(CAP_SYS_NICE)) { ret = -EPERM; goto release_mm; } ret = vector_madvise(mm, &iter, behavior); release_mm: mmput(mm); release_task: put_task_struct(task); free_iov: kfree(iov); out: return ret; }
1198 1198 26 274 1198 1198 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_CPUMASK_H #define __LINUX_CPUMASK_H /* * Cpumasks provide a bitmap suitable for representing the * set of CPUs in a system, one bit position per CPU number. In general, * only nr_cpu_ids (<= NR_CPUS) bits are valid. */ #include <linux/cleanup.h> #include <linux/kernel.h> #include <linux/bitmap.h> #include <linux/cpumask_types.h> #include <linux/atomic.h> #include <linux/bug.h> #include <linux/gfp_types.h> #include <linux/numa.h> /** * cpumask_pr_args - printf args to output a cpumask * @maskp: cpumask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a cpumask. */ #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) #define nr_cpu_ids ((unsigned int)NR_CPUS) #else extern unsigned int nr_cpu_ids; #endif static __always_inline void set_nr_cpu_ids(unsigned int nr) { #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) WARN_ON(nr != nr_cpu_ids); #else nr_cpu_ids = nr; #endif } /* * We have several different "preferred sizes" for the cpumask * operations, depending on operation. * * For example, the bitmap scanning and operating operations have * optimized routines that work for the single-word case, but only when * the size is constant. So if NR_CPUS fits in one single word, we are * better off using that small constant, in order to trigger the * optimized bit finding. That is 'small_cpumask_size'. * * The clearing and copying operations will similarly perform better * with a constant size, but we limit that size arbitrarily to four * words. We call this 'large_cpumask_size'. * * Finally, some operations just want the exact limit, either because * they set bits or just don't have any faster fixed-sized versions. We * call this just 'nr_cpumask_bits'. * * Note that these optional constants are always guaranteed to be at * least as big as 'nr_cpu_ids' itself is, and all our cpumask * allocations are at least that size (see cpumask_size()). The * optimization comes from being able to potentially use a compile-time * constant instead of a run-time generated exact number of CPUs. */ #if NR_CPUS <= BITS_PER_LONG #define small_cpumask_bits ((unsigned int)NR_CPUS) #define large_cpumask_bits ((unsigned int)NR_CPUS) #elif NR_CPUS <= 4*BITS_PER_LONG #define small_cpumask_bits nr_cpu_ids #define large_cpumask_bits ((unsigned int)NR_CPUS) #else #define small_cpumask_bits nr_cpu_ids #define large_cpumask_bits nr_cpu_ids #endif #define nr_cpumask_bits nr_cpu_ids /* * The following particular system cpumasks and operations manage * possible, present, active and online cpus. * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * * The cpu_possible_mask is fixed at boot time, as the set of CPU IDs * that it is possible might ever be plugged in at anytime during the * life of that system boot. The cpu_present_mask is dynamic(*), * representing which CPUs are currently plugged in. And * cpu_online_mask is the dynamic subset of cpu_present_mask, * indicating those CPUs available for scheduling. * * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise * cpu_present_mask is just a copy of cpu_possible_mask. * * (*) Well, cpu_present_mask is dynamic in the hotplug case. If not * hotplug, it's a copy of cpu_possible_mask, hence fixed at boot. * * Subtleties: * 1) UP ARCHes (NR_CPUS == 1, CONFIG_SMP not defined) hardcode * assumption that their single CPU is online. The UP * cpu_{online,possible,present}_masks are placebos. Changing them * will have no useful affect on the following num_*_cpus() * and cpu_*() macros in the UP case. This ugliness is a UP * optimization - don't waste any instructions or memory references * asking if you're online or how many CPUs there are if there is * only one CPU. */ extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_enabled_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; extern struct cpumask __cpu_dying_mask; #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) #define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) #define cpu_enabled_mask ((const struct cpumask *)&__cpu_enabled_mask) #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) extern atomic_t __num_online_cpus; extern cpumask_t cpus_booted_once_mask; static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ } /* verify cpu argument to cpumask_* operators */ static __always_inline unsigned int cpumask_check(unsigned int cpu) { cpu_max_bits_warn(cpu, small_cpumask_bits); return cpu; } /** * cpumask_first - get the first cpu in a cpumask * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_first(const struct cpumask *srcp) { return find_first_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_first_zero - get the first unset cpu in a cpumask * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if all cpus are set. */ static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp) { return find_first_zero_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @srcp1: the first input * @srcp2: the second input * * Return: >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). */ static __always_inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_first_andnot - return the first cpu from *srcp1 & ~*srcp2 * @srcp1: the first input * @srcp2: the second input * * Return: >= nr_cpu_ids if no such cpu found. */ static __always_inline unsigned int cpumask_first_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_first_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_first_and_and - return the first cpu from *srcp1 & *srcp2 & *srcp3 * @srcp1: the first input * @srcp2: the second input * @srcp3: the third input * * Return: >= nr_cpu_ids if no cpus set in all. */ static __always_inline unsigned int cpumask_first_and_and(const struct cpumask *srcp1, const struct cpumask *srcp2, const struct cpumask *srcp3) { return find_first_and_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), cpumask_bits(srcp3), small_cpumask_bits); } /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer * * Return: >= nr_cpumask_bits if no CPUs set. */ static __always_inline unsigned int cpumask_last(const struct cpumask *srcp) { return find_last_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_next - get the next cpu in a cpumask * @n: the cpu prior to the place to search (i.e. return will be > @n) * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set. */ static __always_inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_bit(cpumask_bits(srcp), small_cpumask_bits, n + 1); } /** * cpumask_next_zero - get the next unset cpu in a cpumask * @n: the cpu prior to the place to search (i.e. return will be > @n) * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no further cpus unset. */ static __always_inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_zero_bit(cpumask_bits(srcp), small_cpumask_bits, n+1); } #if NR_CPUS == 1 /* Uniprocessor: there is only one valid CPU */ static __always_inline unsigned int cpumask_local_spread(unsigned int i, int node) { return 0; } static __always_inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { return cpumask_first_and(src1p, src2p); } static __always_inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) { return cpumask_first(srcp); } #else unsigned int cpumask_local_spread(unsigned int i, int node); unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); unsigned int cpumask_any_distribute(const struct cpumask *srcp); #endif /* NR_CPUS */ /** * cpumask_next_and - get the next cpu in *src1p & *src2p * @n: the cpu prior to the place to search (i.e. return will be > @n) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set in both. */ static __always_inline unsigned int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_andnot - get the next cpu in *src1p & ~*src2p * @n: the cpu prior to the place to search (i.e. return will be > @n) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set in both. */ static __always_inline unsigned int cpumask_next_andnot(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_andnot_bit(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting from * @n+1. If nothing found, wrap around and start from * the beginning * @n: the cpu prior to the place to search (i.e. search starts from @n+1) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src1p & @src2p is empty. */ static __always_inline unsigned int cpumask_next_and_wrap(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_wrap - get the next cpu in *src, starting from @n+1. If nothing * found, wrap around and start from the beginning * @n: the cpu prior to the place to search (i.e. search starts from @n+1) * @src: cpumask pointer * * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src is empty. */ static __always_inline unsigned int cpumask_next_wrap(int n, const struct cpumask *src) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_bit_wrap(cpumask_bits(src), small_cpumask_bits, n + 1); } /** * for_each_cpu - iterate over every cpu in a mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu(cpu, mask) \ for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits) /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * @start: the start location * * The implementation does not assume any bit in @mask is set (including @start). * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_wrap(cpu, mask, start) \ for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start) /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_and(cpu, mask1, mask2) \ for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding * those present in another. * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_andnot(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_andnot(cpu, mask1, mask2) \ for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_or - iterate over every cpu present in either mask * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_or(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_or(cpu, mask1, mask2) \ for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_from - iterate over CPUs present in @mask, from @cpu to the end of @mask. * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_from(cpu, mask) \ for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits) /** * cpumask_any_but - return an arbitrary cpu in a cpumask, but not this one. * @mask: the cpumask to search * @cpu: the cpu to ignore. * * Often used to find any cpu but smp_processor_id() in a mask. * If @cpu == -1, the function is equivalent to cpumask_any(). * Return: >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_but(const struct cpumask *mask, int cpu) { unsigned int i; /* -1 is a legal arg here. */ if (cpu != -1) cpumask_check(cpu); for_each_cpu(i, mask) if (i != cpu) break; return i; } /** * cpumask_any_and_but - pick an arbitrary cpu from *mask1 & *mask2, but not this one. * @mask1: the first input cpumask * @mask2: the second input cpumask * @cpu: the cpu to ignore * * If @cpu == -1, the function is equivalent to cpumask_any_and(). * Returns >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_and_but(const struct cpumask *mask1, const struct cpumask *mask2, int cpu) { unsigned int i; /* -1 is a legal arg here. */ if (cpu != -1) cpumask_check(cpu); i = cpumask_first_and(mask1, mask2); if (i != cpu) return i; return cpumask_next_and(cpu, mask1, mask2); } /** * cpumask_any_andnot_but - pick an arbitrary cpu from *mask1 & ~*mask2, but not this one. * @mask1: the first input cpumask * @mask2: the second input cpumask * @cpu: the cpu to ignore * * If @cpu == -1, the function returns the first matching cpu. * Returns >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_andnot_but(const struct cpumask *mask1, const struct cpumask *mask2, int cpu) { unsigned int i; /* -1 is a legal arg here. */ if (cpu != -1) cpumask_check(cpu); i = cpumask_first_andnot(mask1, mask2); if (i != cpu) return i; return cpumask_next_andnot(cpu, mask1, mask2); } /** * cpumask_nth - get the Nth cpu in a cpumask * @srcp: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) { return find_nth_bit(cpumask_bits(srcp), small_cpumask_bits, cpumask_check(cpu)); } /** * cpumask_nth_and - get the Nth cpu in 2 cpumasks * @srcp1: the cpumask pointer * @srcp2: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits, cpumask_check(cpu)); } /** * cpumask_nth_andnot - get the Nth cpu set in 1st cpumask, and clear in 2nd. * @srcp1: the cpumask pointer * @srcp2: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_nth_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits, cpumask_check(cpu)); } /** * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd. * @srcp1: the cpumask pointer * @srcp2: the cpumask pointer * @srcp3: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2, const struct cpumask *srcp3) { return find_nth_and_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), cpumask_bits(srcp3), small_cpumask_bits, cpumask_check(cpu)); } #define CPU_BITS_NONE \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } #define CPU_BITS_CPU0 \ { \ [0] = 1UL \ } /** * cpumask_set_cpu - set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static __always_inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static __always_inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static __always_inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) { clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) { __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Return: true if @cpu is set in @cpumask, else returns false */ static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask) { return test_bit(cpumask_check(cpu), cpumask_bits((cpumask))); } /** * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * test_and_set_bit wrapper for cpumasks. * * Return: true if @cpu is set in old bitmap of @cpumask, else returns false */ static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) { return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * test_and_clear_bit wrapper for cpumasks. * * Return: true if @cpu is set in old bitmap of @cpumask, else returns false */ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask) { return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static __always_inline void cpumask_setall(struct cpumask *dstp) { if (small_const_nbits(small_cpumask_bits)) { cpumask_bits(dstp)[0] = BITMAP_LAST_WORD_MASK(nr_cpumask_bits); return; } bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static __always_inline void cpumask_clear(struct cpumask *dstp) { bitmap_zero(cpumask_bits(dstp), large_cpumask_bits); } /** * cpumask_and - *dstp = *src1p & *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * Return: false if *@dstp is empty, else returns true */ static __always_inline bool cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_or - *dstp = *src1p | *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static __always_inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static __always_inline void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_andnot - *dstp = *src1p & ~*src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * Return: false if *@dstp is empty, else returns true */ static __always_inline bool cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_equal - *src1p == *src2p * @src1p: the first input * @src2p: the second input * * Return: true if the cpumasks are equal, false if not */ static __always_inline bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_or_equal - *src1p | *src2p == *src3p * @src1p: the first input * @src2p: the second input * @src3p: the third input * * Return: true if first cpumask ORed with second cpumask == third cpumask, * otherwise false */ static __always_inline bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p, const struct cpumask *src3p) { return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p), cpumask_bits(src3p), small_cpumask_bits); } /** * cpumask_intersects - (*src1p & *src2p) != 0 * @src1p: the first input * @src2p: the second input * * Return: true if first cpumask ANDed with second cpumask is non-empty, * otherwise false */ static __always_inline bool cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_subset - (*src1p & ~*src2p) == 0 * @src1p: the first input * @src2p: the second input * * Return: true if *@src1p is a subset of *@src2p, else returns false */ static __always_inline bool cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_empty - *srcp == 0 * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. * * Return: true if srcp is empty (has no bits set), else false */ static __always_inline bool cpumask_empty(const struct cpumask *srcp) { return bitmap_empty(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_full - *srcp == 0xFFFFFFFF... * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. * * Return: true if srcp is full (has all bits set), else false */ static __always_inline bool cpumask_full(const struct cpumask *srcp) { return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_weight - Count of bits in *srcp * @srcp: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in *srcp */ static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp) { return bitmap_weight(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_weight_and - Count of bits in (*srcp1 & *srcp2) * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in both *srcp1 and *srcp2 */ static __always_inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_weight_andnot - Count of bits in (*srcp1 & ~*srcp2) * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in both *srcp1 and *srcp2 */ static __always_inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_andnot(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_shift_right - *dstp = *srcp >> n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static __always_inline void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, small_cpumask_bits); } /** * cpumask_shift_left - *dstp = *srcp << n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static __always_inline void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, nr_cpumask_bits); } /** * cpumask_copy - *dstp = *srcp * @dstp: the result * @srcp: the input cpumask */ static __always_inline void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), large_cpumask_bits); } /** * cpumask_any - pick an arbitrary cpu from *srcp * @srcp: the input cpumask * * Return: >= nr_cpu_ids if no cpus set. */ #define cpumask_any(srcp) cpumask_first(srcp) /** * cpumask_any_and - pick an arbitrary cpu from *mask1 & *mask2 * @mask1: the first input cpumask * @mask2: the second input cpumask * * Return: >= nr_cpu_ids if no cpus set. */ #define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2)) /** * cpumask_of - the cpumask containing just a given cpu * @cpu: the cpu (<= nr_cpu_ids) */ #define cpumask_of(cpu) (get_cpu_mask(cpu)) /** * cpumask_parse_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parselist_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parse - extract a cpumask from a string * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp) { return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpulist_parse - extract a cpumask from a user string of ranges * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp) { return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_size - calculate size to allocate for a 'struct cpumask' in bytes * * Return: size to allocate for a &struct cpumask in bytes */ static __always_inline unsigned int cpumask_size(void) { return bitmap_size(large_cpumask_bits); } #ifdef CONFIG_CPUMASK_OFFSTACK #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); } /** * alloc_cpumask_var - allocate a struct cpumask * @mask: pointer to cpumask_var_t where the cpumask is returned * @flags: GFP_ flags * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop returning a constant 1 (in <linux/cpumask.h>). * * See alloc_cpumask_var_node. * * Return: %true if allocation succeeded, %false if not */ static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE); } static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var(mask, flags | __GFP_ZERO); } void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); static __always_inline bool cpumask_available(cpumask_var_t mask) { return mask != NULL; } #else #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; } static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return true; } static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { cpumask_clear(*mask); return true; } static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { cpumask_clear(*mask); return true; } static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } static __always_inline void free_cpumask_var(cpumask_var_t mask) { } static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } static __always_inline bool cpumask_available(cpumask_var_t mask) { return true; } #endif /* CONFIG_CPUMASK_OFFSTACK */ DEFINE_FREE(free_cpumask_var, struct cpumask *, if (_T) free_cpumask_var(_T)); /* It's common to want to use cpu_all_mask in struct member initializers, * so it has to refer to an address rather than a pointer. */ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define cpu_all_mask to_cpumask(cpu_all_bits) /* First bits of cpu_bit_bitmap are in fact unset. */ #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0]) #if NR_CPUS == 1 /* Uniprocessor: the possible/online/present masks are always "1" */ #define for_each_possible_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_present_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_possible_cpu_wrap(cpu, start) \ for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu_wrap(cpu, start) \ for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #else #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_enabled_cpu(cpu) for_each_cpu((cpu), cpu_enabled_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) #define for_each_possible_cpu_wrap(cpu, start) \ for_each_cpu_wrap((cpu), cpu_possible_mask, (start)) #define for_each_online_cpu_wrap(cpu, start) \ for_each_cpu_wrap((cpu), cpu_online_mask, (start)) #endif /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); #define assign_cpu(cpu, mask, val) \ assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) #define __assign_cpu(cpu, mask, val) \ __assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) #define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) #define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) void set_cpu_online(unsigned int cpu, bool online); /** * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap * * There are a few places where cpumask_var_t isn't appropriate and * static cpumasks must be used (eg. very early boot), yet we don't * expose the definition of 'struct cpumask'. * * This does the conversion, and can be used as a constant initializer. */ #define to_cpumask(bitmap) \ ((struct cpumask *)(1 ? (bitmap) \ : (void *)sizeof(__check_is_bitmap(bitmap)))) static __always_inline int __check_is_bitmap(const unsigned long *bitmap) { return 1; } /* * Special-case data structure for "single bit set only" constant CPU masks. * * We pre-generate all the 64 (or 32) possible bit positions, with enough * padding to the left and the right, and return the constant pointer * appropriately offset. */ extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return to_cpumask(p); } #if NR_CPUS > 1 /** * num_online_cpus() - Read the number of online CPUs * * Despite the fact that __num_online_cpus is of type atomic_t, this * interface gives only a momentary snapshot and is not protected against * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. * * Return: momentary snapshot of the number of online CPUs */ static __always_inline unsigned int num_online_cpus(void) { return raw_atomic_read(&__num_online_cpus); } #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_enabled_cpus() cpumask_weight(cpu_enabled_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) static __always_inline bool cpu_online(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_online_mask); } static __always_inline bool cpu_enabled(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_enabled_mask); } static __always_inline bool cpu_possible(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_possible_mask); } static __always_inline bool cpu_present(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_present_mask); } static __always_inline bool cpu_active(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_active_mask); } static __always_inline bool cpu_dying(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_dying_mask); } #else #define num_online_cpus() 1U #define num_possible_cpus() 1U #define num_enabled_cpus() 1U #define num_present_cpus() 1U #define num_active_cpus() 1U static __always_inline bool cpu_online(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_possible(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_enabled(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_present(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_active(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_dying(unsigned int cpu) { return false; } #endif /* NR_CPUS > 1 */ #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) #if NR_CPUS <= BITS_PER_LONG #define CPU_BITS_ALL \ { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #else /* NR_CPUS > BITS_PER_LONG */ #define CPU_BITS_ALL \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #endif /* NR_CPUS > BITS_PER_LONG */ /** * cpumap_print_to_pagebuf - copies the cpumask into the buffer either * as comma-separated list of cpus or hex values of cpumask * @list: indicates whether the cpumap must be list * @mask: the cpumask to copy * @buf: the buffer to copy into * * Return: the length of the (null-terminated) @buf string, zero if * nothing is copied. */ static __always_inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), nr_cpu_ids); } /** * cpumap_print_bitmask_to_buf - copies the cpumask into the buffer as * hex values of cpumask * * @buf: the buffer to copy into * @mask: the cpumask to copy * @off: in the string from which we are copying, we copy to @buf * @count: the maximum number of bytes to print * * The function prints the cpumask into the buffer as hex values of * cpumask; Typically used by bin_attribute to export cpumask bitmask * ABI. * * Return: the length of how many bytes have been copied, excluding * terminating '\0'. */ static __always_inline ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; } /** * cpumap_print_list_to_buf - copies the cpumask into the buffer as * comma-separated list of cpus * @buf: the buffer to copy into * @mask: the cpumask to copy * @off: in the string from which we are copying, we copy to @buf * @count: the maximum number of bytes to print * * Everything is same with the above cpumap_print_bitmask_to_buf() * except the print format. * * Return: the length of how many bytes have been copied, excluding * terminating '\0'. */ static __always_inline ssize_t cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_list_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; } #if NR_CPUS <= BITS_PER_LONG #define CPU_MASK_ALL \ (cpumask_t) { { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #else #define CPU_MASK_ALL \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #endif /* NR_CPUS > BITS_PER_LONG */ #define CPU_MASK_NONE \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } } #define CPU_MASK_CPU0 \ (cpumask_t) { { \ [0] = 1UL \ } } /* * Provide a valid theoretical max size for cpumap and cpulist sysfs files * to avoid breaking userspace which may allocate a buffer based on the size * reported by e.g. fstat. * * for cpumap NR_CPUS * 9/32 - 1 should be an exact length. * * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up * to 2 orders of magnitude larger than 8192. And then we divide by 2 to * cover a worst-case of every other cpu being on one of two nodes for a * very large NR_CPUS. * * Use PAGE_SIZE as a minimum for smaller configurations while avoiding * unsigned comparison to -1. */ #define CPUMAP_FILE_MAX_BYTES (((NR_CPUS * 9)/32 > PAGE_SIZE) \ ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE) #define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE) #endif /* __LINUX_CPUMASK_H */
924 930 925 924 930 931 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/memblock.h> #include <linux/page_ext.h> #include <linux/memory.h> #include <linux/vmalloc.h> #include <linux/kmemleak.h> #include <linux/page_owner.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate.h> #include <linux/pgalloc_tag.h> /* * struct page extension * * This is the feature to manage memory for extended data per page. * * Until now, we must modify struct page itself to store extra data per page. * This requires rebuilding the kernel and it is really time consuming process. * And, sometimes, rebuild is impossible due to third party module dependency. * At last, enlarging struct page could cause un-wanted system behaviour change. * * This feature is intended to overcome above mentioned problems. This feature * allocates memory for extended data per page in certain place rather than * the struct page itself. This memory can be accessed by the accessor * functions provided by this code. During the boot process, it checks whether * allocation of huge chunk of memory is needed or not. If not, it avoids * allocating memory at all. With this advantage, we can include this feature * into the kernel in default and can avoid rebuild and solve related problems. * * To help these things to work well, there are two callbacks for clients. One * is the need callback which is mandatory if user wants to avoid useless * memory allocation at boot-time. The other is optional, init callback, which * is used to do proper initialization after memory is allocated. * * The need callback is used to decide whether extended memory allocation is * needed or not. Sometimes users want to deactivate some features in this * boot and extra memory would be unnecessary. In this case, to avoid * allocating huge chunk of memory, each clients represent their need of * extra memory through the need callback. If one of the need callbacks * returns true, it means that someone needs extra memory so that * page extension core should allocates memory for page extension. If * none of need callbacks return true, memory isn't needed at all in this boot * and page extension core can skip to allocate memory. As result, * none of memory is wasted. * * When need callback returns true, page_ext checks if there is a request for * extra memory through size in struct page_ext_operations. If it is non-zero, * extra space is allocated for each page_ext entry and offset is returned to * user through offset in struct page_ext_operations. * * The init callback is used to do proper initialization after page extension * is completely initialized. In sparse memory system, extra memory is * allocated some time later than memmap is allocated. In other words, lifetime * of memory for page extension isn't same with memmap for struct page. * Therefore, clients can't store extra data until page extension is * initialized, even if pages are allocated and used freely. This could * cause inadequate state of extra data per page, so, to prevent it, client * can utilize this callback to initialize the state of it correctly. */ #ifdef CONFIG_SPARSEMEM #define PAGE_EXT_INVALID (0x1) #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) static bool need_page_idle(void) { return true; } static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, .need_shared_flags = true, }; #endif static struct page_ext_operations *page_ext_ops[] __initdata = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif #ifdef CONFIG_MEM_ALLOC_PROFILING &page_alloc_tagging_ops, #endif #ifdef CONFIG_PAGE_TABLE_CHECK &page_table_check_ops, #endif }; unsigned long page_ext_size; static unsigned long total_usage; #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG /* * To ensure correct allocation tagging for pages, page_ext should be available * before the first page allocation. Otherwise early task stacks will be * allocated before page_ext initialization and missing tags will be flagged. */ bool early_page_ext __meminitdata = true; #else bool early_page_ext __meminitdata; #endif static int __init setup_early_page_ext(char *str) { early_page_ext = true; return 0; } early_param("early_page_ext", setup_early_page_ext); static bool __init invoke_need_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); bool need = false; for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need()) { if (page_ext_ops[i]->need_shared_flags) { page_ext_size = sizeof(struct page_ext); break; } } } for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need()) { page_ext_ops[i]->offset = page_ext_size; page_ext_size += page_ext_ops[i]->size; need = true; } } return need; } static void __init invoke_init_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); for (i = 0; i < entries; i++) { if (page_ext_ops[i]->init) page_ext_ops[i]->init(); } } static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } #ifndef CONFIG_SPARSEMEM void __init page_ext_init_flatmem_late(void) { invoke_init_callbacks(); } void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { pgdat->node_page_ext = NULL; } static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; struct page_ext *base; WARN_ON_ONCE(!rcu_read_lock_held()); base = NODE_DATA(page_to_nid(page))->node_page_ext; /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ if (unlikely(!base)) return NULL; index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); return get_entry(base, index); } static int __init alloc_node_page_ext(int nid) { struct page_ext *base; unsigned long table_size; unsigned long nr_pages; nr_pages = NODE_DATA(nid)->node_spanned_pages; if (!nr_pages) return 0; /* * Need extra space if node range is not aligned with * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm * checks buddy's status, range could be out of exact node range. */ if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) nr_pages += MAX_ORDER_NR_PAGES; table_size = page_ext_size * nr_pages; base = memblock_alloc_try_nid( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_ext = base; total_usage += table_size; memmap_boot_pages_add(DIV_ROUND_UP(table_size, PAGE_SIZE)); return 0; } void __init page_ext_init_flatmem(void) { int nid, fail; if (!invoke_need_callbacks()) return; for_each_online_node(nid) { fail = alloc_node_page_ext(nid); if (fail) goto fail; } pr_info("allocated %ld bytes of page_ext\n", total_usage); return; fail: pr_crit("allocation of page_ext failed.\n"); panic("Out of memory"); } #else /* CONFIG_SPARSEMEM */ static bool page_ext_invalid(struct page_ext *page_ext) { return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID); } static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); struct page_ext *page_ext = READ_ONCE(section->page_ext); WARN_ON_ONCE(!rcu_read_lock_held()); /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ if (page_ext_invalid(page_ext)) return NULL; return get_entry(page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) { gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; void *addr = NULL; addr = alloc_pages_exact_nid(nid, size, flags); if (addr) kmemleak_alloc(addr, size, 1, flags); else addr = vzalloc_node(size, nid); if (addr) memmap_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); return addr; } static int __meminit init_section_page_ext(unsigned long pfn, int nid) { struct mem_section *section; struct page_ext *base; unsigned long table_size; section = __pfn_to_section(pfn); if (section->page_ext) return 0; table_size = page_ext_size * PAGES_PER_SECTION; base = alloc_page_ext(table_size, nid); /* * The value stored in section->page_ext is (base - pfn) * and it does not point to the memory block allocated above, * causing kmemleak false positives. */ kmemleak_not_leak(base); if (!base) { pr_err("page ext allocation failure\n"); return -ENOMEM; } /* * The passed "pfn" may not be aligned to SECTION. For the calculation * we need to apply a mask. */ pfn &= PAGE_SECTION_MASK; section->page_ext = (void *)base - page_ext_size * pfn; total_usage += table_size; return 0; } static void free_page_ext(void *addr) { size_t table_size; struct page *page; table_size = page_ext_size * PAGES_PER_SECTION; memmap_pages_add(-1L * (DIV_ROUND_UP(table_size, PAGE_SIZE))); if (is_vmalloc_addr(addr)) { vfree(addr); } else { page = virt_to_page(addr); BUG_ON(PageReserved(page)); kmemleak_free(addr); free_pages_exact(addr, table_size); } } static void __free_page_ext(unsigned long pfn) { struct mem_section *ms; struct page_ext *base; ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; base = READ_ONCE(ms->page_ext); /* * page_ext here can be valid while doing the roll back * operation in online_page_ext(). */ if (page_ext_invalid(base)) base = (void *)base - PAGE_EXT_INVALID; WRITE_ONCE(ms->page_ext, NULL); base = get_entry(base, pfn); free_page_ext(base); } static void __invalidate_page_ext(unsigned long pfn) { struct mem_section *ms; void *val; ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; val = (void *)ms->page_ext + PAGE_EXT_INVALID; WRITE_ONCE(ms->page_ext, val); } static int __meminit online_page_ext(unsigned long start_pfn, unsigned long nr_pages, int nid) { unsigned long start, end, pfn; int fail = 0; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); if (nid == NUMA_NO_NODE) { /* * In this case, "nid" already exists and contains valid memory. * "start_pfn" passed to us is a pfn which is an arg for * online__pages(), and start_pfn should exist. */ nid = pfn_to_nid(start_pfn); VM_BUG_ON(!node_online(nid)); } for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) fail = init_section_page_ext(pfn, nid); if (!fail) return 0; /* rollback */ end = pfn - PAGES_PER_SECTION; for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); return -ENOMEM; } static void __meminit offline_page_ext(unsigned long start_pfn, unsigned long nr_pages) { unsigned long start, end, pfn; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); /* * Freeing of page_ext is done in 3 steps to avoid * use-after-free of it: * 1) Traverse all the sections and mark their page_ext * as invalid. * 2) Wait for all the existing users of page_ext who * started before invalidation to finish. * 3) Free the page_ext. */ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __invalidate_page_ext(pfn); synchronize_rcu(); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); } static int __meminit page_ext_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; int ret = 0; switch (action) { case MEM_GOING_ONLINE: ret = online_page_ext(mn->start_pfn, mn->nr_pages, mn->status_change_nid); break; case MEM_OFFLINE: offline_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_CANCEL_ONLINE: offline_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_GOING_OFFLINE: break; case MEM_ONLINE: case MEM_CANCEL_OFFLINE: break; } return notifier_from_errno(ret); } void __init page_ext_init(void) { unsigned long pfn; int nid; if (!invoke_need_callbacks()) return; for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; start_pfn = node_start_pfn(nid); end_pfn = node_end_pfn(nid); /* * start_pfn and end_pfn may not be aligned to SECTION and the * page->flags of out of node pages are not initialized. So we * scan [start_pfn, the biggest section's pfn < end_pfn) here. */ for (pfn = start_pfn; pfn < end_pfn; pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { if (!pfn_valid(pfn)) continue; /* * Nodes's pfns can be overlapping. * We know some arch can have a nodes layout such as * -------------pfn--------------> * N0 | N1 | N2 | N0 | N1 | N2|.... */ if (pfn_to_nid(pfn) != nid) continue; if (init_section_page_ext(pfn, nid)) goto oom; cond_resched(); } } hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI); pr_info("allocated %ld bytes of page_ext\n", total_usage); invoke_init_callbacks(); return; oom: panic("Out of memory"); } void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { } #endif /** * page_ext_lookup() - Lookup a page extension for a PFN. * @pfn: PFN of the page we're interested in. * * Must be called with RCU read lock taken and @pfn must be valid. * * Return: NULL if no page_ext exists for this page. */ struct page_ext *page_ext_lookup(unsigned long pfn) { return lookup_page_ext(pfn_to_page(pfn)); } /** * page_ext_get() - Get the extended information for a page. * @page: The page we're interested in. * * Ensures that the page_ext will remain valid until page_ext_put() * is called. * * Return: NULL if no page_ext exists for this page. * Context: Any context. Caller may not sleep until they have called * page_ext_put(). */ struct page_ext *page_ext_get(const struct page *page) { struct page_ext *page_ext; rcu_read_lock(); page_ext = lookup_page_ext(page); if (!page_ext) { rcu_read_unlock(); return NULL; } return page_ext; } /** * page_ext_put() - Working with page extended information is done. * @page_ext: Page extended information received from page_ext_get(). * * The page extended information of the page may not be valid after this * function is called. * * Return: None. * Context: Any context with corresponding page_ext_get() is called. */ void page_ext_put(struct page_ext *page_ext) { if (unlikely(!page_ext)) return; rcu_read_unlock(); }
2 5 2 3 16 4 2 5 2 3 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2020 Arm Ltd. #include <linux/arm-smccc.h> #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <kvm/arm_hypercalls.h> #define ARM_SMCCC_TRNG_VERSION_1_0 0x10000UL /* Those values are deliberately separate from the generic SMCCC definitions. */ #define TRNG_SUCCESS 0UL #define TRNG_NOT_SUPPORTED ((unsigned long)-1) #define TRNG_INVALID_PARAMETER ((unsigned long)-2) #define TRNG_NO_ENTROPY ((unsigned long)-3) #define TRNG_MAX_BITS64 192 static const uuid_t arm_smc_trng_uuid __aligned(4) = UUID_INIT( 0x0d21e000, 0x4384, 0x11eb, 0x80, 0x70, 0x52, 0x44, 0x55, 0x4e, 0x5a, 0x4c); static int kvm_trng_do_rnd(struct kvm_vcpu *vcpu, int size) { DECLARE_BITMAP(bits, TRNG_MAX_BITS64); u32 num_bits = smccc_get_arg1(vcpu); int i; if (num_bits > 3 * size) { smccc_set_retval(vcpu, TRNG_INVALID_PARAMETER, 0, 0, 0); return 1; } /* get as many bits as we need to fulfil the request */ for (i = 0; i < DIV_ROUND_UP(num_bits, BITS_PER_LONG); i++) bits[i] = get_random_long(); bitmap_clear(bits, num_bits, TRNG_MAX_BITS64 - num_bits); if (size == 32) smccc_set_retval(vcpu, TRNG_SUCCESS, lower_32_bits(bits[1]), upper_32_bits(bits[0]), lower_32_bits(bits[0])); else smccc_set_retval(vcpu, TRNG_SUCCESS, bits[2], bits[1], bits[0]); memzero_explicit(bits, sizeof(bits)); return 1; } int kvm_trng_call(struct kvm_vcpu *vcpu) { const __le32 *u = (__le32 *)arm_smc_trng_uuid.b; u32 func_id = smccc_get_function(vcpu); unsigned long val = TRNG_NOT_SUPPORTED; int size = 64; switch (func_id) { case ARM_SMCCC_TRNG_VERSION: val = ARM_SMCCC_TRNG_VERSION_1_0; break; case ARM_SMCCC_TRNG_FEATURES: switch (smccc_get_arg1(vcpu)) { case ARM_SMCCC_TRNG_VERSION: case ARM_SMCCC_TRNG_FEATURES: case ARM_SMCCC_TRNG_GET_UUID: case ARM_SMCCC_TRNG_RND32: case ARM_SMCCC_TRNG_RND64: val = TRNG_SUCCESS; } break; case ARM_SMCCC_TRNG_GET_UUID: smccc_set_retval(vcpu, le32_to_cpu(u[0]), le32_to_cpu(u[1]), le32_to_cpu(u[2]), le32_to_cpu(u[3])); return 1; case ARM_SMCCC_TRNG_RND32: size = 32; fallthrough; case ARM_SMCCC_TRNG_RND64: return kvm_trng_do_rnd(vcpu, size); } smccc_set_retval(vcpu, val, 0, 0, 0); return 1; }
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MATH64_H #define _LINUX_MATH64_H #include <linux/types.h> #include <linux/math.h> #include <asm/div64.h> #include <vdso/math64.h> #if BITS_PER_LONG == 64 #define div64_long(x, y) div64_s64((x), (y)) #define div64_ul(x, y) div64_u64((x), (y)) /** * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * @remainder: pointer to unsigned 32bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor * * This is commonly provided by 32bit archs to provide an optimized 64bit * divide. */ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /** * div_s64_rem - signed 64bit divide with 32bit divisor with remainder * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * @remainder: pointer to signed 32bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor */ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /** * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * @remainder: pointer to unsigned 64bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor */ static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /** * div64_u64 - unsigned 64bit divide with 64bit divisor * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * * Return: dividend / divisor */ static inline u64 div64_u64(u64 dividend, u64 divisor) { return dividend / divisor; } /** * div64_s64 - signed 64bit divide with 64bit divisor * @dividend: signed 64bit dividend * @divisor: signed 64bit divisor * * Return: dividend / divisor */ static inline s64 div64_s64(s64 dividend, s64 divisor) { return dividend / divisor; } #elif BITS_PER_LONG == 32 #define div64_long(x, y) div_s64((x), (y)) #define div64_ul(x, y) div_u64((x), (y)) #ifndef div_u64_rem static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) { *remainder = do_div(dividend, divisor); return dividend; } #endif #ifndef div_s64_rem extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); #endif #ifndef div64_u64_rem extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder); #endif #ifndef div64_u64 extern u64 div64_u64(u64 dividend, u64 divisor); #endif #ifndef div64_s64 extern s64 div64_s64(s64 dividend, s64 divisor); #endif #endif /* BITS_PER_LONG */ /** * div_u64 - unsigned 64bit divide with 32bit divisor * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * * This is the most common 64bit divide and should be used if possible, * as many 32bit archs can optimize this variant better than a full 64bit * divide. * * Return: dividend / divisor */ #ifndef div_u64 static inline u64 div_u64(u64 dividend, u32 divisor) { u32 remainder; return div_u64_rem(dividend, divisor, &remainder); } #endif /** * div_s64 - signed 64bit divide with 32bit divisor * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * * Return: dividend / divisor */ #ifndef div_s64 static inline s64 div_s64(s64 dividend, s32 divisor) { s32 remainder; return div_s64_rem(dividend, divisor, &remainder); } #endif u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder); #ifndef mul_u32_u32 /* * Many a GCC version messes this up and generates a 64x64 mult :-( */ static inline u64 mul_u32_u32(u32 a, u32 b) { return (u64)a * b; } #endif #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_shr static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } #endif /* mul_u64_u32_shr */ #ifndef mul_u64_u64_shr static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } #endif /* mul_u64_u64_shr */ #else #ifndef mul_u64_u32_shr static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { u32 ah = a >> 32, al = a; u64 ret; ret = mul_u32_u32(al, mul) >> shift; if (ah) ret += mul_u32_u32(ah, mul) << (32 - shift); return ret; } #endif /* mul_u64_u32_shr */ #ifndef mul_u64_u64_shr static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift) { union { u64 ll; struct { #ifdef __BIG_ENDIAN u32 high, low; #else u32 low, high; #endif } l; } rl, rm, rn, rh, a0, b0; u64 c; a0.ll = a; b0.ll = b; rl.ll = mul_u32_u32(a0.l.low, b0.l.low); rm.ll = mul_u32_u32(a0.l.low, b0.l.high); rn.ll = mul_u32_u32(a0.l.high, b0.l.low); rh.ll = mul_u32_u32(a0.l.high, b0.l.high); /* * Each of these lines computes a 64-bit intermediate result into "c", * starting at bits 32-95. The low 32-bits go into the result of the * multiplication, the high 32-bits are carried into the next step. */ rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low; rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low; rh.l.high = (c >> 32) + rh.l.high; /* * The 128-bit result of the multiplication is in rl.ll and rh.ll, * shift it right and throw away the high part of the result. */ if (shift == 0) return rl.ll; if (shift < 64) return (rl.ll >> shift) | (rh.ll << (64 - shift)); return rh.ll >> (shift & 63); } #endif /* mul_u64_u64_shr */ #endif #ifndef mul_s64_u64_shr static inline u64 mul_s64_u64_shr(s64 a, u64 b, unsigned int shift) { u64 ret; /* * Extract the sign before the multiplication and put it back * afterwards if needed. */ ret = mul_u64_u64_shr(abs(a), b, shift); if (a < 0) ret = -((s64) ret); return ret; } #endif /* mul_s64_u64_shr */ #ifndef mul_u64_u32_div static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) { union { u64 ll; struct { #ifdef __BIG_ENDIAN u32 high, low; #else u32 low, high; #endif } l; } u, rl, rh; u.ll = a; rl.ll = mul_u32_u32(u.l.low, mul); rh.ll = mul_u32_u32(u.l.high, mul) + rl.l.high; /* Bits 32-63 of the result will be in rh.l.low. */ rl.l.high = do_div(rh.ll, divisor); /* Bits 0-31 of the result will be in rl.l.low. */ do_div(rl.ll, divisor); rl.l.high = rh.l.low; return rl.ll; } #endif /* mul_u64_u32_div */ u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); /** * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up * @ll: unsigned 64bit dividend * @d: unsigned 64bit divisor * * Divide unsigned 64bit dividend by unsigned 64bit divisor * and round up. * * Return: dividend / divisor rounded up */ #define DIV64_U64_ROUND_UP(ll, d) \ ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) /** * DIV_U64_ROUND_UP - unsigned 64bit divide with 32bit divisor rounded up * @ll: unsigned 64bit dividend * @d: unsigned 32bit divisor * * Divide unsigned 64bit dividend by unsigned 32bit divisor * and round up. * * Return: dividend / divisor rounded up */ #define DIV_U64_ROUND_UP(ll, d) \ ({ u32 _tmp = (d); div_u64((ll) + _tmp - 1, _tmp); }) /** * DIV64_U64_ROUND_CLOSEST - unsigned 64bit divide with 64bit divisor rounded to nearest integer * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * * Divide unsigned 64bit dividend by unsigned 64bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV64_U64_ROUND_CLOSEST(dividend, divisor) \ ({ u64 _tmp = (divisor); div64_u64((dividend) + _tmp / 2, _tmp); }) /** * DIV_U64_ROUND_CLOSEST - unsigned 64bit divide with 32bit divisor rounded to nearest integer * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * * Divide unsigned 64bit dividend by unsigned 32bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV_U64_ROUND_CLOSEST(dividend, divisor) \ ({ u32 _tmp = (divisor); div_u64((u64)(dividend) + _tmp / 2, _tmp); }) /** * DIV_S64_ROUND_CLOSEST - signed 64bit divide with 32bit divisor rounded to nearest integer * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * * Divide signed 64bit dividend by signed 32bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV_S64_ROUND_CLOSEST(dividend, divisor)( \ { \ s64 __x = (dividend); \ s32 __d = (divisor); \ ((__x > 0) == (__d > 0)) ? \ div_s64((__x + (__d / 2)), __d) : \ div_s64((__x - (__d / 2)), __d); \ } \ ) /** * roundup_u64 - Round up a 64bit value to the next specified 32bit multiple * @x: the value to up * @y: 32bit multiple to round up to * * Rounds @x to the next multiple of @y. For 32bit @x values, see roundup and * the faster round_up() for powers of 2. * * Return: rounded up value. */ static inline u64 roundup_u64(u64 x, u32 y) { return DIV_U64_ROUND_UP(x, y) * y; } #endif /* _LINUX_MATH64_H */
277 269 6 22 22 20 19 9 11 20 240 14 20 218 19 7 272 15 15 76 145 36 77 163 163 244 239 2 14 104 157 157 240 14 245 238 14 245 20 242 106 157 157 239 14 243 11 4 7 11 11 11 242 289 39 272 290 290 12 305 294 101 306 78 78 78 78 245 244 245 242 5 180 230 237 14 241 242 242 242 241 181 230 181 229 9 2 2 321 322 322 323 323 324 20 20 20 20 20 11 290 289 289 290 282 3 11 14 277 20 20 20 20 6 11 11 11 3 20 9 11 20 306 275 307 34 275 273 313 287 78 31 305 305 307 305 306 307 275 78 3 3 2 307 179 183 183 307 163 232 308 306 27 150 293 5 14 147 294 308 306 307 307 307 307 306 80 79 3 78 82 82 1 80 306 81 81 81 81 306 293 81 18 306 278 81 92 76 29 51 198 22 67 44 27 84 14 84 21 65 80 85 249 194 85 85 293 293 293 293 293 293 293 293 293 293 293 4 293 293 293 293 293 293 293 293 293 293 293 293 293 293 293 293 293 293 293 202 3 75 275 2 274 34 289 18 31 290 202 201 142 142 202 52 78 236 236 235 51 198 165 107 70 41 248 133 290 290 290 290 277 248 249 77 2 78 289 288 290 77 140 290 197 139 1 2 11 11 7 4 10 10 3 3 4 7 7 81 82 79 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 // SPDX-License-Identifier: GPL-2.0-or-later /* * VMA-specific functions. */ #include "vma_internal.h" #include "vma.h" struct mmap_state { struct mm_struct *mm; struct vma_iterator *vmi; unsigned long addr; unsigned long end; pgoff_t pgoff; unsigned long pglen; unsigned long flags; struct file *file; pgprot_t page_prot; /* User-defined fields, perhaps updated by .mmap_prepare(). */ const struct vm_operations_struct *vm_ops; void *vm_private_data; unsigned long charged; struct vm_area_struct *prev; struct vm_area_struct *next; /* Unmapping state. */ struct vma_munmap_struct vms; struct ma_state mas_detach; struct maple_tree mt_detach; }; #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \ struct mmap_state name = { \ .mm = mm_, \ .vmi = vmi_, \ .addr = addr_, \ .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ .flags = flags_, \ .file = file_, \ .page_prot = vm_get_page_prot(flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ struct vma_merge_struct name = { \ .mm = (map_)->mm, \ .vmi = (map_)->vmi, \ .start = (map_)->addr, \ .end = (map_)->end, \ .flags = (map_)->flags, \ .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ .middle = vma_, \ .next = (vma_) ? NULL : (map_)->next, \ .state = VMA_MERGE_START, \ } /* * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain * more than one anon_vma_chain connecting it to more than one anon_vma. A merge * would mean a wider range of folios sharing the root anon_vma lock, and thus * potential lock contention, we do not wish to encourage merging such that this * scales to a problem. */ static bool vma_had_uncowed_parents(struct vm_area_struct *vma) { /* * The list_is_singular() test is to avoid merging VMA cloned from * parents. This can improve scalability caused by anon_vma lock. */ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain); } static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; /* * VM_SOFTDIRTY should not prevent from VMA merging, if we * match the flags but dirty bit -- the caller should mark * merged VMA as dirty. If dirty bit won't be excluded from * comparison, we increase pressure on the memory system forcing * the kernel to generate new VMAs when old one could be * extended instead. */ if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY) return false; if (vma->vm_file != vmg->file) return false; if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) return false; if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) return false; return true; } static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev; struct vm_area_struct *src = vmg->middle; /* exisitng merge case. */ struct anon_vma *tgt_anon = tgt->anon_vma; struct anon_vma *src_anon = vmg->anon_vma; /* * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we * will remove the existing VMA's anon_vma's so there's no scalability * concerns. */ VM_WARN_ON(src && src_anon != src->anon_vma); /* Case 1 - we will dup_anon_vma() from src into tgt. */ if (!tgt_anon && src_anon) return !vma_had_uncowed_parents(src); /* Case 2 - we will simply use tgt's anon_vma. */ if (tgt_anon && !src_anon) return !vma_had_uncowed_parents(tgt); /* Case 3 - the anon_vma's are already shared. */ return src_anon == tgt_anon; } /* * init_multi_vma_prep() - Initializer for struct vma_prepare * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked * @vmg: The merge state that will be used to determine adjustment and VMA * removal. */ static void init_multi_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma, struct vma_merge_struct *vmg) { struct vm_area_struct *adjust; struct vm_area_struct **remove = &vp->remove; memset(vp, 0, sizeof(struct vma_prepare)); vp->vma = vma; vp->anon_vma = vma->anon_vma; if (vmg && vmg->__remove_middle) { *remove = vmg->middle; remove = &vp->remove2; } if (vmg && vmg->__remove_next) *remove = vmg->next; if (vmg && vmg->__adjust_middle_start) adjust = vmg->middle; else if (vmg && vmg->__adjust_next_start) adjust = vmg->next; else adjust = NULL; vp->adj_next = adjust; if (!vp->anon_vma && adjust) vp->anon_vma = adjust->anon_vma; VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma && vp->anon_vma != adjust->anon_vma); vp->file = vma->vm_file; if (vp->file) vp->mapping = vma->vm_file->f_mapping; if (vmg && vmg->skip_vma_uprobe) vp->skip_vma_uprobe = true; } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * in front of (at a lower virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We don't check here for the merged mmap wrapping around the end of pagecache * indices (16TB on ia32) because do_mmap() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. * * We assume the vma may be removed as part of the merge. */ static bool can_vma_merge_before(struct vma_merge_struct *vmg) { pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); if (is_mergeable_vma(vmg, /* merge_next = */ true) && is_mergeable_anon_vma(vmg, /* merge_next = */ true)) { if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; } return false; } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * beyond (at a higher virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We assume that vma is not removed as part of the merge. */ static bool can_vma_merge_after(struct vma_merge_struct *vmg) { if (is_mergeable_vma(vmg, /* merge_next = */ false) && is_mergeable_anon_vma(vmg, /* merge_next = */ false)) { if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) return true; } return false; } static void __vma_link_file(struct vm_area_struct *vma, struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. * * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the * vma must be removed from the anon_vma's interval trees using * anon_vma_interval_tree_pre_update_vma(). * * After the update, the vma will be reinserted using * anon_vma_interval_tree_post_update_vma(). * * The entire update must be protected by exclusive mmap_lock and by * the root anon_vma's mutex. */ static void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); } static void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct */ static void vma_prepare(struct vma_prepare *vp) { if (vp->file) { uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); if (vp->adj_next) uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, vp->adj_next->vm_end); i_mmap_lock_write(vp->mapping); if (vp->insert && vp->insert->vm_file) { /* * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. */ __vma_link_file(vp->insert, vp->insert->vm_file->f_mapping); } } if (vp->anon_vma) { anon_vma_lock_write(vp->anon_vma); anon_vma_interval_tree_pre_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_pre_update_vma(vp->adj_next); } if (vp->file) { flush_dcache_mmap_lock(vp->mapping); vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); if (vp->adj_next) vma_interval_tree_remove(vp->adj_next, &vp->mapping->i_mmap); } } /* * vma_complete- Helper function for handling the unlocking after altering VMAs, * or for inserting a VMA. * * @vp: The vma_prepare struct * @vmi: The vma iterator * @mm: The mm_struct */ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, struct mm_struct *mm) { if (vp->file) { if (vp->adj_next) vma_interval_tree_insert(vp->adj_next, &vp->mapping->i_mmap); vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); flush_dcache_mmap_unlock(vp->mapping); } if (vp->remove && vp->file) { __remove_shared_vm_struct(vp->remove, vp->mapping); if (vp->remove2) __remove_shared_vm_struct(vp->remove2, vp->mapping); } else if (vp->insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ vma_iter_store_new(vmi, vp->insert); mm->map_count++; } if (vp->anon_vma) { anon_vma_interval_tree_post_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_post_update_vma(vp->adj_next); anon_vma_unlock_write(vp->anon_vma); } if (vp->file) { i_mmap_unlock_write(vp->mapping); if (!vp->skip_vma_uprobe) { uprobe_mmap(vp->vma); if (vp->adj_next) uprobe_mmap(vp->adj_next); } } if (vp->remove) { again: vma_mark_detached(vp->remove); if (vp->file) { uprobe_munmap(vp->remove, vp->remove->vm_start, vp->remove->vm_end); fput(vp->file); } if (vp->remove->anon_vma) anon_vma_merge(vp->vma, vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); vm_area_free(vp->remove); /* * In mprotect's case 6 (see comments on vma_merge), * we are removing both mid and next vmas */ if (vp->remove2) { vp->remove = vp->remove2; vp->remove2 = NULL; goto again; } } if (vp->insert && vp->file) uprobe_mmap(vp->insert); } /* * init_vma_prep() - Initializer wrapper for vma_prepare struct * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked */ static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) { init_multi_vma_prep(vp, vma, NULL); } /* * Can the proposed VMA be merged with the left (previous) VMA taking into * account the start position of the proposed range. */ static bool can_vma_merge_left(struct vma_merge_struct *vmg) { return vmg->prev && vmg->prev->vm_end == vmg->start && can_vma_merge_after(vmg); } /* * Can the proposed VMA be merged with the right (next) VMA taking into * account the end position of the proposed range. * * In addition, if we can merge with the left VMA, ensure that left and right * anon_vma's are also compatible. */ static bool can_vma_merge_right(struct vma_merge_struct *vmg, bool can_merge_left) { struct vm_area_struct *next = vmg->next; struct vm_area_struct *prev; if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg)) return false; if (!can_merge_left) return true; /* * If we can merge with prev (left) and next (right), indicating that * each VMA's anon_vma is compatible with the proposed anon_vma, this * does not mean prev and next are compatible with EACH OTHER. * * We therefore check this in addition to mergeability to either side. */ prev = vmg->prev; return !prev->anon_vma || !next->anon_vma || prev->anon_vma == next->anon_vma; } /* * Close a vm structure and free it. */ void remove_vma(struct vm_area_struct *vma) { might_sleep(); vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); vm_area_free(vma); } /* * Get rid of page table information in the indicated region. * * Called with the mm semaphore held. */ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, /* mm_wr_locked = */ true); mas_set(mas, vma->vm_end); free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING, /* mm_wr_locked = */ true); tlb_finish_mmu(&tlb); } /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the original VMA. */ static __must_check int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; int err; WARN_ON(vma->vm_start >= addr); WARN_ON(vma->vm_end <= addr); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); if (err) return err; } new = vm_area_dup(vma); if (!new) return -ENOMEM; if (new_below) { new->vm_end = addr; } else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } err = -ENOMEM; vma_iter_config(vmi, new->vm_start, new->vm_end); if (vma_iter_prealloc(vmi, new)) goto out_free_vma; err = vma_dup_policy(vma, new); if (err) goto out_free_vmi; err = anon_vma_clone(new, vma); if (err) goto out_free_mpol; if (new->vm_file) get_file(new->vm_file); if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); vma_start_write(vma); vma_start_write(new); init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); /* * Get rid of huge pages and shared page tables straddling the split * boundary. */ vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL); if (is_vm_hugetlb_page(vma)) hugetlb_split(vma, addr); if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; } else { vma->vm_end = addr; } /* vma_complete stores the new vma */ vma_complete(&vp, vmi, vma->vm_mm); validate_mm(vma->vm_mm); /* Success. */ if (new_below) vma_next(vmi); else vma_prev(vmi); return 0; out_free_mpol: mpol_put(vma_policy(new)); out_free_vmi: vma_iter_free(vmi); out_free_vma: vm_area_free(new); return err; } /* * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { if (vma->vm_mm->map_count >= sysctl_max_map_count) return -ENOMEM; return __split_vma(vmi, vma, addr, new_below); } /* * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the * instance that the destination VMA has no anon_vma but the source does. * * @dst: The destination VMA * @src: The source VMA * @dup: Pointer to the destination VMA when successful. * * Returns: 0 on success. */ static int dup_anon_vma(struct vm_area_struct *dst, struct vm_area_struct *src, struct vm_area_struct **dup) { /* * There are three cases to consider for correctly propagating * anon_vma's on merge. * * The first is trivial - neither VMA has anon_vma, we need not do * anything. * * The second where both have anon_vma is also a no-op, as they must * then be the same, so there is simply nothing to copy. * * Here we cover the third - if the destination VMA has no anon_vma, * that is it is unfaulted, we need to ensure that the newly merged * range is referenced by the anon_vma's of the source. */ if (src->anon_vma && !dst->anon_vma) { int ret; vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; ret = anon_vma_clone(dst, src); if (ret) return ret; *dup = dst; } return 0; } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mt_validate(&mm->mm_mt); for_each_vma(vmi, vma) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; #endif unsigned long vmi_start, vmi_end; bool warn = 0; vmi_start = vma_iter_addr(&vmi); vmi_end = vma_iter_end(&vmi); if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) warn = 1; if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) warn = 1; if (warn) { pr_emerg("issue in %s\n", current->comm); dump_stack(); dump_vma(vma); pr_emerg("tree range: %px start %lx end %lx\n", vma, vmi_start, vmi_end - 1); vma_iter_dump_tree(&vmi); } #ifdef CONFIG_DEBUG_VM_RB if (anon_vma) { anon_vma_lock_read(anon_vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_verify(avc); anon_vma_unlock_read(anon_vma); } #endif /* Check for a infinite loop */ if (++i > mm->map_count + 10) { i = -1; break; } } if (i != mm->map_count) { pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* * Based on the vmg flag indicating whether we need to adjust the vm_start field * for the middle or next VMA, we calculate what the range of the newly adjusted * VMA ought to be, and set the VMA's range accordingly. */ static void vmg_adjust_set_range(struct vma_merge_struct *vmg) { struct vm_area_struct *adjust; pgoff_t pgoff; if (vmg->__adjust_middle_start) { adjust = vmg->middle; pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start); } else if (vmg->__adjust_next_start) { adjust = vmg->next; pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end); } else { return; } vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff); } /* * Actually perform the VMA merge operation. * * IMPORTANT: We guarantee that, should vmg->give_up_on_oom is set, to not * modify any VMAs or cause inconsistent state should an OOM condition arise. * * Returns 0 on success, or an error value on failure. */ static int commit_merge(struct vma_merge_struct *vmg) { struct vm_area_struct *vma; struct vma_prepare vp; if (vmg->__adjust_next_start) { /* We manipulate middle and adjust next, which is the target. */ vma = vmg->middle; vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end); } else { vma = vmg->target; /* Note: vma iterator must be pointing to 'start'. */ vma_iter_config(vmg->vmi, vmg->start, vmg->end); } init_multi_vma_prep(&vp, vma, vmg); /* * If vmg->give_up_on_oom is set, we're safe, because we don't actually * manipulate any VMAs until we succeed at preallocation. * * Past this point, we will not return an error. */ if (vma_iter_prealloc(vmg->vmi, vma)) return -ENOMEM; vma_prepare(&vp); /* * THP pages may need to do additional splits if we increase * middle->vm_start. */ vma_adjust_trans_huge(vma, vmg->start, vmg->end, vmg->__adjust_middle_start ? vmg->middle : NULL); vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff); vmg_adjust_set_range(vmg); vma_iter_store_overwrite(vmg->vmi, vmg->target); vma_complete(&vp, vmg->vmi, vma->vm_mm); return 0; } /* We can only remove VMAs when merging if they do not have a close hook. */ static bool can_merge_remove_vma(struct vm_area_struct *vma) { return !vma->vm_ops || !vma->vm_ops->close; } /* * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its * attributes modified. * * @vmg: Describes the modifications being made to a VMA and associated * metadata. * * When the attributes of a range within a VMA change, then it might be possible * for immediately adjacent VMAs to be merged into that VMA due to having * identical properties. * * This function checks for the existence of any such mergeable VMAs and updates * the maple tree describing the @vmg->middle->vm_mm address space to account * for this, as well as any VMAs shrunk/expanded/deleted as a result of this * merge. * * As part of this operation, if a merge occurs, the @vmg object will have its * vma, start, end, and pgoff fields modified to execute the merge. Subsequent * calls to this function should reset these fields. * * Returns: The merged VMA if merge succeeds, or NULL otherwise. * * ASSUMPTIONS: * - The caller must assign the VMA to be modifed to @vmg->middle. * - The caller must have set @vmg->prev to the previous VMA, if there is one. * - The caller must not set @vmg->next, as we determine this. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end). */ static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next; struct vm_area_struct *anon_dup = NULL; unsigned long start = vmg->start; unsigned long end = vmg->end; bool left_side = middle && start == middle->vm_start; bool right_side = middle && end == middle->vm_end; int err = 0; bool merge_left, merge_right, merge_both; mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */ VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */ VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg); VM_WARN_ON_VMG(start >= end, vmg); /* * If middle == prev, then we are offset into a VMA. Otherwise, if we are * not, we must span a portion of the VMA. */ VM_WARN_ON_VMG(middle && ((middle != prev && vmg->start != middle->vm_start) || vmg->end > middle->vm_end), vmg); /* The vmi must be positioned within vmg->middle. */ VM_WARN_ON_VMG(middle && !(vma_iter_addr(vmg->vmi) >= middle->vm_start && vma_iter_addr(vmg->vmi) < middle->vm_end), vmg); vmg->state = VMA_MERGE_NOMERGE; /* * If a special mapping or if the range being modified is neither at the * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. */ if (vmg->flags & VM_SPECIAL || (!left_side && !right_side)) return NULL; if (left_side) merge_left = can_vma_merge_left(vmg); else merge_left = false; if (right_side) { next = vmg->next = vma_iter_next_range(vmg->vmi); vma_iter_prev_range(vmg->vmi); merge_right = can_vma_merge_right(vmg, merge_left); } else { merge_right = false; next = NULL; } if (merge_left) /* If merging prev, position iterator there. */ vma_prev(vmg->vmi); else if (!merge_right) /* If we have nothing to merge, abort. */ return NULL; merge_both = merge_left && merge_right; /* If we span the entire VMA, a merge implies it will be deleted. */ vmg->__remove_middle = left_side && right_side; /* * If we need to remove middle in its entirety but are unable to do so, * we have no sensible recourse but to abort the merge. */ if (vmg->__remove_middle && !can_merge_remove_vma(middle)) return NULL; /* * If we merge both VMAs, then next is also deleted. This implies * merge_will_delete_vma also. */ vmg->__remove_next = merge_both; /* * If we cannot delete next, then we can reduce the operation to merging * prev and middle (thereby deleting middle). */ if (vmg->__remove_next && !can_merge_remove_vma(next)) { vmg->__remove_next = false; merge_right = false; merge_both = false; } /* No matter what happens, we will be adjusting middle. */ vma_start_write(middle); if (merge_right) { vma_start_write(next); vmg->target = next; } if (merge_left) { vma_start_write(prev); vmg->target = prev; } if (merge_both) { /* * |<-------------------->| * |-------********-------| * prev middle next * extend delete delete */ vmg->start = prev->vm_start; vmg->end = next->vm_end; vmg->pgoff = prev->vm_pgoff; /* * We already ensured anon_vma compatibility above, so now it's * simply a case of, if prev has no anon_vma object, which of * next or middle contains the anon_vma we must duplicate. */ err = dup_anon_vma(prev, next->anon_vma ? next : middle, &anon_dup); } else if (merge_left) { /* * |<------------>| OR * |<----------------->| * |-------************* * prev middle * extend shrink/delete */ vmg->start = prev->vm_start; vmg->pgoff = prev->vm_pgoff; if (!vmg->__remove_middle) vmg->__adjust_middle_start = true; err = dup_anon_vma(prev, middle, &anon_dup); } else { /* merge_right */ /* * |<------------->| OR * |<----------------->| * *************-------| * middle next * shrink/delete extend */ pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); VM_WARN_ON_VMG(!merge_right, vmg); /* If we are offset into a VMA, then prev must be middle. */ VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg); if (vmg->__remove_middle) { vmg->end = next->vm_end; vmg->pgoff = next->vm_pgoff - pglen; } else { /* We shrink middle and expand next. */ vmg->__adjust_next_start = true; vmg->start = middle->vm_start; vmg->end = start; vmg->pgoff = middle->vm_pgoff; } err = dup_anon_vma(next, middle, &anon_dup); } if (err || commit_merge(vmg)) goto abort; khugepaged_enter_vma(vmg->target, vmg->flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; abort: vma_iter_set(vmg->vmi, start); vma_iter_load(vmg->vmi); if (anon_dup) unlink_anon_vmas(anon_dup); /* * This means we have failed to clone anon_vma's correctly, but no * actual changes to VMAs have occurred, so no harm no foul - if the * user doesn't want this reported and instead just wants to give up on * the merge, allow it. */ if (!vmg->give_up_on_oom) vmg->state = VMA_MERGE_ERROR_NOMEM; return NULL; } /* * vma_merge_new_range - Attempt to merge a new VMA into address space * * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end * (exclusive), which we try to merge with any adjacent VMAs if possible. * * We are about to add a VMA to the address space starting at @vmg->start and * ending at @vmg->end. There are three different possible scenarios: * * 1. There is a VMA with identical properties immediately adjacent to the * proposed new VMA [@vmg->start, @vmg->end) either before or after it - * EXPAND that VMA: * * Proposed: |-----| or |-----| * Existing: |----| |----| * * 2. There are VMAs with identical properties immediately adjacent to the * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - * EXPAND the former and REMOVE the latter: * * Proposed: |-----| * Existing: |----| |----| * * 3. There are no VMAs immediately adjacent to the proposed new VMA or those * VMAs do not have identical attributes - NO MERGE POSSIBLE. * * In instances where we can merge, this function returns the expanded VMA which * will have its range adjusted accordingly and the underlying maple tree also * adjusted. * * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer * to the VMA we expanded. * * This function adjusts @vmg to provide @vmg->next if not already specified, * and adjusts [@vmg->start, @vmg->end) to span the expanded range. * * ASSUMPTIONS: * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - The caller must have determined that [@vmg->start, @vmg->end) is empty, other than VMAs that will be unmapped should the operation succeed. * - The caller must have specified the previous vma in @vmg->prev. * - The caller must have specified the next vma in @vmg->next. * - The caller must have positioned the vmi at or before the gap. */ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) { struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next = vmg->next; unsigned long end = vmg->end; bool can_merge_left, can_merge_right; mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(vmg->middle, vmg); /* vmi must point at or before the gap. */ VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. */ if ((vmg->flags & VM_SPECIAL) || (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left); /* If we can merge with the next VMA, adjust vmg accordingly. */ if (can_merge_right) { vmg->end = next->vm_end; vmg->middle = next; } /* If we can merge with the previous VMA, adjust vmg accordingly. */ if (can_merge_left) { vmg->start = prev->vm_start; vmg->middle = prev; vmg->pgoff = prev->vm_pgoff; /* * If this merge would result in removal of the next VMA but we * are not permitted to do so, reduce the operation to merging * prev and vma. */ if (can_merge_right && !can_merge_remove_vma(next)) vmg->end = end; /* In expand-only case we are already positioned at prev. */ if (!vmg->just_expand) { /* Equivalent to going to the previous range. */ vma_prev(vmg->vmi); } } /* * Now try to expand adjacent VMA(s). This takes care of removing the * following VMA if we have VMAs on both sides. */ if (vmg->middle && !vma_expand(vmg)) { khugepaged_enter_vma(vmg->middle, vmg->flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->middle; } return NULL; } /* * vma_expand - Expand an existing VMA * * @vmg: Describes a VMA expansion operation. * * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. * Will expand over vmg->next if it's different from vmg->middle and vmg->end == * vmg->next->vm_end. Checking if the vmg->middle can expand and merge with * vmg->next needs to be handled by the caller. * * Returns: 0 on success. * * ASSUMPTIONS: * - The caller must hold a WRITE lock on vmg->middle->mm->mmap_lock. * - The caller must have set @vmg->middle and @vmg->next. */ int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; bool remove_next = false; struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *next = vmg->next; mmap_assert_write_locked(vmg->mm); vma_start_write(middle); if (next && (middle != next) && (vmg->end == next->vm_end)) { int ret; remove_next = true; /* This should already have been checked by this point. */ VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); vma_start_write(next); /* * In this case we don't report OOM, so vmg->give_up_on_mm is * safe. */ ret = dup_anon_vma(middle, next, &anon_dup); if (ret) return ret; } /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON_VMG(next && !remove_next && next != middle && vmg->end > next->vm_start, vmg); /* Only handles expanding */ VM_WARN_ON_VMG(middle->vm_start < vmg->start || middle->vm_end > vmg->end, vmg); vmg->target = middle; if (remove_next) vmg->__remove_next = true; if (commit_merge(vmg)) goto nomem; return 0; nomem: if (anon_dup) unlink_anon_vmas(anon_dup); /* * If the user requests that we just give upon OOM, we are safe to do so * here, as commit merge provides this contract to us. Nothing has been * changed - no harm no foul, just don't report it. */ if (!vmg->give_up_on_oom) vmg->state = VMA_MERGE_ERROR_NOMEM; return -ENOMEM; } /* * vma_shrink() - Reduce an existing VMAs memory area * @vmi: The vma iterator * @vma: The VMA to modify * @start: The new start * @end: The new end * * Returns: 0 on success, -ENOMEM otherwise */ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { struct vma_prepare vp; WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); if (vma->vm_start < start) vma_iter_config(vmi, vma->vm_start, start); else vma_iter_config(vmi, end, vma->vm_end); if (vma_iter_prealloc(vmi, NULL)) return -ENOMEM; vma_start_write(vma); init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, NULL); vma_iter_clear(vmi); vma_set_range(vma, start, end, pgoff); vma_complete(&vp, vmi, vma->vm_mm); validate_mm(vma->vm_mm); return 0; } static inline void vms_clear_ptes(struct vma_munmap_struct *vms, struct ma_state *mas_detach, bool mm_wr_locked) { struct mmu_gather tlb; if (!vms->clear_ptes) /* Nothing to do */ return; /* * We can free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. */ mas_set(mas_detach, 1); tlb_gather_mmu(&tlb, vms->vma->vm_mm); update_hiwater_rss(vms->vma->vm_mm); unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, vms->vma_count, mm_wr_locked); mas_set(mas_detach, 1); /* start and end may be different if there is no prev or next vma. */ free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked); tlb_finish_mmu(&tlb); vms->clear_ptes = false; } static void vms_clean_up_area(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; if (!vms->nr_pages) return; vms_clear_ptes(vms, mas_detach, true); mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_close(vma); } /* * vms_complete_munmap_vmas() - Finish the munmap() operation * @vms: The vma munmap struct * @mas_detach: The maple state of the detached vmas * * This updates the mm_struct, unmaps the region, frees the resources * used for the munmap() and may downgrade the lock - if requested. Everything * needed to be done once the vma maple tree is updated. */ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; struct mm_struct *mm; mm = current->mm; mm->map_count -= vms->vma_count; mm->locked_vm -= vms->locked_vm; if (vms->unlock) mmap_write_downgrade(mm); if (!vms->nr_pages) return; vms_clear_ptes(vms, mas_detach, !vms->unlock); /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); /* Stat accounting */ WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages); /* Paranoid bookkeeping */ VM_WARN_ON(vms->exec_vm > mm->exec_vm); VM_WARN_ON(vms->stack_vm > mm->stack_vm); VM_WARN_ON(vms->data_vm > mm->data_vm); mm->exec_vm -= vms->exec_vm; mm->stack_vm -= vms->stack_vm; mm->data_vm -= vms->data_vm; /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) remove_vma(vma); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); if (vms->unlock) mmap_read_unlock(mm); __mt_destroy(mas_detach->tree); } /* * reattach_vmas() - Undo any munmap work and free resources * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas and free up the maple tree used to track the vmas. */ static void reattach_vmas(struct ma_state *mas_detach) { struct vm_area_struct *vma; mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_mark_attached(vma); __mt_destroy(mas_detach->tree); } /* * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree * for removal at a later date. Handles splitting first and last if necessary * and marking the vmas as isolated. * * @vms: The vma munmap struct * @mas_detach: The maple state tracking the detached tree * * Return: 0 on success, error otherwise */ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *next = NULL; int error; /* * If we need to split any vma, do it now to save pain later. * Does it split the first one? */ if (vms->start > vms->vma->vm_start) { /* * Make sure that map_count on return from munmap() will * not exceed its limit; but let map_count go just above * its limit temporarily, to help free resources as expected. */ if (vms->end < vms->vma->vm_end && vms->vma->vm_mm->map_count >= sysctl_max_map_count) { error = -ENOMEM; goto map_count_exceeded; } /* Don't bother splitting the VMA if we can't unmap it anyway */ if (!can_modify_vma(vms->vma)) { error = -EPERM; goto start_split_failed; } error = __split_vma(vms->vmi, vms->vma, vms->start, 1); if (error) goto start_split_failed; } vms->prev = vma_prev(vms->vmi); if (vms->prev) vms->unmap_start = vms->prev->vm_end; /* * Detach a range of VMAs from the mm. Using next as a temp variable as * it is always overwritten. */ for_each_vma_range(*(vms->vmi), next, vms->end) { long nrpages; if (!can_modify_vma(next)) { error = -EPERM; goto modify_vma_failed; } /* Does it split the end? */ if (next->vm_end > vms->end) { error = __split_vma(vms->vmi, next, vms->end, 0); if (error) goto end_split_failed; } vma_start_write(next); mas_set(mas_detach, vms->vma_count++); error = mas_store_gfp(mas_detach, next, GFP_KERNEL); if (error) goto munmap_gather_failed; vma_mark_detached(next); nrpages = vma_pages(next); vms->nr_pages += nrpages; if (next->vm_flags & VM_LOCKED) vms->locked_vm += nrpages; if (next->vm_flags & VM_ACCOUNT) vms->nr_accounted += nrpages; if (is_exec_mapping(next->vm_flags)) vms->exec_vm += nrpages; else if (is_stack_mapping(next->vm_flags)) vms->stack_vm += nrpages; else if (is_data_mapping(next->vm_flags)) vms->data_vm += nrpages; if (vms->uf) { /* * If userfaultfd_unmap_prep returns an error the vmas * will remain split, but userland will get a * highly unexpected error anyway. This is no * different than the case where the first of the two * __split_vma fails, but we don't undo the first * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. */ error = userfaultfd_unmap_prep(next, vms->start, vms->end, vms->uf); if (error) goto userfaultfd_error; } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE BUG_ON(next->vm_start < vms->start); BUG_ON(next->vm_start > vms->end); #endif } vms->next = vma_next(vms->vmi); if (vms->next) vms->unmap_end = vms->next->vm_start; #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { MA_STATE(test, mas_detach->tree, 0, 0); struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; vma_iter_set(vms->vmi, vms->start); rcu_read_lock(); vma_test = mas_find(&test, vms->vma_count - 1); for_each_vma_range(*(vms->vmi), vma_mas, vms->end) { BUG_ON(vma_mas != vma_test); test_count++; vma_test = mas_next(&test, vms->vma_count - 1); } rcu_read_unlock(); BUG_ON(vms->vma_count != test_count); } #endif while (vma_iter_addr(vms->vmi) > vms->start) vma_iter_prev_range(vms->vmi); vms->clear_ptes = true; return 0; userfaultfd_error: munmap_gather_failed: end_split_failed: modify_vma_failed: reattach_vmas(mas_detach); start_split_failed: map_count_exceeded: return error; } /* * init_vma_munmap() - Initializer wrapper for vma_munmap_struct * @vms: The vma munmap struct * @vmi: The vma iterator * @vma: The first vm_area_struct to munmap * @start: The aligned start address to munmap * @end: The aligned end address to munmap * @uf: The userfaultfd list_head * @unlock: Unlock after the operation. Only unlocked on success */ static void init_vma_munmap(struct vma_munmap_struct *vms, struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { vms->vmi = vmi; vms->vma = vma; if (vma) { vms->start = start; vms->end = end; } else { vms->start = vms->end = 0; } vms->unlock = unlock; vms->uf = uf; vms->vma_count = 0; vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0; vms->exec_vm = vms->stack_vm = vms->data_vm = 0; vms->unmap_start = FIRST_USER_ADDRESS; vms->unmap_end = USER_PGTABLES_CEILING; vms->clear_ptes = false; } /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator * @vma: The starting vm_area_struct * @mm: The mm_struct * @start: The aligned start address to munmap. * @end: The aligned end address to munmap. * @uf: The userfaultfd list_head * @unlock: Set to true to drop the mmap_lock. unlocking only happens on * success. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { struct maple_tree mt_detach; MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(mt_detach); struct vma_munmap_struct vms; int error; init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock); error = vms_gather_munmap_vmas(&vms, &mas_detach); if (error) goto gather_failed; error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); if (error) goto clear_tree_failed; /* Point of no return */ vms_complete_munmap_vmas(&vms, &mas_detach); return 0; clear_tree_failed: reattach_vmas(&mas_detach); gather_failed: validate_mm(mm); return error; } /* * do_vmi_munmap() - munmap a given range. * @vmi: The vma iterator * @mm: The mm_struct * @start: The start address to munmap * @len: The length of the range to munmap * @uf: The userfaultfd list_head * @unlock: set to true if the user wants to drop the mmap_lock on success * * This function takes a @mas that is either pointing to the previous VMA or set * to MA_START and sets it up to remove the mapping(s). The @len will be * aligned. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock) { unsigned long end; struct vm_area_struct *vma; if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; end = start + PAGE_ALIGN(len); if (end == start) return -EINVAL; /* Find the first overlapping VMA */ vma = vma_find(vmi, end); if (!vma) { if (unlock) mmap_write_unlock(mm); return 0; } return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } /* * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd * context and anonymous VMA name within the range [start, end). * * As a result, we might be able to merge the newly modified VMA range with an * adjacent VMA with identical properties. * * If no merge is possible and the range does not span the entirety of the VMA, * we then need to split the VMA to accommodate the change. * * The function returns either the merged VMA, the original VMA if a split was * required instead, or an error if the split failed. */ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->middle; unsigned long start = vmg->start; unsigned long end = vmg->end; struct vm_area_struct *merged; /* First, try to merge. */ merged = vma_merge_existing_range(vmg); if (merged) return merged; if (vmg_nomem(vmg)) return ERR_PTR(-ENOMEM); /* * Split can fail for reasons other than OOM, so if the user requests * this it's probably a mistake. */ VM_WARN_ON(vmg->give_up_on_oom && (vma->vm_start != start || vma->vm_end != end)); /* Split any preceding portion of the VMA. */ if (vma->vm_start < start) { int err = split_vma(vmg->vmi, vma, start, 1); if (err) return ERR_PTR(err); } /* Split any trailing portion of the VMA. */ if (vma->vm_end > end) { int err = split_vma(vmg->vmi, vma, end, 0); if (err) return ERR_PTR(err); } return vma; } struct vm_area_struct *vma_modify_flags( struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.flags = new_flags; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct anon_vma_name *new_name) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.flags = new_flags; vmg.anon_name = new_name; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.policy = new_pol; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.flags = new_flags; vmg.uffd_ctx = new_ctx; if (give_up_on_oom) vmg.give_up_on_oom = true; return vma_modify(&vmg); } /* * Expand vma by delta bytes, potentially merging with an immediately adjacent * VMA with identical properties. */ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta) { VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); vmg.next = vma_iter_next_rewind(vmi, NULL); vmg.middle = NULL; /* We use the VMA to populate VMG fields only. */ return vma_merge_new_range(&vmg); } void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) { vb->count = 0; } static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) { struct address_space *mapping; int i; mapping = vb->vmas[0]->vm_file->f_mapping; i_mmap_lock_write(mapping); for (i = 0; i < vb->count; i++) { VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); __remove_shared_vm_struct(vb->vmas[i], mapping); } i_mmap_unlock_write(mapping); unlink_file_vma_batch_init(vb); } void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma) { if (vma->vm_file == NULL) return; if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || vb->count == ARRAY_SIZE(vb->vmas)) unlink_file_vma_batch_process(vb); vb->vmas[vb->count] = vma; vb->count++; } void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) { if (vb->count > 0) unlink_file_vma_batch_process(vb); } /* * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) { struct file *file = vma->vm_file; if (file) { struct address_space *mapping = file->f_mapping; i_mmap_lock_write(mapping); __remove_shared_vm_struct(vma, mapping); i_mmap_unlock_write(mapping); } } void vma_link_file(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct address_space *mapping; if (file) { mapping = file->f_mapping; i_mmap_lock_write(mapping); __vma_link_file(vma, mapping); i_mmap_unlock_write(mapping); } } int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); vma_iter_config(&vmi, vma->vm_start, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_start_write(vma); vma_iter_store_new(&vmi, vma); vma_link_file(vma); mm->map_count++; validate_mm(mm); return 0; } /* * Copy the vma structure to a new location in the same mm, * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. */ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { pgoff = addr >> PAGE_SHIFT; faulted_in_anon_vma = false; } /* * If the VMA we are copying might contain a uprobe PTE, ensure * that we do not establish one upon merge. Otherwise, when mremap() * moves page tables, it will orphan the newly created PTE. */ if (vma->vm_file) vmg.skip_vma_uprobe = true; new_vma = find_vma_prev(mm, addr, &vmg.prev); if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ vmg.middle = NULL; /* New VMA range. */ vmg.pgoff = pgoff; vmg.next = vma_iter_next_rewind(&vmi, NULL); new_vma = vma_merge_new_range(&vmg); if (new_vma) { /* * Source vma may have been merged into new_vma */ if (unlikely(vma_start >= new_vma->vm_start && vma_start < new_vma->vm_end)) { /* * The only way we can get a vma_merge with * self during an mremap is if the vma hasn't * been faulted in yet and we were allowed to * reset the dst vma->vm_pgoff to the * destination address of the mremap to allow * the merge to happen. mremap must change the * vm_pgoff linearity between src and dst vmas * (in turn preventing a vma_merge) to be * safe. It is only safe to keep the vm_pgoff * linear if there are no pages mapped yet. */ VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); *vmap = vma = new_vma; } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = vm_area_dup(vma); if (!new_vma) goto out; vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); if (vma_link(mm, new_vma)) goto out_vma_link; *need_rmap_locks = false; } return new_vma; out_vma_link: fixup_hugetlb_reservations(new_vma); vma_close(new_vma); if (new_vma->vm_file) fput(new_vma->vm_file); unlink_anon_vmas(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: vm_area_free(new_vma); out: return NULL; } /* * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. * * They need to have the same vm_file, and the flags can only differ * in things that mprotect may change. * * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that * we can merge the two vma's. For example, we refuse to merge a vma if * there is a vm_ops->close() function, because that indicates that the * driver is doing some kind of reference counting. But that doesn't * really matter for the anon_vma sharing case. */ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) { return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } /* * Do some basic sanity checking to see if we can re-use the anon_vma * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be * the same as 'old', the other will be the new one that is trying * to share the anon_vma. * * NOTE! This runs with mmap_lock held for reading, so it is possible that * the anon_vma of 'old' is concurrently in the process of being set up * by another page fault trying to merge _that_. But that's ok: if it * is being set up, that automatically means that it will be a singleton * acceptable for merging, so we can do all of this optimistically. But * we do that READ_ONCE() to make sure that we never re-load the pointer. * * IOW: that the "list_is_singular()" test on the anon_vma_chain only * matters for the 'stable anon_vma' case (ie the thing we want to avoid * is to return an anon_vma that is "complex" due to having gone through * a fork). * * We also make sure that the two vma's are compatible (adjacent, * and with the same memory policies). That's all stable, even with just * a read lock on the mmap_lock. */ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) { if (anon_vma_compatible(a, b)) { struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); if (anon_vma && list_is_singular(&old->anon_vma_chain)) return anon_vma; } return NULL; } /* * find_mergeable_anon_vma is used by anon_vma_prepare, to check * neighbouring vmas for a suitable anon_vma, before it goes off * to allocate a new anon_vma. It checks because a repetitive * sequence of mprotects and faults may otherwise lead to distinct * anon_vmas being allocated, preventing vma merge in subsequent * mprotect. */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = NULL; struct vm_area_struct *prev, *next; VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); /* Try next first. */ next = vma_iter_load(&vmi); if (next) { anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } prev = vma_prev(&vmi); VM_BUG_ON_VMA(prev != vma, vma); prev = vma_prev(&vmi); /* Try prev next. */ if (prev) anon_vma = reusable_anon_vma(prev, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find * any reusable anon_vma. * There's no absolute need to look only at touching neighbours: * we could search further afield for "compatible" anon_vmas. * But it would probably just be a waste of time searching, * or lead to too many vmas hanging off the same anon_vma. * We're trying to allow mprotect remerging later on, * not trying to minimize memory used for anon_vmas. */ return anon_vma; } static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) { return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); } static bool vma_is_shared_writable(struct vm_area_struct *vma) { return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == (VM_WRITE | VM_SHARED); } static bool vma_fs_can_writeback(struct vm_area_struct *vma) { /* No managed pages to writeback. */ if (vma->vm_flags & VM_PFNMAP) return false; return vma->vm_file && vma->vm_file->f_mapping && mapping_can_writeback(vma->vm_file->f_mapping); } /* * Does this VMA require the underlying folios to have their dirty state * tracked? */ bool vma_needs_dirty_tracking(struct vm_area_struct *vma) { /* Only shared, writable VMAs require dirty tracking. */ if (!vma_is_shared_writable(vma)) return false; /* Does the filesystem need to be notified? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* * Even if the filesystem doesn't indicate a need for writenotify, if it * can writeback, dirty tracking is still required. */ return vma_fs_can_writeback(vma); } /* * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot * to the private version (using protection_map[] without the * VM_SHARED bit). */ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { /* If it was private or non-writable, the write bit is already clear */ if (!vma_is_shared_writable(vma)) return false; /* The backer wishes to know when pages are first written to? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) return false; /* * Do we need to track softdirty? hugetlb does not support softdirty * tracking yet. */ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) return true; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_wp(vma)) return true; /* Can the mapping track the dirty pages? */ return vma_fs_can_writeback(vma); } static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); } } static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) { if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* * AS_MM_ALL_LOCKS can't change from under us because * we hold the mm_all_locks_mutex. * * Operations on ->flags have to be atomic because * even if AS_MM_ALL_LOCKS is stable thanks to the * mm_all_locks_mutex, there may be other cpus * changing other bitflags in parallel to us. */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); } } /* * This operation locks against the VM for all pte/vma/mm related * operations that could ever happen on a certain mm. This includes * vmtruncate, try_to_unmap, and all page faults. * * The caller must take the mmap_lock in write mode before calling * mm_take_all_locks(). The caller isn't allowed to release the * mmap_lock until mm_drop_all_locks() returns. * * mmap_lock in write mode is required in order to block all operations * that could modify pagetables and free pages without need of * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. * * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * * We take locks in following order, accordingly to comment at beginning * of mm/rmap.c: * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for * hugetlb mapping); * - all vmas marked locked * - all i_mmap_rwsem locks; * - all anon_vma->rwseml * * We can take all locks within these types randomly because the VM code * doesn't nest them and we protected from parallel mm_take_all_locks() by * mm_all_locks_mutex. * * mm_take_all_locks() and mm_drop_all_locks are expensive operations * that may have to take thousand of locks. * * mm_take_all_locks() can fail if it's interrupted by signals. */ int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); mutex_lock(&mm_all_locks_mutex); /* * vma_start_write() does not have a complement in mm_drop_all_locks() * because vma_start_write() is always asymmetrical; it marks a VMA as * being written to until mmap_write_unlock() or mmap_write_downgrade() * is reached. */ for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; vma_start_write(vma); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && !is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_lock_anon_vma(mm, avc->anon_vma); } return 0; out_unlock: mm_drop_all_locks(mm); return -EINTR; } static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. * * We must however clear the bitflag before unlocking * the vma so the users using the anon_vma->rb_root will * never see our bitflag. * * No need of atomic instructions here, head.next * can't change from under us until we release the * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); anon_vma_unlock_write(anon_vma); } } static void vm_unlock_mapping(struct address_space *mapping) { if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* * AS_MM_ALL_LOCKS can't change to 0 from under us * because we hold the mm_all_locks_mutex. */ i_mmap_unlock_write(mapping); if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); } } /* * The mmap_lock cannot be released by the caller until * mm_drop_all_locks() returns. */ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for_each_vma(vmi, vma) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } mutex_unlock(&mm_all_locks_mutex); } /* * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) { /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. */ if (file && is_file_hugepages(file)) return false; return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } /* * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() * operation. * @vms: The vma unmap structure * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas, free up the maple tree used to track the vmas. * If that's not possible because the ptes are cleared (and vm_ops->closed() may * have been called), then a NULL is written over the vmas and the vmas are * removed (munmap() completed). */ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct ma_state *mas = &vms->vmi->mas; if (!vms->nr_pages) return; if (vms->clear_ptes) return reattach_vmas(mas_detach); /* * Aborting cannot just call the vm_ops open() because they are often * not symmetrical and state data has been lost. Resort to the old * failure method of leaving a gap where the MAP_FIXED mapping failed. */ mas_set_range(mas, vms->start, vms->end - 1); mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); /* Clean up the insertion of the unfortunate gap */ vms_complete_munmap_vmas(vms, mas_detach); } /* * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping * and clean up any pre-existing VMAs. * * @map: Mapping state. * @uf: Userfaultfd context list. * * Returns: 0 on success, error code otherwise. */ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) { int error; struct vma_iterator *vmi = map->vmi; struct vma_munmap_struct *vms = &map->vms; /* Find the first overlapping VMA and initialise unmap state. */ vms->vma = vma_find(vmi, map->end); init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, /* unlock = */ false); /* OK, we have overlapping VMAs - prepare to unmap them. */ if (vms->vma) { mt_init_flags(&map->mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(map->mt_detach); mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0); /* Prepare to unmap any existing mapping in the area */ error = vms_gather_munmap_vmas(vms, &map->mas_detach); if (error) { /* On error VMAs will already have been reattached. */ vms->nr_pages = 0; return error; } map->next = vms->next; map->prev = vms->prev; } else { map->next = vma_iter_next_rewind(vmi, &map->prev); } /* Check against address space limit. */ if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. */ if (accountable_mapping(map->file, map->flags)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { error = security_vm_enough_memory_mm(map->mm, map->charged); if (error) return error; } vms->nr_accounted = 0; map->flags |= VM_ACCOUNT; } /* * Clear PTEs while the vma is still in the tree so that rmap * cannot race with the freeing later in the truncate scenario. * This is also needed for mmap_file(), which is why vm_ops * close function is called. */ vms_clean_up_area(vms, &map->mas_detach); return 0; } static int __mmap_new_file_vma(struct mmap_state *map, struct vm_area_struct *vma) { struct vma_iterator *vmi = map->vmi; int error; vma->vm_file = get_file(map->file); if (!map->file->f_op->mmap) return 0; error = mmap_file(vma->vm_file, vma); if (error) { fput(vma->vm_file); vma->vm_file = NULL; vma_iter_set(vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ unmap_region(&vmi->mas, vma, map->prev, map->next); return error; } /* Drivers cannot alter the address of the VMA. */ WARN_ON_ONCE(map->addr != vma->vm_start); /* * Drivers should not permit writability when previously it was * disallowed. */ VM_WARN_ON_ONCE(map->flags != vma->vm_flags && !(map->flags & VM_MAYWRITE) && (vma->vm_flags & VM_MAYWRITE)); map->flags = vma->vm_flags; return 0; } /* * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not * possible. * * @map: Mapping state. * @vmap: Output pointer for the new VMA. * * Returns: Zero on success, or an error. */ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) { struct vma_iterator *vmi = map->vmi; int error = 0; struct vm_area_struct *vma; /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ vma = vm_area_alloc(map->mm); if (!vma) return -ENOMEM; vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); vm_flags_init(vma, map->flags); vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { error = -ENOMEM; goto free_vma; } if (map->file) error = __mmap_new_file_vma(map, vma); else if (map->flags & VM_SHARED) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); if (error) goto free_iter_vma; #ifdef CONFIG_SPARC64 /* TODO: Fix SPARC ADI! */ WARN_ON_ONCE(!arch_validate_flags(map->flags)); #endif /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); vma_iter_store_new(vmi, vma); map->mm->map_count++; vma_link_file(vma); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. */ if (!vma_is_anonymous(vma)) khugepaged_enter_vma(vma, map->flags); ksm_add_vma(vma); *vmap = vma; return 0; free_iter_vma: vma_iter_free(vmi); free_vma: vm_area_free(vma); return error; } /* * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping * statistics, handle locking and finalise the VMA. * * @map: Mapping state. * @vma: Merged or newly allocated VMA for the mmap()'d region. */ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; unsigned long vm_flags = vma->vm_flags; perf_event_mmap(vma); /* Unmap any existing mapping in the area. */ vms_complete_munmap_vmas(&map->vms, &map->mas_detach); vm_stat_account(mm, vma->vm_flags, map->pglen); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(mm)) vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += map->pglen; } if (vma->vm_file) uprobe_mmap(vma); /* * New (or expanded) vma always get soft dirty status. * Otherwise user-space soft-dirty page tracker won't * be able to distinguish situation when vma area unmapped, * then new mapped in-place (which must be aimed as * a completely new data area). */ vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } /* * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that * specifies it. * * This is called prior to any merge attempt, and updates whitelisted fields * that are permitted to be updated by the caller. * * All but user-defined fields will be pre-populated with original values. * * Returns 0 on success, or an error code otherwise. */ static int call_mmap_prepare(struct mmap_state *map) { int err; struct vm_area_desc desc = { .mm = map->mm, .start = map->addr, .end = map->end, .pgoff = map->pgoff, .file = map->file, .vm_flags = map->flags, .page_prot = map->page_prot, }; /* Invoke the hook. */ err = __call_mmap_prepare(map->file, &desc); if (err) return err; /* Update fields permitted to be changed. */ map->pgoff = desc.pgoff; map->file = desc.file; map->flags = desc.vm_flags; map->page_prot = desc.page_prot; /* User-defined fields. */ map->vm_ops = desc.vm_ops; map->vm_private_data = desc.private_data; return 0; } static void set_vma_user_defined_fields(struct vm_area_struct *vma, struct mmap_state *map) { if (map->vm_ops) vma->vm_ops = map->vm_ops; vma->vm_private_data = map->vm_private_data; } static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; int error; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); error = __mmap_prepare(&map, uf); if (!error && have_mmap_prepare) error = call_mmap_prepare(&map); if (error) goto abort_munmap; /* Attempt to merge with adjacent VMAs... */ if (map.prev || map.next) { VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL); vma = vma_merge_new_range(&vmg); } /* ...but if we can't, allocate a new VMA. */ if (!vma) { error = __mmap_new_vma(&map, &vma); if (error) goto unacct_error; } if (have_mmap_prepare) set_vma_user_defined_fields(vma, &map); __mmap_complete(&map, vma); return addr; /* Accounting was done by __mmap_prepare(). */ unacct_error: if (map.charged) vm_unacct_memory(map.charged); abort_munmap: vms_abort_munmap_vmas(&map.vms, &map.mas_detach); return error; } /** * mmap_region() - Actually perform the userland mapping of a VMA into * current->mm with known, aligned and overflow-checked @addr and @len, and * correctly determined VMA flags @vm_flags and page offset @pgoff. * * This is an internal memory management function, and should not be used * directly. * * The caller must write-lock current->mm->mmap_lock. * * @file: If a file-backed mapping, a pointer to the struct file describing the * file to be mapped, otherwise NULL. * @addr: The page-aligned address at which to perform the mapping. * @len: The page-aligned, non-zero, length of the mapping. * @vm_flags: The VMA flags which should be applied to the mapping. * @pgoff: If @file is specified, the page offset into the file, if not then * the virtual page offset in memory of the anonymous mapping. * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap * events. * * Returns: Either an error, or the address at which the requested mapping has * been performed. */ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { unsigned long ret; bool writable_file_mapping = false; mmap_assert_write_locked(current->mm); /* Check to see if MDWE is applicable. */ if (map_deny_write_exec(vm_flags, vm_flags)) return -EACCES; /* Allow architectures to sanity-check the vm_flags. */ if (!arch_validate_flags(vm_flags)) return -EINVAL; /* Map writable and ensure this isn't a sealed memfd. */ if (file && is_shared_maywrite(vm_flags)) { int error = mapping_map_writable(file->f_mapping); if (error) return error; writable_file_mapping = true; } ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); /* Clear our write mapping regardless of error. */ if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); validate_mm(current->mm); return ret; } /* * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, * @flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. */ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. */ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) return -ENOMEM; if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; /* * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. */ vmg.just_expand = true; if (vma_merge_new_range(&vmg)) goto out; else if (vmg_nomem(&vmg)) goto unacct_fail; } if (vma) vma_iter_next_range(vmi); /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); if (!vma) goto unacct_fail; vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; validate_mm(mm); ksm_add_vma(vma); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vm_flags_set(vma, VM_SOFTDIRTY); return 0; mas_store_fail: vm_area_free(vma); unacct_fail: vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } /** * unmapped_area() - Find an area between the low_limit and the high_limit with * the correct alignment and offset, all from @info. Note: current->mm is used * for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ unsigned long unmapped_area(struct vm_unmapped_area_info *info) { unsigned long length, gap; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) return -ENOMEM; /* * Adjust for the gap first so it doesn't interfere with the * later alignment. The first step is the minimum needed to * fulill the start gap, the next steps is the minimum to align * that. It is the minimum needed to fulill both. */ gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; vma_iter_reset(&vmi); goto retry; } } else { tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { low_limit = vm_end_gap(tmp); vma_iter_reset(&vmi); goto retry; } } return gap; } /** * unmapped_area_topdown() - Find an area between the low_limit and the * high_limit with the correct alignment and offset at the highest available * address, all from @info. Note: current->mm is used for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { unsigned long length, gap, gap_end; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) return -ENOMEM; gap = vma_iter_end(&vmi) - info->length; gap -= (gap - info->align_offset) & info->align_mask; gap_end = vma_iter_end(&vmi); tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap_end) { high_limit = vm_start_gap(tmp); vma_iter_reset(&vmi); goto retry; } } else { tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { high_limit = tmp->vm_start; vma_iter_reset(&vmi); goto retry; } } return gap; } /* * Verify that the stack growth is acceptable and * update accounting. This is shared with both the * grow-up and grow-down cases. */ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; unsigned long new_start; /* address space limit tests */ if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ if (size > rlimit(RLIMIT_STACK)) return -ENOMEM; /* mlock limit tests */ if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : vma->vm_end - size; if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; /* * Overcommit.. This must be the final test, as it will * update security statistics. */ if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; return 0; } #if defined(CONFIG_STACK_GROWSUP) /* * PA-RISC uses this for its stack. * vma is the last one with address > vma->vm_end. Have to extend vma. */ int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next; unsigned long gap_addr; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; mmap_assert_write_locked(mm); /* Guard against exceeding limits of the address space. */ address &= PAGE_MASK; if (address >= (TASK_SIZE & PAGE_MASK)) return -ENOMEM; address += PAGE_SIZE; /* Enforce stack_guard_gap */ gap_addr = address + stack_guard_gap; /* Guard against overflow */ if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } if (next) vma_iter_prev_range_limit(&vmi, address); vma_iter_config(&vmi, vma->vm_start, address); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { unsigned long size, grow; size = address - vma->vm_start; grow = (address - vma->vm_end) >> PAGE_SHIFT; error = -ENOMEM; if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ vma_iter_store_overwrite(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP */ /* * vma is the first one with address < vma->vm_start. Have to extend vma. * mmap_lock held for writing. */ int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSDOWN)) return -EFAULT; mmap_assert_write_locked(mm); address &= PAGE_MASK; if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; /* Enforce stack_guard_gap */ prev = vma_prev(&vmi); /* Check that both stack segments have the same anon_vma? */ if (prev) { if (!(prev->vm_flags & VM_GROWSDOWN) && vma_is_accessible(prev) && (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; } if (prev) vma_iter_next_range_limit(&vmi, vma->vm_start); vma_iter_config(&vmi, address, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { unsigned long size, grow; size = vma->vm_end - address; grow = (vma->vm_start - address) >> PAGE_SHIFT; error = -ENOMEM; if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. */ vma_iter_store_overwrite(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } int __vm_munmap(unsigned long start, size_t len, bool unlock) { int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, start); if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); if (ret || !unlock) mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); return ret; } /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_rwsem is taken here. */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long charged = vma_pages(vma); if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; /* * The vm_pgoff of a purely anonymous vma should be irrelevant * until its first write fault, when page's anon_vma and index * are set. But now set the vm_pgoff it will almost certainly * end up with (unless mremap moves it elsewhere before that * first wfault), so /proc/pid/maps tells a consistent story. * * By setting it to reflect the virtual start address of the * vma, merges and splits can happen in a seamless way, just * using the existing file pgoff checks and manipulations. * Similarly in do_mmap and in do_brk_flags. */ if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } if (vma_link(mm, vma)) { if (vma->vm_flags & VM_ACCOUNT) vm_unacct_memory(charged); return -ENOMEM; } return 0; }
212 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM capability #if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CAPABILITY_H #include <linux/cred.h> #include <linux/tracepoint.h> #include <linux/user_namespace.h> /** * cap_capable - called after it's determined if a task has a particular * effective capability * * @cred: The credentials used * @target_ns: The user namespace of the resource being accessed * @capable_ns: The user namespace in which the credential provides the * capability to access the targeted resource. * This will be NULL if ret is not 0. * @cap: The capability to check for * @ret: The return value of the check: 0 if it does, -ve if it does not * * Allows to trace calls to cap_capable in commoncap.c */ TRACE_EVENT(cap_capable, TP_PROTO(const struct cred *cred, struct user_namespace *target_ns, const struct user_namespace *capable_ns, int cap, int ret), TP_ARGS(cred, target_ns, capable_ns, cap, ret), TP_STRUCT__entry( __field(const struct cred *, cred) __field(struct user_namespace *, target_ns) __field(const struct user_namespace *, capable_ns) __field(int, cap) __field(int, ret) ), TP_fast_assign( __entry->cred = cred; __entry->target_ns = target_ns; __entry->capable_ns = ret == 0 ? capable_ns : NULL; __entry->cap = cap; __entry->ret = ret; ), TP_printk("cred %p, target_ns %p, capable_ns %p, cap %d, ret %d", __entry->cred, __entry->target_ns, __entry->capable_ns, __entry->cap, __entry->ret) ); #endif /* _TRACE_CAPABILITY_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
23 23 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 // SPDX-License-Identifier: GPL-2.0-or-later /* * The "hash function" used as the core of the ChaCha stream cipher (RFC7539) * * Copyright (C) 2015 Martin Willi */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/bitops.h> #include <linux/string.h> #include <linux/unaligned.h> #include <crypto/chacha.h> static void chacha_permute(struct chacha_state *state, int nrounds) { u32 *x = state->x; int i; /* whitelist the allowed round counts */ WARN_ON_ONCE(nrounds != 20 && nrounds != 12); for (i = 0; i < nrounds; i += 2) { x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16); x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16); x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16); x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12); x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12); x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12); x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12); x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8); x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8); x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8); x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7); x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7); x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7); x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7); x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16); x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16); x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16); x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16); x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12); x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12); x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12); x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8); x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8); x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8); x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8); x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7); x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7); x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7); } } /** * chacha_block_generic - generate one keystream block and increment block counter * @state: input state matrix * @out: output keystream block * @nrounds: number of rounds (20 or 12; 20 is recommended) * * This is the ChaCha core, a function from 64-byte strings to 64-byte strings. * The caller has already converted the endianness of the input. This function * also handles incrementing the block counter in the input matrix. */ void chacha_block_generic(struct chacha_state *state, u8 out[CHACHA_BLOCK_SIZE], int nrounds) { struct chacha_state permuted_state = *state; int i; chacha_permute(&permuted_state, nrounds); for (i = 0; i < ARRAY_SIZE(state->x); i++) put_unaligned_le32(permuted_state.x[i] + state->x[i], &out[i * sizeof(u32)]); state->x[12]++; } EXPORT_SYMBOL(chacha_block_generic); /** * hchacha_block_generic - abbreviated ChaCha core, for XChaCha * @state: input state matrix * @out: the output words * @nrounds: number of rounds (20 or 12; 20 is recommended) * * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha * skips the final addition of the initial state, and outputs only certain words * of the state. It should not be used for streaming directly. */ void hchacha_block_generic(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds) { struct chacha_state permuted_state = *state; chacha_permute(&permuted_state, nrounds); memcpy(&out[0], &permuted_state.x[0], 16); memcpy(&out[4], &permuted_state.x[12], 16); } EXPORT_SYMBOL(hchacha_block_generic);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_HUGETLB_H #define _ASM_GENERIC_HUGETLB_H #include <linux/swap.h> #include <linux/swapops.h> static inline unsigned long huge_pte_write(pte_t pte) { return pte_write(pte); } static inline unsigned long huge_pte_dirty(pte_t pte) { return pte_dirty(pte); } static inline pte_t huge_pte_mkwrite(pte_t pte) { return pte_mkwrite_novma(pte); } #ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT static inline pte_t huge_pte_wrprotect(pte_t pte) { return pte_wrprotect(pte); } #endif static inline pte_t huge_pte_mkdirty(pte_t pte) { return pte_mkdirty(pte); } static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) { return pte_modify(pte, newprot); } #ifndef __HAVE_ARCH_HUGE_PTE_MKUFFD_WP static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { return huge_pte_wrprotect(pte_mkuffd_wp(pte)); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) { return pte_clear_uffd_wp(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_UFFD_WP static inline int huge_pte_uffd_wp(pte_t pte) { return pte_uffd_wp(pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { pte_clear(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { free_pgd_range(tlb, addr, end, floor, ceiling); } #endif #ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { set_pte_at(mm, addr, ptep, pte); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { return ptep_get_and_clear(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return ptep_clear_flush(vma, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTE_NONE static inline int huge_pte_none(pte_t pte) { return pte_none(pte); } #endif /* Please refer to comments above pte_none_mostly() for the usage */ #ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY static inline int huge_pte_none_mostly(pte_t pte) { return huge_pte_none(pte) || is_pte_marker(pte); } #endif #ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { return 0; } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep_set_wrprotect(mm, addr, ptep); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { return ptep_set_access_flags(vma, addr, ptep, pte, dirty); } #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED static inline bool gigantic_page_runtime_supported(void) { return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE); } #endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */ #endif /* _ASM_GENERIC_HUGETLB_H */
21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM csd #if !defined(_TRACE_CSD_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CSD_H #include <linux/tracepoint.h> TRACE_EVENT(csd_queue_cpu, TP_PROTO(const unsigned int cpu, unsigned long callsite, smp_call_func_t func, call_single_data_t *csd), TP_ARGS(cpu, callsite, func, csd), TP_STRUCT__entry( __field(unsigned int, cpu) __field(void *, callsite) __field(void *, func) __field(void *, csd) ), TP_fast_assign( __entry->cpu = cpu; __entry->callsite = (void *)callsite; __entry->func = func; __entry->csd = csd; ), TP_printk("cpu=%u callsite=%pS func=%ps csd=%p", __entry->cpu, __entry->callsite, __entry->func, __entry->csd) ); /* * Tracepoints for a function which is called as an effect of smp_call_function.* */ DECLARE_EVENT_CLASS(csd_function, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd), TP_STRUCT__entry( __field(void *, func) __field(void *, csd) ), TP_fast_assign( __entry->func = func; __entry->csd = csd; ), TP_printk("func=%ps, csd=%p", __entry->func, __entry->csd) ); DEFINE_EVENT(csd_function, csd_function_entry, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd) ); DEFINE_EVENT(csd_function, csd_function_exit, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd) ); #endif /* _TRACE_CSD_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
293 270 257 354 156 257 155 257 293 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2020 Google LLC * Author: Will Deacon <will@kernel.org> */ #ifndef __ARM64_KVM_PGTABLE_H__ #define __ARM64_KVM_PGTABLE_H__ #include <linux/bits.h> #include <linux/kvm_host.h> #include <linux/types.h> #define KVM_PGTABLE_FIRST_LEVEL -1 #define KVM_PGTABLE_LAST_LEVEL 3 /* * The largest supported block sizes for KVM (no 52-bit PA support): * - 4K (level 1): 1GB * - 16K (level 2): 32MB * - 64K (level 2): 512MB */ #ifdef CONFIG_ARM64_4K_PAGES #define KVM_PGTABLE_MIN_BLOCK_LEVEL 1 #else #define KVM_PGTABLE_MIN_BLOCK_LEVEL 2 #endif #define kvm_lpa2_is_enabled() system_supports_lpa2() static inline u64 kvm_get_parange_max(void) { if (kvm_lpa2_is_enabled() || (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && PAGE_SHIFT == 16)) return ID_AA64MMFR0_EL1_PARANGE_52; else return ID_AA64MMFR0_EL1_PARANGE_48; } static inline u64 kvm_get_parange(u64 mmfr0) { u64 parange_max = kvm_get_parange_max(); u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); if (parange > parange_max) parange = parange_max; return parange; } typedef u64 kvm_pte_t; #define KVM_PTE_VALID BIT(0) #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) #define KVM_PTE_ADDR_51_48 GENMASK(15, 12) #define KVM_PTE_ADDR_MASK_LPA2 GENMASK(49, PAGE_SHIFT) #define KVM_PTE_ADDR_51_50_LPA2 GENMASK(9, 8) #define KVM_PHYS_INVALID (-1ULL) #define KVM_PTE_TYPE BIT(1) #define KVM_PTE_TYPE_BLOCK 0 #define KVM_PTE_TYPE_PAGE 1 #define KVM_PTE_TYPE_TABLE 1 #define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) #define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) #define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) #define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 #define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) #define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) #define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) #define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) #define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) #define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 #define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) #define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) #define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) #define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) #define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) #define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) #define KVM_MAX_OWNER_ID 1 /* * Used to indicate a pte for which a 'break-before-make' sequence is in * progress. */ #define KVM_INVALID_PTE_LOCKED BIT(10) static inline bool kvm_pte_valid(kvm_pte_t pte) { return pte & KVM_PTE_VALID; } static inline u64 kvm_pte_to_phys(kvm_pte_t pte) { u64 pa; if (kvm_lpa2_is_enabled()) { pa = pte & KVM_PTE_ADDR_MASK_LPA2; pa |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, pte) << 50; } else { pa = pte & KVM_PTE_ADDR_MASK; if (PAGE_SHIFT == 16) pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; } return pa; } static inline kvm_pte_t kvm_phys_to_pte(u64 pa) { kvm_pte_t pte; if (kvm_lpa2_is_enabled()) { pte = pa & KVM_PTE_ADDR_MASK_LPA2; pa &= GENMASK(51, 50); pte |= FIELD_PREP(KVM_PTE_ADDR_51_50_LPA2, pa >> 50); } else { pte = pa & KVM_PTE_ADDR_MASK; if (PAGE_SHIFT == 16) { pa &= GENMASK(51, 48); pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); } } return pte; } static inline kvm_pfn_t kvm_pte_to_pfn(kvm_pte_t pte) { return __phys_to_pfn(kvm_pte_to_phys(pte)); } static inline u64 kvm_granule_shift(s8 level) { /* Assumes KVM_PGTABLE_LAST_LEVEL is 3 */ return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); } static inline u64 kvm_granule_size(s8 level) { return BIT(kvm_granule_shift(level)); } static inline bool kvm_level_supports_block_mapping(s8 level) { return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; } static inline u32 kvm_supported_block_sizes(void) { s8 level = KVM_PGTABLE_MIN_BLOCK_LEVEL; u32 r = 0; for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) r |= BIT(kvm_granule_shift(level)); return r; } static inline bool kvm_is_block_size_supported(u64 size) { bool is_power_of_two = IS_ALIGNED(size, size); return is_power_of_two && (size & kvm_supported_block_sizes()); } /** * struct kvm_pgtable_mm_ops - Memory management callbacks. * @zalloc_page: Allocate a single zeroed memory page. * The @arg parameter can be used by the walker * to pass a memcache. The initial refcount of * the page is 1. * @zalloc_pages_exact: Allocate an exact number of zeroed memory pages. * The @size parameter is in bytes, and is rounded * up to the next page boundary. The resulting * allocation is physically contiguous. * @free_pages_exact: Free an exact number of memory pages previously * allocated by zalloc_pages_exact. * @free_unlinked_table: Free an unlinked paging structure by unlinking and * dropping references. * @get_page: Increment the refcount on a page. * @put_page: Decrement the refcount on a page. When the * refcount reaches 0 the page is automatically * freed. * @page_count: Return the refcount of a page. * @phys_to_virt: Convert a physical address into a virtual * address mapped in the current context. * @virt_to_phys: Convert a virtual address mapped in the current * context into a physical address. * @dcache_clean_inval_poc: Clean and invalidate the data cache to the PoC * for the specified memory address range. * @icache_inval_pou: Invalidate the instruction cache to the PoU * for the specified memory address range. */ struct kvm_pgtable_mm_ops { void* (*zalloc_page)(void *arg); void* (*zalloc_pages_exact)(size_t size); void (*free_pages_exact)(void *addr, size_t size); void (*free_unlinked_table)(void *addr, s8 level); void (*get_page)(void *addr); void (*put_page)(void *addr); int (*page_count)(void *addr); void* (*phys_to_virt)(phys_addr_t phys); phys_addr_t (*virt_to_phys)(void *addr); void (*dcache_clean_inval_poc)(void *addr, size_t size); void (*icache_inval_pou)(void *addr, size_t size); }; /** * enum kvm_pgtable_stage2_flags - Stage-2 page-table flags. * @KVM_PGTABLE_S2_NOFWB: Don't enforce Normal-WB even if the CPUs have * ARM64_HAS_STAGE2_FWB. * @KVM_PGTABLE_S2_IDMAP: Only use identity mappings. */ enum kvm_pgtable_stage2_flags { KVM_PGTABLE_S2_NOFWB = BIT(0), KVM_PGTABLE_S2_IDMAP = BIT(1), }; /** * enum kvm_pgtable_prot - Page-table permissions and attributes. * @KVM_PGTABLE_PROT_X: Execute permission. * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes. * @KVM_PGTABLE_PROT_NORMAL_NC: Normal noncacheable attributes. * @KVM_PGTABLE_PROT_SW0: Software bit 0. * @KVM_PGTABLE_PROT_SW1: Software bit 1. * @KVM_PGTABLE_PROT_SW2: Software bit 2. * @KVM_PGTABLE_PROT_SW3: Software bit 3. */ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_X = BIT(0), KVM_PGTABLE_PROT_W = BIT(1), KVM_PGTABLE_PROT_R = BIT(2), KVM_PGTABLE_PROT_DEVICE = BIT(3), KVM_PGTABLE_PROT_NORMAL_NC = BIT(4), KVM_PGTABLE_PROT_SW0 = BIT(55), KVM_PGTABLE_PROT_SW1 = BIT(56), KVM_PGTABLE_PROT_SW2 = BIT(57), KVM_PGTABLE_PROT_SW3 = BIT(58), }; #define KVM_PGTABLE_PROT_RW (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) #define KVM_PGTABLE_PROT_RWX (KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X) #define PKVM_HOST_MEM_PROT KVM_PGTABLE_PROT_RWX #define PKVM_HOST_MMIO_PROT KVM_PGTABLE_PROT_RW #define PAGE_HYP KVM_PGTABLE_PROT_RW #define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) #define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, enum kvm_pgtable_prot prot); /** * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk. * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid * entries. * @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their * children. * @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their * children. * @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared * with other software walkers. * @KVM_PGTABLE_WALK_HANDLE_FAULT: Indicates the page-table walk was * invoked from a fault handler. * @KVM_PGTABLE_WALK_SKIP_BBM_TLBI: Visit and update table entries * without Break-before-make's * TLB invalidation. * @KVM_PGTABLE_WALK_SKIP_CMO: Visit and update table entries * without Cache maintenance * operations required. */ enum kvm_pgtable_walk_flags { KVM_PGTABLE_WALK_LEAF = BIT(0), KVM_PGTABLE_WALK_TABLE_PRE = BIT(1), KVM_PGTABLE_WALK_TABLE_POST = BIT(2), KVM_PGTABLE_WALK_SHARED = BIT(3), KVM_PGTABLE_WALK_HANDLE_FAULT = BIT(4), KVM_PGTABLE_WALK_SKIP_BBM_TLBI = BIT(5), KVM_PGTABLE_WALK_SKIP_CMO = BIT(6), }; struct kvm_pgtable_visit_ctx { kvm_pte_t *ptep; kvm_pte_t old; void *arg; struct kvm_pgtable_mm_ops *mm_ops; u64 start; u64 addr; u64 end; s8 level; enum kvm_pgtable_walk_flags flags; }; typedef int (*kvm_pgtable_visitor_fn_t)(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit); static inline bool kvm_pgtable_walk_shared(const struct kvm_pgtable_visit_ctx *ctx) { return ctx->flags & KVM_PGTABLE_WALK_SHARED; } /** * struct kvm_pgtable_walker - Hook into a page-table walk. * @cb: Callback function to invoke during the walk. * @arg: Argument passed to the callback function. * @flags: Bitwise-OR of flags to identify the entry types on which to * invoke the callback function. */ struct kvm_pgtable_walker { const kvm_pgtable_visitor_fn_t cb; void * const arg; const enum kvm_pgtable_walk_flags flags; }; /* * RCU cannot be used in a non-kernel context such as the hyp. As such, page * table walkers used in hyp do not call into RCU and instead use other * synchronization mechanisms (such as a spinlock). */ #if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__) typedef kvm_pte_t *kvm_pteref_t; static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker, kvm_pteref_t pteref) { return pteref; } static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { /* * Due to the lack of RCU (or a similar protection scheme), only * non-shared table walkers are allowed in the hypervisor. */ if (walker->flags & KVM_PGTABLE_WALK_SHARED) return -EPERM; return 0; } static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) {} static inline bool kvm_pgtable_walk_lock_held(void) { return true; } #else typedef kvm_pte_t __rcu *kvm_pteref_t; static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker, kvm_pteref_t pteref) { return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED)); } static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { if (walker->flags & KVM_PGTABLE_WALK_SHARED) rcu_read_lock(); return 0; } static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) { if (walker->flags & KVM_PGTABLE_WALK_SHARED) rcu_read_unlock(); } static inline bool kvm_pgtable_walk_lock_held(void) { return rcu_read_lock_held(); } #endif /** * struct kvm_pgtable - KVM page-table. * @ia_bits: Maximum input address size, in bits. * @start_level: Level at which the page-table walk starts. * @pgd: Pointer to the first top-level entry of the page-table. * @mm_ops: Memory management callbacks. * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. * @flags: Stage-2 page-table flags. * @force_pte_cb: Function that returns true if page level mappings must * be used instead of block mappings. */ struct kvm_pgtable { union { struct rb_root_cached pkvm_mappings; struct { u32 ia_bits; s8 start_level; kvm_pteref_t pgd; struct kvm_pgtable_mm_ops *mm_ops; /* Stage-2 only */ enum kvm_pgtable_stage2_flags flags; kvm_pgtable_force_pte_cb_t force_pte_cb; }; }; struct kvm_s2_mmu *mmu; }; /** * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table. * @pgt: Uninitialised page-table structure to initialise. * @va_bits: Maximum virtual address bits. * @mm_ops: Memory management callbacks. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, struct kvm_pgtable_mm_ops *mm_ops); /** * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). * * The page-table is assumed to be unreachable by any hardware walkers prior * to freeing and therefore no TLB invalidation is performed. */ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt); /** * kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). * @addr: Virtual address at which to place the mapping. * @size: Size of the mapping. * @phys: Physical address of the memory to map. * @prot: Permissions and attributes for the mapping. * * The offset of @addr within a page is ignored, @size is rounded-up to * the next page boundary and @phys is rounded-down to the previous page * boundary. * * If device attributes are not explicitly requested in @prot, then the * mapping will be normal, cacheable. Attempts to install a new mapping * for a virtual address that is already mapped will be rejected with an * error and a WARN(). * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot); /** * kvm_pgtable_hyp_unmap() - Remove a mapping from a hypervisor stage-1 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). * @addr: Virtual address from which to remove the mapping. * @size: Size of the mapping. * * The offset of @addr within a page is ignored, @size is rounded-up to * the next page boundary and @phys is rounded-down to the previous page * boundary. * * TLB invalidation is performed for each page-table entry cleared during the * unmapping operation and the reference count for the page-table page * containing the cleared entry is decremented, with unreferenced pages being * freed. The unmapping operation will stop early if it encounters either an * invalid page-table entry or a valid block mapping which maps beyond the range * being unmapped. * * Return: Number of bytes unmapped, which may be 0. */ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); /** * kvm_get_vtcr() - Helper to construct VTCR_EL2 * @mmfr0: Sanitized value of SYS_ID_AA64MMFR0_EL1 register. * @mmfr1: Sanitized value of SYS_ID_AA64MMFR1_EL1 register. * @phys_shfit: Value to set in VTCR_EL2.T0SZ. * * The VTCR value is common across all the physical CPUs on the system. * We use system wide sanitised values to fill in different fields, * except for Hardware Management of Access Flags. HA Flag is set * unconditionally on all CPUs, as it is safe to run with or without * the feature and the bit is RES0 on CPUs that don't support it. * * Return: VTCR_EL2 value */ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); /** * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD * @vtcr: Content of the VTCR register. * * Return: the size (in bytes) of the stage-2 PGD */ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr); /** * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. * @pgt: Uninitialised page-table structure to initialise. * @mmu: S2 MMU context for this S2 translation * @mm_ops: Memory management callbacks. * @flags: Stage-2 configuration flags. * @force_pte_cb: Function that returns true if page level mappings must * be used instead of block mappings. * * Return: 0 on success, negative error code on failure. */ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, enum kvm_pgtable_stage2_flags flags, kvm_pgtable_force_pte_cb_t force_pte_cb); static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops) { return __kvm_pgtable_stage2_init(pgt, mmu, mm_ops, 0, NULL); } /** * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * * The page-table is assumed to be unreachable by any hardware walkers prior * to freeing and therefore no TLB invalidation is performed. */ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); /** * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure. * @mm_ops: Memory management callbacks. * @pgtable: Unlinked stage-2 paging structure to be freed. * @level: Level of the stage-2 paging structure to be freed. * * The page-table is assumed to be unreachable by any hardware walkers prior to * freeing and therefore no TLB invalidation is performed. */ void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level); /** * kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @phys: Physical address of the memory to map. * @level: Starting level of the stage-2 paging structure to be created. * @prot: Permissions and attributes for the mapping. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. * @force_pte: Force mappings to PAGE_SIZE granularity. * * Returns an unlinked page-table tree. This new page-table tree is * not reachable (i.e., it is unlinked) from the root pgd and it's * therefore unreachableby the hardware page-table walker. No TLB * invalidation or CMOs are performed. * * If device attributes are not explicitly requested in @prot, then the * mapping will be normal, cacheable. * * Return: The fully populated (unlinked) stage-2 paging structure, or * an ERR_PTR(error) on failure. */ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte); /** * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address at which to place the mapping. * @size: Size of the mapping. * @phys: Physical address of the memory to map. * @prot: Permissions and attributes for the mapping. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. * @flags: Flags to control the page-table walk (ex. a shared walk) * * The offset of @addr within a page is ignored, @size is rounded-up to * the next page boundary and @phys is rounded-down to the previous page * boundary. * * If device attributes are not explicitly requested in @prot, then the * mapping will be normal, cacheable. * * Note that the update of a valid leaf PTE in this function will be aborted, * if it's trying to recreate the exact same mapping or only change the access * permissions. Instead, the vCPU will exit one more time from guest if still * needed and then go through the path of relaxing permissions. * * Note that this function will both coalesce existing table entries and split * existing block mappings, relying on page-faults to fault back areas outside * of the new mapping lazily. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, void *mc, enum kvm_pgtable_walk_flags flags); /** * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to * track ownership. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Base intermediate physical address to annotate. * @size: Size of the annotated range. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. * @owner_id: Unique identifier for the owner of the page. * * By default, all page-tables are owned by identifier 0. This function can be * used to mark portions of the IPA space as owned by other entities. When a * stage 2 is used with identity-mappings, these annotations allow to use the * page-table data structure as a simple rmap. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc, u8 owner_id); /** * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address from which to remove the mapping. * @size: Size of the mapping. * * The offset of @addr within a page is ignored and @size is rounded-up to * the next page boundary. * * TLB invalidation is performed for each page-table entry cleared during the * unmapping operation and the reference count for the page-table page * containing the cleared entry is decremented, with unreferenced pages being * freed. Unmapping a cacheable page will ensure that it is clean to the PoC if * FWB is not supported by the CPU. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); /** * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range * without TLB invalidation. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address from which to write-protect, * @size: Size of the range. * * The offset of @addr within a page is ignored and @size is rounded-up to * the next page boundary. * * Note that it is the caller's responsibility to invalidate the TLB after * calling this function to ensure that the updated permissions are visible * to the CPUs. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size); /** * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address to identify the page-table entry. * @flags: Flags to control the page-table walk (ex. a shared walk) * * The offset of @addr within a page is ignored. * * If there is a valid, leaf page-table entry used to translate @addr, then * set the access flag in that entry. */ void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_walk_flags flags); /** * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access * flag in a page-table entry. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address to identify the page-table entry. * @size: Size of the address range to visit. * @mkold: True if the access flag should be cleared. * * The offset of @addr within a page is ignored. * * Tests and conditionally clears the access flag for every valid, leaf * page-table entry used to translate the range [@addr, @addr + @size). * * Note that it is the caller's responsibility to invalidate the TLB after * calling this function to ensure that the updated permissions are visible * to the CPUs. * * Return: True if any of the visited PTEs had the access flag set. */ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold); /** * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a * page-table entry. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address to identify the page-table entry. * @prot: Additional permissions to grant for the mapping. * @flags: Flags to control the page-table walk (ex. a shared walk) * * The offset of @addr within a page is ignored. * * If there is a valid, leaf page-table entry used to translate @addr, then * relax the permissions in that entry according to the read, write and * execute permissions specified by @prot. No permissions are removed, and * TLB invalidation is performed after updating the entry. Software bits cannot * be set or cleared using kvm_pgtable_stage2_relax_perms(). * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags); /** * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point * of Coherency for guest stage-2 address * range. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address from which to flush. * @size: Size of the range. * * The offset of @addr within a page is ignored and @size is rounded-up to * the next page boundary. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); /** * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing * to PAGE_SIZE guest pages. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). * @addr: Intermediate physical address from which to split. * @size: Size of the range. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. * * The function tries to split any level 1 or 2 entry that overlaps * with the input range (given by @addr and @size). * * Return: 0 on success, negative error code on failure. Note that * kvm_pgtable_stage2_split() is best effort: it tries to break as many * blocks in the input range as allowed by @mc_capacity. */ int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_mmu_memory_cache *mc); /** * kvm_pgtable_walk() - Walk a page-table. * @pgt: Page-table structure initialised by kvm_pgtable_*_init(). * @addr: Input address for the start of the walk. * @size: Size of the range to walk. * @walker: Walker callback description. * * The offset of @addr within a page is ignored and @size is rounded-up to * the next page boundary. * * The walker will walk the page-table entries corresponding to the input * address range specified, visiting entries according to the walker flags. * Invalid entries are treated as leaf entries. The visited page table entry is * reloaded after invoking the walker callback, allowing the walker to descend * into a newly installed table. * * Returning a negative error code from the walker callback function will * terminate the walk immediately with the same error code. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker); /** * kvm_pgtable_get_leaf() - Walk a page-table and retrieve the leaf entry * with its level. * @pgt: Page-table structure initialised by kvm_pgtable_*_init() * or a similar initialiser. * @addr: Input address for the start of the walk. * @ptep: Pointer to storage for the retrieved PTE. * @level: Pointer to storage for the level of the retrieved PTE. * * The offset of @addr within a page is ignored. * * The walker will walk the page-table entries corresponding to the input * address specified, retrieving the leaf corresponding to this address. * Invalid entries are treated as leaf entries. * * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, kvm_pte_t *ptep, s8 *level); /** * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a * stage-2 Page-Table Entry. * @pte: Page-table entry * * Return: protection attributes of the page-table entry in the enum * kvm_pgtable_prot format. */ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte); /** * kvm_pgtable_hyp_pte_prot() - Retrieve the protection attributes of a stage-1 * Page-Table Entry. * @pte: Page-table entry * * Return: protection attributes of the page-table entry in the enum * kvm_pgtable_prot format. */ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte); /** * kvm_tlb_flush_vmid_range() - Invalidate/flush a range of TLB entries * * @mmu: Stage-2 KVM MMU struct * @addr: The base Intermediate physical address from which to invalidate * @size: Size of the range from the base to invalidate */ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, size_t size); #endif /* __ARM64_KVM_PGTABLE_H__ */
15 15 15 285 285 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2015 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #ifndef __ARM64_KVM_HYP_FAULT_H__ #define __ARM64_KVM_HYP_FAULT_H__ #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> static inline bool __fault_safe_to_translate(u64 esr) { u64 fsc = esr & ESR_ELx_FSC; if (esr_fsc_is_sea_ttw(esr) || esr_fsc_is_secc_ttw(esr)) return false; return !(fsc == ESR_ELx_FSC_EXTABT && (esr & ESR_ELx_FnV)); } static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) { int ret; u64 par, tmp; /* * Resolve the IPA the hard way using the guest VA. * * Stage-1 translation already validated the memory access * rights. As such, we can use the EL1 translation regime, and * don't have to distinguish between EL0 and EL1 access. * * We do need to save/restore PAR_EL1 though, as we haven't * saved the guest context yet, and we may return early... */ par = read_sysreg_par(); ret = system_supports_poe() ? __kvm_at(OP_AT_S1E1A, far) : __kvm_at(OP_AT_S1E1R, far); if (!ret) tmp = read_sysreg_par(); else tmp = SYS_PAR_EL1_F; /* back to the guest */ write_sysreg(par, par_el1); if (unlikely(tmp & SYS_PAR_EL1_F)) return false; /* Translation failed, back to guest */ /* Convert PAR to HPFAR format */ *hpfar = PAR_TO_HPFAR(tmp); return true; } /* * Checks for the conditions when HPFAR_EL2 is written, per ARM ARM R_FKLWR. */ static inline bool __hpfar_valid(u64 esr) { /* * CPUs affected by ARM erratum #834220 may incorrectly report a * stage-2 translation fault when a stage-1 permission fault occurs. * * Re-walk the page tables to determine if a stage-1 fault actually * occurred. */ if (cpus_have_final_cap(ARM64_WORKAROUND_834220) && esr_fsc_is_translation_fault(esr)) return false; if (esr_fsc_is_translation_fault(esr) || esr_fsc_is_access_flag_fault(esr)) return true; if ((esr & ESR_ELx_S1PTW) && esr_fsc_is_permission_fault(esr)) return true; return esr_fsc_is_addr_sz_fault(esr); } static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) { u64 hpfar; fault->far_el2 = read_sysreg_el2(SYS_FAR); fault->hpfar_el2 = 0; if (__hpfar_valid(esr)) hpfar = read_sysreg(hpfar_el2); else if (unlikely(!__fault_safe_to_translate(esr))) return true; else if (!__translate_far_to_hpfar(fault->far_el2, &hpfar)) return false; /* * Hijack HPFAR_EL2.NS (RES0 in Non-secure) to indicate a valid * HPFAR value. */ fault->hpfar_el2 = hpfar | HPFAR_EL2_NS; return true; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_BITOPS_FLS64_H_ #define _ASM_GENERIC_BITOPS_FLS64_H_ #include <asm/types.h> /** * fls64 - find last set bit in a 64-bit word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffsll, but returns the position of the most significant set bit. * * fls64(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 64. */ #if BITS_PER_LONG == 32 static __always_inline int fls64(__u64 x) { __u32 h = x >> 32; if (h) return fls(h) + 32; return fls(x); } #elif BITS_PER_LONG == 64 static __always_inline int fls64(__u64 x) { if (x == 0) return 0; return __fls(x) + 1; } #else #error BITS_PER_LONG not 32 or 64 #endif #endif /* _ASM_GENERIC_BITOPS_FLS64_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 /* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt_private.h * * Copyright (C) 2015, Google, Inc. * * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. * Heavily modified since then. */ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H #include <linux/fscrypt.h> #include <linux/minmax.h> #include <linux/siphash.h> #include <crypto/hash.h> #include <linux/blk-crypto.h> #define CONST_STRLEN(str) (sizeof(str) - 1) #define FSCRYPT_FILE_NONCE_SIZE 16 /* * Minimum size of an fscrypt master key. Note: a longer key will be required * if ciphers with a 256-bit security strength are used. This is just the * absolute minimum, which applies when only 128-bit encryption is used. */ #define FSCRYPT_MIN_KEY_SIZE 16 /* Maximum size of a raw fscrypt master key */ #define FSCRYPT_MAX_RAW_KEY_SIZE 64 /* Maximum size of a hardware-wrapped fscrypt master key */ #define FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE /* Maximum size of an fscrypt master key across both key types */ #define FSCRYPT_MAX_ANY_KEY_SIZE \ MAX(FSCRYPT_MAX_RAW_KEY_SIZE, FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE) /* * FSCRYPT_MAX_KEY_SIZE is defined in the UAPI header, but the addition of * hardware-wrapped keys has made it misleading as it's only for raw keys. * Don't use it in kernel code; use one of the above constants instead. */ #undef FSCRYPT_MAX_KEY_SIZE #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 /* Keep this in sync with include/uapi/linux/fscrypt.h */ #define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2 struct fscrypt_context_v1 { u8 version; /* FSCRYPT_CONTEXT_V1 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; struct fscrypt_context_v2 { u8 version; /* FSCRYPT_CONTEXT_V2 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; u8 log2_data_unit_size; u8 __reserved[3]; u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; /* * fscrypt_context - the encryption context of an inode * * This is the on-disk equivalent of an fscrypt_policy, stored alongside each * encrypted file usually in a hidden extended attribute. It contains the * fields from the fscrypt_policy, in order to identify the encryption algorithm * and key with which the file is encrypted. It also contains a nonce that was * randomly generated by fscrypt itself; this is used as KDF input or as a tweak * to cause different files to be encrypted differently. */ union fscrypt_context { u8 version; struct fscrypt_context_v1 v1; struct fscrypt_context_v2 v2; }; /* * Return the size expected for the given fscrypt_context based on its version * number, or 0 if the context version is unrecognized. */ static inline int fscrypt_context_size(const union fscrypt_context *ctx) { switch (ctx->version) { case FSCRYPT_CONTEXT_V1: BUILD_BUG_ON(sizeof(ctx->v1) != 28); return sizeof(ctx->v1); case FSCRYPT_CONTEXT_V2: BUILD_BUG_ON(sizeof(ctx->v2) != 40); return sizeof(ctx->v2); } return 0; } /* Check whether an fscrypt_context has a recognized version number and size */ static inline bool fscrypt_context_is_valid(const union fscrypt_context *ctx, int ctx_size) { return ctx_size >= 1 && ctx_size == fscrypt_context_size(ctx); } /* Retrieve the context's nonce, assuming the context was already validated */ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx) { switch (ctx->version) { case FSCRYPT_CONTEXT_V1: return ctx->v1.nonce; case FSCRYPT_CONTEXT_V2: return ctx->v2.nonce; } WARN_ON_ONCE(1); return NULL; } union fscrypt_policy { u8 version; struct fscrypt_policy_v1 v1; struct fscrypt_policy_v2 v2; }; /* * Return the size expected for the given fscrypt_policy based on its version * number, or 0 if the policy version is unrecognized. */ static inline int fscrypt_policy_size(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return sizeof(policy->v1); case FSCRYPT_POLICY_V2: return sizeof(policy->v2); } return 0; } /* Return the contents encryption mode of a valid encryption policy */ static inline u8 fscrypt_policy_contents_mode(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.contents_encryption_mode; case FSCRYPT_POLICY_V2: return policy->v2.contents_encryption_mode; } BUG(); } /* Return the filenames encryption mode of a valid encryption policy */ static inline u8 fscrypt_policy_fnames_mode(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.filenames_encryption_mode; case FSCRYPT_POLICY_V2: return policy->v2.filenames_encryption_mode; } BUG(); } /* Return the flags (FSCRYPT_POLICY_FLAG*) of a valid encryption policy */ static inline u8 fscrypt_policy_flags(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.flags; case FSCRYPT_POLICY_V2: return policy->v2.flags; } BUG(); } static inline int fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy, const struct inode *inode) { return policy->log2_data_unit_size ?: inode->i_blkbits; } static inline int fscrypt_policy_du_bits(const union fscrypt_policy *policy, const struct inode *inode) { switch (policy->version) { case FSCRYPT_POLICY_V1: return inode->i_blkbits; case FSCRYPT_POLICY_V2: return fscrypt_policy_v2_du_bits(&policy->v2, inode); } BUG(); } /* * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. */ struct fscrypt_symlink_data { __le16 len; char encrypted_path[]; } __packed; /** * struct fscrypt_prepared_key - a key prepared for actual encryption/decryption * @tfm: crypto API transform object * @blk_key: key for blk-crypto * * Normally only one of the fields will be non-NULL. */ struct fscrypt_prepared_key { struct crypto_skcipher *tfm; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT struct blk_crypto_key *blk_key; #endif }; /* * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is * allocated and stored in ->i_crypt_info. Once created, it remains until the * inode is evicted. */ struct fscrypt_inode_info { /* The key in a form prepared for actual encryption/decryption */ struct fscrypt_prepared_key ci_enc_key; /* True if ci_enc_key should be freed when this struct is freed */ u8 ci_owns_key : 1; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT /* * True if this inode will use inline encryption (blk-crypto) instead of * the traditional filesystem-layer encryption. */ u8 ci_inlinecrypt : 1; #endif /* True if ci_dirhash_key is initialized */ u8 ci_dirhash_key_initialized : 1; /* * log2 of the data unit size (granularity of contents encryption) of * this file. This is computable from ci_policy and ci_inode but is * cached here for efficiency. Only used for regular files. */ u8 ci_data_unit_bits; /* Cached value: log2 of number of data units per FS block */ u8 ci_data_units_per_block_bits; /* Hashed inode number. Only set for IV_INO_LBLK_32 */ u32 ci_hashed_ino; /* * Encryption mode used for this inode. It corresponds to either the * contents or filenames encryption mode, depending on the inode type. */ struct fscrypt_mode *ci_mode; /* Back-pointer to the inode */ struct inode *ci_inode; /* * The master key with which this inode was unlocked (decrypted). This * will be NULL if the master key was found in a process-subscribed * keyring rather than in the filesystem-level keyring. */ struct fscrypt_master_key *ci_master_key; /* * Link in list of inodes that were unlocked with the master key. * Only used when ->ci_master_key is set. */ struct list_head ci_master_key_link; /* * If non-NULL, then encryption is done using the master key directly * and ci_enc_key will equal ci_direct_key->dk_key. */ struct fscrypt_direct_key *ci_direct_key; /* * This inode's hash key for filenames. This is a 128-bit SipHash-2-4 * key. This is only set for directories that use a keyed dirhash over * the plaintext filenames -- currently just casefolded directories. */ siphash_key_t ci_dirhash_key; /* The encryption policy used by this inode */ union fscrypt_policy ci_policy; /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE]; }; typedef enum { FS_DECRYPT = 0, FS_ENCRYPT, } fscrypt_direction_t; /* crypto.c */ extern struct kmem_cache *fscrypt_inode_info_cachep; int fscrypt_initialize(struct super_block *sb); int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, fscrypt_direction_t rw, u64 index, struct page *src_page, struct page *dest_page, unsigned int len, unsigned int offs, gfp_t gfp_flags); struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); void __printf(3, 4) __cold fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); #define fscrypt_warn(inode, fmt, ...) \ fscrypt_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__) #define fscrypt_err(inode, fmt, ...) \ fscrypt_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) #define FSCRYPT_MAX_IV_SIZE 32 union fscrypt_iv { struct { /* zero-based index of data unit within the file */ __le64 index; /* per-file nonce; only set in DIRECT_KEY mode */ u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; u8 raw[FSCRYPT_MAX_IV_SIZE]; __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)]; }; void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index, const struct fscrypt_inode_info *ci); /* * Return the number of bits used by the maximum file data unit index that is * possible on the given filesystem, using the given log2 data unit size. */ static inline int fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits) { return fls64(sb->s_maxbytes - 1) - du_bits; } /* fname.c */ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); /* hkdf.c */ struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. These values are used as * the first byte of the HKDF application-specific info string to guarantee that * info strings are never repeated between contexts. This ensures that all HKDF * outputs are unique and cryptographically isolated, i.e. knowledge of one * output doesn't reveal another. */ #define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY 1 /* info=<empty> */ #define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 /* info=file_nonce */ #define HKDF_CONTEXT_DIRECT_KEY 3 /* info=mode_num */ #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce */ #define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_INODE_HASH_KEY 7 /* info=<empty> */ #define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY \ 8 /* info=<empty> */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen); void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci, bool is_hw_wrapped_key); static inline bool fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return ci->ci_inlinecrypt; } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *key_bytes, size_t key_size, bool is_hw_wrapped, const struct fscrypt_inode_info *ci); void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); int fscrypt_derive_sw_secret(struct super_block *sb, const u8 *wrapped_key, size_t wrapped_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); /* * Check whether the crypto transform or blk-crypto key has been allocated in * @prep_key, depending on which encryption implementation the file will use. */ static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_inode_info *ci) { /* * The two smp_load_acquire()'s here pair with the smp_store_release()'s * in fscrypt_prepare_inline_crypt_key() and fscrypt_prepare_key(). * I.e., in some cases (namely, if this prep_key is a per-mode * encryption key) another task can publish blk_key or tfm concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ if (fscrypt_using_inline_encryption(ci)) return smp_load_acquire(&prep_key->blk_key) != NULL; return smp_load_acquire(&prep_key->tfm) != NULL; } #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci, bool is_hw_wrapped_key) { return 0; } static inline bool fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return false; } static inline int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *key_bytes, size_t key_size, bool is_hw_wrapped, const struct fscrypt_inode_info *ci) { WARN_ON_ONCE(1); return -EOPNOTSUPP; } static inline void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) { } static inline int fscrypt_derive_sw_secret(struct super_block *sb, const u8 *wrapped_key, size_t wrapped_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) { fscrypt_warn(NULL, "kernel doesn't support hardware-wrapped keys"); return -EOPNOTSUPP; } static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_inode_info *ci) { return smp_load_acquire(&prep_key->tfm) != NULL; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ /* keyring.c */ /* * fscrypt_master_key_secret - secret key material of an in-use master key */ struct fscrypt_master_key_secret { /* * The KDF with which subkeys of this key can be derived. * * For v1 policy keys, this isn't applicable and won't be set. * Otherwise, this KDF will be keyed by this master key if * ->is_hw_wrapped=false, or by the "software secret" that hardware * derived from this master key if ->is_hw_wrapped=true. */ struct fscrypt_hkdf hkdf; /* * True if this key is a hardware-wrapped key; false if this key is a * raw key (i.e. a "software key"). For v1 policy keys this will always * be false, as v1 policy support is a legacy feature which doesn't * support newer functionality such as hardware-wrapped keys. */ bool is_hw_wrapped; /* * Size of the key in bytes. This remains set even if ->bytes was * zeroized due to no longer being needed. I.e. we still remember the * size of the key even if we don't need to remember the key itself. */ u32 size; /* * The bytes of the key, when still needed. This can be either a raw * key or a hardware-wrapped key, as indicated by ->is_hw_wrapped. In * the case of a raw, v2 policy key, there is no need to remember the * actual key separately from ->hkdf so this field will be zeroized as * soon as ->hkdf is initialized. */ u8 bytes[FSCRYPT_MAX_ANY_KEY_SIZE]; } __randomize_layout; /* * fscrypt_master_key - an in-use master key * * This represents a master encryption key which has been added to the * filesystem. There are three high-level states that a key can be in: * * FSCRYPT_KEY_STATUS_PRESENT * Key is fully usable; it can be used to unlock inodes that are encrypted * with it (this includes being able to create new inodes). ->mk_present * indicates whether the key is in this state. ->mk_secret exists, the key * is in the keyring, and ->mk_active_refs > 0 due to ->mk_present. * * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED * Removal of this key has been initiated, but some inodes that were * unlocked with it are still in-use. Like ABSENT, ->mk_secret is wiped, * and the key can no longer be used to unlock inodes. Unlike ABSENT, the * key is still in the keyring; ->mk_decrypted_inodes is nonempty; and * ->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes. * * This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty, * or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key. * * FSCRYPT_KEY_STATUS_ABSENT * Key is fully removed. The key is no longer in the keyring, * ->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is * wiped, and the key can no longer be used to unlock inodes. */ struct fscrypt_master_key { /* * Link in ->s_master_keys->key_hashtable. * Only valid if ->mk_active_refs > 0. */ struct hlist_node mk_node; /* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */ struct rw_semaphore mk_sem; /* * Active and structural reference counts. An active ref guarantees * that the struct continues to exist, continues to be in the keyring * ->s_master_keys, and that any embedded subkeys (e.g. * ->mk_direct_keys) that have been prepared continue to exist. * A structural ref only guarantees that the struct continues to exist. * * There is one active ref associated with ->mk_present being true, and * one active ref for each inode in ->mk_decrypted_inodes. * * There is one structural ref associated with the active refcount being * nonzero. Finding a key in the keyring also takes a structural ref, * which is then held temporarily while the key is operated on. */ refcount_t mk_active_refs; refcount_t mk_struct_refs; struct rcu_head mk_rcu_head; /* * The secret key material. Wiped as soon as it is no longer needed; * for details, see the fscrypt_master_key struct comment. * * Locking: protected by ->mk_sem. */ struct fscrypt_master_key_secret mk_secret; /* * For v1 policy keys: an arbitrary key descriptor which was assigned by * userspace (->descriptor). * * For v2 policy keys: a cryptographic hash of this key (->identifier). */ struct fscrypt_key_specifier mk_spec; /* * Keyring which contains a key of type 'key_type_fscrypt_user' for each * user who has added this key. Normally each key will be added by just * one user, but it's possible that multiple users share a key, and in * that case we need to keep track of those users so that one user can't * remove the key before the others want it removed too. * * This is NULL for v1 policy keys; those can only be added by root. * * Locking: protected by ->mk_sem. (We don't just rely on the keyrings * subsystem semaphore ->mk_users->sem, as we need support for atomic * search+insert along with proper synchronization with other fields.) */ struct key *mk_users; /* * List of inodes that were unlocked using this key. This allows the * inodes to be evicted efficiently if the key is removed. */ struct list_head mk_decrypted_inodes; spinlock_t mk_decrypted_inodes_lock; /* * Per-mode encryption keys for the various types of encryption policies * that use them. Allocated and derived on-demand. */ struct fscrypt_prepared_key mk_direct_keys[FSCRYPT_MODE_MAX + 1]; struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[FSCRYPT_MODE_MAX + 1]; struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[FSCRYPT_MODE_MAX + 1]; /* Hash key for inode numbers. Initialized only when needed. */ siphash_key_t mk_ino_hash_key; bool mk_ino_hash_key_initialized; /* * Whether this key is in the "present" state, i.e. fully usable. For * details, see the fscrypt_master_key struct comment. * * Locking: protected by ->mk_sem, but can be read locklessly using * READ_ONCE(). Writers must use WRITE_ONCE() when concurrent readers * are possible. */ bool mk_present; } __randomize_layout; static inline const char *master_key_spec_type( const struct fscrypt_key_specifier *spec) { switch (spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: return "descriptor"; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: return "identifier"; } return "[unknown]"; } static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) { switch (spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: return FSCRYPT_KEY_DESCRIPTOR_SIZE; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: return FSCRYPT_KEY_IDENTIFIER_SIZE; } return 0; } void fscrypt_put_master_key(struct fscrypt_master_key *mk); void fscrypt_put_master_key_activeref(struct super_block *sb, struct fscrypt_master_key *mk); struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); int fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int fscrypt_add_test_dummy_key(struct super_block *sb, struct fscrypt_key_specifier *key_spec); int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int __init fscrypt_init_keyring(void); /* keysetup.c */ struct fscrypt_mode { const char *friendly_name; const char *cipher_str; int keysize; /* key size in bytes */ int security_strength; /* security strength in bytes */ int ivsize; /* IV size in bytes */ int logged_cryptoapi_impl; int logged_blk_crypto_native; int logged_blk_crypto_fallback; enum blk_crypto_mode_num blk_crypto_mode; }; extern struct fscrypt_mode fscrypt_modes[]; int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci); void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key); int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported); /** * fscrypt_require_key() - require an inode's encryption key * @inode: the inode we need the key for * * If the inode is encrypted, set up its encryption key if not already done. * Then require that the key be present and return -ENOKEY otherwise. * * No locks are needed, and the key will live as long as the struct inode --- so * it won't go away from under you. * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. */ static inline int fscrypt_require_key(struct inode *inode) { if (IS_ENCRYPTED(inode)) { int err = fscrypt_get_encryption_info(inode, false); if (err) return err; if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } return 0; } /* keysetup_v1.c */ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci, const u8 *raw_master_key); int fscrypt_setup_v1_file_key_via_subscribed_keyrings( struct fscrypt_inode_info *ci); /* policy.c */ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2); int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, struct fscrypt_key_specifier *key_spec); const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb); bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode); int fscrypt_policy_from_context(union fscrypt_policy *policy_u, const union fscrypt_context *ctx_u, int ctx_size); const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir); #endif /* _FSCRYPT_PRIVATE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMEKEEPING_H #define _LINUX_TIMEKEEPING_H #include <linux/errno.h> #include <linux/clocksource_ids.h> #include <linux/ktime.h> /* Included from linux/ktime.h */ void timekeeping_init(void); extern int timekeeping_suspended; /* Architecture timer tick functions: */ extern void legacy_timer_tick(unsigned long ticks); /* * Get and set timeofday */ extern int do_settimeofday64(const struct timespec64 *ts); extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); /* * ktime_get() family - read the current time in a multitude of ways. * * The default time reference is CLOCK_MONOTONIC, starting at * boot time but not counting the time spent in suspend. * For other references, use the functions with "real", "clocktai", * "boottime" and "raw" suffixes. * * To get the time in a different format, use the ones with * "ns", "ts64" and "seconds" suffix. * * See Documentation/core-api/timekeeping.rst for more details. */ /* * timespec64 based interfaces */ extern void ktime_get_raw_ts64(struct timespec64 *ts); extern void ktime_get_ts64(struct timespec64 *ts); extern void ktime_get_real_ts64(struct timespec64 *tv); extern void ktime_get_coarse_ts64(struct timespec64 *ts); extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); /* Multigrain timestamp interfaces */ extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); extern void ktime_get_real_ts64_mg(struct timespec64 *ts); extern unsigned long timekeeping_get_mg_floor_swaps(void); void getboottime64(struct timespec64 *ts); /* * time64_t base interfaces */ extern time64_t ktime_get_seconds(void); extern time64_t __ktime_get_real_seconds(void); extern time64_t ktime_get_real_seconds(void); /* * ktime_t based interfaces */ enum tk_offsets { TK_OFFS_REAL, TK_OFFS_BOOT, TK_OFFS_TAI, TK_OFFS_MAX, }; extern ktime_t ktime_get(void); extern ktime_t ktime_get_with_offset(enum tk_offsets offs); extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs); extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs); extern ktime_t ktime_get_raw(void); extern u32 ktime_get_resolution_ns(void); /** * ktime_get_real - get the real (wall-) time in ktime_t format * * Returns: real (wall) time in ktime_t format */ static inline ktime_t ktime_get_real(void) { return ktime_get_with_offset(TK_OFFS_REAL); } static inline ktime_t ktime_get_coarse_real(void) { return ktime_get_coarse_with_offset(TK_OFFS_REAL); } /** * ktime_get_boottime - Get monotonic time since boot in ktime_t format * * This is similar to CLOCK_MONTONIC/ktime_get, but also includes the * time spent in suspend. * * Returns: monotonic time since boot in ktime_t format */ static inline ktime_t ktime_get_boottime(void) { return ktime_get_with_offset(TK_OFFS_BOOT); } static inline ktime_t ktime_get_coarse_boottime(void) { return ktime_get_coarse_with_offset(TK_OFFS_BOOT); } /** * ktime_get_clocktai - Get the TAI time of day in ktime_t format * * Returns: the TAI time of day in ktime_t format */ static inline ktime_t ktime_get_clocktai(void) { return ktime_get_with_offset(TK_OFFS_TAI); } static inline ktime_t ktime_get_coarse_clocktai(void) { return ktime_get_coarse_with_offset(TK_OFFS_TAI); } static inline ktime_t ktime_get_coarse(void) { struct timespec64 ts; ktime_get_coarse_ts64(&ts); return timespec64_to_ktime(ts); } static inline u64 ktime_get_coarse_ns(void) { return ktime_to_ns(ktime_get_coarse()); } static inline u64 ktime_get_coarse_real_ns(void) { return ktime_to_ns(ktime_get_coarse_real()); } static inline u64 ktime_get_coarse_boottime_ns(void) { return ktime_to_ns(ktime_get_coarse_boottime()); } static inline u64 ktime_get_coarse_clocktai_ns(void) { return ktime_to_ns(ktime_get_coarse_clocktai()); } /** * ktime_mono_to_real - Convert monotonic time to clock realtime * @mono: monotonic time to convert * * Returns: time converted to realtime clock */ static inline ktime_t ktime_mono_to_real(ktime_t mono) { return ktime_mono_to_any(mono, TK_OFFS_REAL); } /** * ktime_get_ns - Get the current time in nanoseconds * * Returns: current time converted to nanoseconds */ static inline u64 ktime_get_ns(void) { return ktime_to_ns(ktime_get()); } /** * ktime_get_real_ns - Get the current real/wall time in nanoseconds * * Returns: current real time converted to nanoseconds */ static inline u64 ktime_get_real_ns(void) { return ktime_to_ns(ktime_get_real()); } /** * ktime_get_boottime_ns - Get the monotonic time since boot in nanoseconds * * Returns: current boottime converted to nanoseconds */ static inline u64 ktime_get_boottime_ns(void) { return ktime_to_ns(ktime_get_boottime()); } /** * ktime_get_clocktai_ns - Get the current TAI time of day in nanoseconds * * Returns: current TAI time converted to nanoseconds */ static inline u64 ktime_get_clocktai_ns(void) { return ktime_to_ns(ktime_get_clocktai()); } /** * ktime_get_raw_ns - Get the raw monotonic time in nanoseconds * * Returns: current raw monotonic time converted to nanoseconds */ static inline u64 ktime_get_raw_ns(void) { return ktime_to_ns(ktime_get_raw()); } extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_boot_fast_ns(void); extern u64 ktime_get_tai_fast_ns(void); extern u64 ktime_get_real_fast_ns(void); /* * timespec64/time64_t interfaces utilizing the ktime based ones * for API completeness, these could be implemented more efficiently * if needed. */ static inline void ktime_get_boottime_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_boottime()); } static inline void ktime_get_coarse_boottime_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_coarse_boottime()); } static inline time64_t ktime_get_boottime_seconds(void) { return ktime_divns(ktime_get_coarse_boottime(), NSEC_PER_SEC); } static inline void ktime_get_clocktai_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_clocktai()); } static inline void ktime_get_coarse_clocktai_ts64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_coarse_clocktai()); } static inline time64_t ktime_get_clocktai_seconds(void) { return ktime_divns(ktime_get_coarse_clocktai(), NSEC_PER_SEC); } /* * RTC specific */ extern bool timekeeping_rtc_skipsuspend(void); extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); /** * struct system_time_snapshot - simultaneous raw/real time capture with * counter value * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @boot: Boot time * @raw: Monotonic raw system time * @cs_id: Clocksource ID * @clock_was_set_seq: The sequence number of clock-was-set events * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { u64 cycles; ktime_t real; ktime_t boot; ktime_t raw; enum clocksource_ids cs_id; unsigned int clock_was_set_seq; u8 cs_was_changed_seq; }; /** * struct system_device_crosststamp - system/device cross-timestamp * (synchronized capture) * @device: Device time * @sys_realtime: Realtime simultaneous with device time * @sys_monoraw: Monotonic raw simultaneous with device time */ struct system_device_crosststamp { ktime_t device; ktime_t sys_realtime; ktime_t sys_monoraw; }; /** * struct system_counterval_t - system counter value with the ID of the * corresponding clocksource * @cycles: System counter value * @cs_id: Clocksource ID corresponding to system counter value. Used by * timekeeping code to verify comparability of two cycle values. * The default ID, CSID_GENERIC, does not identify a specific * clocksource. * @use_nsecs: @cycles is in nanoseconds. */ struct system_counterval_t { u64 cycles; enum clocksource_ids cs_id; bool use_nsecs; }; extern bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles); extern bool timekeeping_clocksource_has_base(enum clocksource_ids id); /* * Get cross timestamp between system clock and device clock */ extern int get_device_system_crosststamp( int (*get_time_fn)(ktime_t *device_time, struct system_counterval_t *system_counterval, void *ctx), void *ctx, struct system_time_snapshot *history, struct system_device_crosststamp *xtstamp); /* * Simultaneously snapshot realtime and monotonic raw clocks */ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); /* * Persistent clock related interfaces */ extern int persistent_clock_is_local; extern void read_persistent_clock64(struct timespec64 *ts); void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock, struct timespec64 *boot_offset); #ifdef CONFIG_GENERIC_CMOS_UPDATE extern int update_persistent_clock64(struct timespec64 now); #endif #endif
6 6 6 6 5 5 6 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/module.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/hashtable.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); static struct workqueue_struct *ib_unreg_wq; /* * Each of the three rwsem locks (devices, clients, client_data) protects the * xarray of the same name. Specifically it allows the caller to assert that * the MARK will/will not be changing under the lock, and for devices and * clients, that the value in the xarray is still a valid pointer. Change of * the MARK is linked to the object state, so holding the lock and testing the * MARK also asserts that the contained object is in a certain state. * * This is used to build a two stage register/unregister flow where objects * can continue to be in the xarray even though they are still in progress to * register/unregister. * * The xarray itself provides additional locking, and restartable iteration, * which is also relied on. * * Locks should not be nested, with the exception of client_data, which is * allowed to nest under the read side of the other two locks. * * The devices_rwsem also protects the device name list, any change or * assignment of device name must also hold the write side to guarantee unique * names. */ /* * devices contains devices that have had their names assigned. The * devices may not be registered. Users that care about the registration * status need to call ib_device_try_get() on the device to ensure it is * registered, and keep it registered, for the required duration. * */ static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); static DECLARE_RWSEM(devices_rwsem); #define DEVICE_REGISTERED XA_MARK_1 static u32 highest_client_id; #define CLIENT_REGISTERED XA_MARK_1 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); static DECLARE_RWSEM(clients_rwsem); static void ib_client_put(struct ib_client *client) { if (refcount_dec_and_test(&client->uses)) complete(&client->uses_zero); } /* * If client_data is registered then the corresponding client must also still * be registered. */ #define CLIENT_DATA_REGISTERED XA_MARK_1 unsigned int rdma_dev_net_id; /* * A list of net namespaces is maintained in an xarray. This is necessary * because we can't get the locking right using the existing net ns list. We * would require a init_net callback after the list is updated. */ static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); /* * rwsem to protect accessing the rdma_nets xarray entries. */ static DECLARE_RWSEM(rdma_nets_rwsem); bool ib_devices_shared_netns = true; module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); MODULE_PARM_DESC(netns_mode, "Share device among net namespaces; default=1 (shared)"); /** * rdma_dev_access_netns() - Return whether an rdma device can be accessed * from a specified net namespace or not. * @dev: Pointer to rdma device which needs to be checked * @net: Pointer to net namesapce for which access to be checked * * When the rdma device is in shared mode, it ignores the net namespace. * When the rdma device is exclusive to a net namespace, rdma device net * namespace is checked against the specified one. */ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) { return (ib_devices_shared_netns || net_eq(read_pnet(&dev->coredev.rdma_net), net)); } EXPORT_SYMBOL(rdma_dev_access_netns); /* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. So we need our own iterator to see all values stored in * the array. This does the same thing as xa_for_each except that it also * returns NULL valued entries if the array is allocating. Simplified to only * work on simple xarrays. */ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, xa_mark_t filter) { XA_STATE(xas, xa, *indexp); void *entry; rcu_read_lock(); do { entry = xas_find_marked(&xas, ULONG_MAX, filter); if (xa_is_zero(entry)) break; } while (xas_retry(&xas, entry)); rcu_read_unlock(); if (entry) { *indexp = xas.xa_index; if (xa_is_zero(entry)) return NULL; return entry; } return XA_ERROR(-ENOENT); } #define xan_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ !xa_is_err(entry); \ (index)++, entry = xan_find_marked(xa, &(index), filter)) /* RCU hash table mapping netdevice pointers to struct ib_port_data */ static DEFINE_SPINLOCK(ndev_hash_lock); static DECLARE_HASHTABLE(ndev_hash, 5); static void free_netdevs(struct ib_device *ib_dev); static void ib_unregister_work(struct work_struct *work); static void __ib_unregister_device(struct ib_device *device); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); static void __ibdev_printk(const char *level, const struct ib_device *ibdev, struct va_format *vaf) { if (ibdev && ibdev->dev.parent) dev_printk_emit(level[1] - '0', ibdev->dev.parent, "%s %s %s: %pV", dev_driver_string(ibdev->dev.parent), dev_name(ibdev->dev.parent), dev_name(&ibdev->dev), vaf); else if (ibdev) printk("%s%s: %pV", level, dev_name(&ibdev->dev), vaf); else printk("%s(NULL ib_device): %pV", level, vaf); } #define define_ibdev_printk_level(func, level) \ void func(const struct ib_device *ibdev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __ibdev_printk(level, ibdev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); define_ibdev_printk_level(ibdev_alert, KERN_ALERT); define_ibdev_printk_level(ibdev_crit, KERN_CRIT); define_ibdev_printk_level(ibdev_err, KERN_ERR); define_ibdev_printk_level(ibdev_warn, KERN_WARNING); define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); define_ibdev_printk_level(ibdev_info, KERN_INFO); static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net); /* Pointer to the RCU head at the start of the ib_port_data array */ struct ib_port_data_rcu { struct rcu_head rcu_head; struct ib_port_data pdata[]; }; static void ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), IB_MANDATORY_FUNC(reg_user_mr), IB_MANDATORY_FUNC(dereg_mr), IB_MANDATORY_FUNC(get_port_immutable) }; int i; device->kverbs_provider = true; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) &device->ops + mandatory_table[i].offset)) { device->kverbs_provider = false; break; } } } /* * Caller must perform ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. */ struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) { struct ib_device *device; down_read(&devices_rwsem); device = xa_load(&devices, index); if (device) { if (!rdma_dev_access_netns(device, net)) { device = NULL; goto out; } if (!ib_device_try_get(device)) device = NULL; } out: up_read(&devices_rwsem); return device; } /** * ib_device_put - Release IB device reference * @device: device whose reference to be released * * ib_device_put() releases reference to the IB device to allow it to be * unregistered and eventually free. */ void ib_device_put(struct ib_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->unreg_completion); } EXPORT_SYMBOL(ib_device_put); static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; unsigned long index; xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; } /** * ib_device_get_by_name - Find an IB device by name * @name: The name to look for * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device by its name. The caller must call * ib_device_put() on the returned pointer. */ struct ib_device *ib_device_get_by_name(const char *name, enum rdma_driver_id driver_id) { struct ib_device *device; down_read(&devices_rwsem); device = __ib_device_get_by_name(name); if (device && driver_id != RDMA_DRIVER_UNKNOWN && device->ops.driver_id != driver_id) device = NULL; if (device) { if (!ib_device_try_get(device)) device = NULL; } up_read(&devices_rwsem); return device; } EXPORT_SYMBOL(ib_device_get_by_name); static int rename_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; int ret = 0; mutex_lock(&device->compat_devs_mutex); xa_for_each (&device->compat_devs, index, cdev) { ret = device_rename(&cdev->dev, dev_name(&device->dev)); if (ret) { dev_warn(&cdev->dev, "Fail to rename compatdev to new name %s\n", dev_name(&device->dev)); break; } } mutex_unlock(&device->compat_devs_mutex); return ret; } int ib_device_rename(struct ib_device *ibdev, const char *name) { unsigned long index; void *client_data; int ret; down_write(&devices_rwsem); if (!strcmp(name, dev_name(&ibdev->dev))) { up_write(&devices_rwsem); return 0; } if (__ib_device_get_by_name(name)) { up_write(&devices_rwsem); return -EEXIST; } ret = device_rename(&ibdev->dev, name); if (ret) { up_write(&devices_rwsem); return ret; } strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); ret = rename_compat_devs(ibdev); downgrade_write(&devices_rwsem); down_read(&ibdev->client_data_rwsem); xan_for_each_marked(&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->rename) continue; client->rename(ibdev, client_data); } up_read(&ibdev->client_data_rwsem); rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); up_read(&devices_rwsem); return 0; } int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) { if (use_dim > 1) return -EINVAL; ibdev->use_cq_dim = use_dim; return 0; } static int alloc_name(struct ib_device *ibdev, const char *name) { struct ib_device *device; unsigned long index; struct ida inuse; int rc; int i; lockdep_assert_held_write(&devices_rwsem); ida_init(&inuse); xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; if (i < 0 || i >= INT_MAX) continue; snprintf(buf, sizeof buf, name, i); if (strcmp(buf, dev_name(&device->dev)) != 0) continue; rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); if (rc < 0) goto out; } rc = ida_alloc(&inuse, GFP_KERNEL); if (rc < 0) goto out; rc = dev_set_name(&ibdev->dev, name, rc); out: ida_destroy(&inuse); return rc; } static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); free_netdevs(dev); WARN_ON(refcount_read(&dev->refcount)); if (dev->hw_stats_data) ib_device_release_hw_stats(dev->hw_stats_data); if (dev->port_data) { ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); rdma_counter_release(dev); kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, pdata[0]), rcu_head); } mutex_destroy(&dev->subdev_lock); mutex_destroy(&dev->unregistration_lock); mutex_destroy(&dev->compat_devs_mutex); xa_destroy(&dev->compat_devs); xa_destroy(&dev->client_data); kfree_rcu(dev, rcu_head); } static int ib_device_uevent(const struct device *device, struct kobj_uevent_env *env) { if (add_uevent_var(env, "NAME=%s", dev_name(device))) return -ENOMEM; /* * It would be nice to pass the node GUID with the event... */ return 0; } static const void *net_namespace(const struct device *d) { const struct ib_core_device *coredev = container_of(d, struct ib_core_device, dev); return read_pnet(&coredev->rdma_net); } static struct class ib_class = { .name = "infiniband", .dev_release = ib_device_release, .dev_uevent = ib_device_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, }; static void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev, struct net *net) { bool is_full_dev = &dev->coredev == coredev; /* This BUILD_BUG_ON is intended to catch layout change * of union of ib_core_device and device. * dev must be the first element as ib_core and providers * driver uses it. Adding anything in ib_core_device before * device will break this assumption. */ BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != offsetof(struct ib_device, dev)); coredev->dev.class = &ib_class; coredev->dev.groups = dev->groups; /* * Don't expose hw counters outside of the init namespace. */ if (!is_full_dev && dev->hw_stats_attr_index) coredev->dev.groups[dev->hw_stats_attr_index] = NULL; device_initialize(&coredev->dev); coredev->owner = dev; INIT_LIST_HEAD(&coredev->port_list); write_pnet(&coredev->rdma_net, net); } /** * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ struct ib_device *_ib_alloc_device(size_t size) { struct ib_device *device; unsigned int i; if (WARN_ON(size < sizeof(struct ib_device))) return NULL; device = kzalloc(size, GFP_KERNEL); if (!device) return NULL; if (rdma_restrack_init(device)) { kfree(device); return NULL; } rdma_init_coredev(&device->coredev, device, &init_net); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->qp_open_list_lock); init_rwsem(&device->event_handler_rwsem); mutex_init(&device->unregistration_lock); /* * client_data needs to be alloc because we don't want our mark to be * destroyed if the user stores NULL in the client data. */ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); init_rwsem(&device->client_data_rwsem); xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); mutex_init(&device->compat_devs_mutex); init_completion(&device->unreg_completion); INIT_WORK(&device->unregistration_work, ib_unregister_work); spin_lock_init(&device->cq_pools_lock); for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) INIT_LIST_HEAD(&device->cq_pools[i]); rwlock_init(&device->cache_lock); device->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); mutex_init(&device->subdev_lock); INIT_LIST_HEAD(&device->subdev_list_head); INIT_LIST_HEAD(&device->subdev_list); return device; } EXPORT_SYMBOL(_ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). */ void ib_dealloc_device(struct ib_device *device) { if (device->ops.dealloc_driver) device->ops.dealloc_driver(device); /* * ib_unregister_driver() requires all devices to remain in the xarray * while their ops are callable. The last op we call is dealloc_driver * above. This is needed to create a fence on op callbacks prior to * allowing the driver module to unload. */ down_write(&devices_rwsem); if (xa_load(&devices, device->index) == device) xa_erase(&devices, device->index); up_write(&devices_rwsem); /* Expedite releasing netdev references */ free_netdevs(device); WARN_ON(!xa_empty(&device->compat_devs)); WARN_ON(!xa_empty(&device->client_data)); WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); /* Balances with device_initialize */ put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); /* * add_client_context() and remove_client_context() must be safe against * parallel calls on the same device - registration/unregistration of both the * device and client can be occurring in parallel. * * The routines need to be a fence, any caller must not return until the add * or remove is fully completed. */ static int add_client_context(struct ib_device *device, struct ib_client *client) { int ret = 0; if (!device->kverbs_provider && !client->no_kverbs_req) return 0; down_write(&device->client_data_rwsem); /* * So long as the client is registered hold both the client and device * unregistration locks. */ if (!refcount_inc_not_zero(&client->uses)) goto out_unlock; refcount_inc(&device->refcount); /* * Another caller to add_client_context got here first and has already * completely initialized context. */ if (xa_get_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED)) goto out; ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, GFP_KERNEL)); if (ret) goto out; downgrade_write(&device->client_data_rwsem); if (client->add) { if (client->add(device)) { /* * If a client fails to add then the error code is * ignored, but we won't call any more ops on this * client. */ xa_erase(&device->client_data, client->client_id); up_read(&device->client_data_rwsem); ib_device_put(device); ib_client_put(client); return 0; } } /* Readers shall not see a client until add has been completed */ xa_set_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED); up_read(&device->client_data_rwsem); return 0; out: ib_device_put(device); ib_client_put(client); out_unlock: up_write(&device->client_data_rwsem); return ret; } static void remove_client_context(struct ib_device *device, unsigned int client_id) { struct ib_client *client; void *client_data; down_write(&device->client_data_rwsem); if (!xa_get_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED)) { up_write(&device->client_data_rwsem); return; } client_data = xa_load(&device->client_data, client_id); xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); client = xa_load(&clients, client_id); up_write(&device->client_data_rwsem); /* * Notice we cannot be holding any exclusive locks when calling the * remove callback as the remove callback can recurse back into any * public functions in this module and thus try for any locks those * functions take. * * For this reason clients and drivers should not call the * unregistration functions will holdling any locks. */ if (client->remove) client->remove(device, client_data); xa_erase(&device->client_data, client_id); ib_device_put(device); ib_client_put(client); } static int alloc_port_data(struct ib_device *device) { struct ib_port_data_rcu *pdata_rcu; u32 port; if (device->port_data) return 0; /* This can only be called once the physical port range is defined */ if (WARN_ON(!device->phys_port_cnt)) return -EINVAL; /* Reserve U32_MAX so the logic to go over all the ports is sane */ if (WARN_ON(device->phys_port_cnt == U32_MAX)) return -EINVAL; /* * device->port_data is indexed directly by the port number to make * access to this data as efficient as possible. * * Therefore port_data is declared as a 1 based array with potential * empty slots at the beginning. */ pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, size_add(rdma_end_port(device), 1)), GFP_KERNEL); if (!pdata_rcu) return -ENOMEM; /* * The rcu_head is put in front of the port data array and the stored * pointer is adjusted since we never need to see that member until * kfree_rcu. */ device->port_data = pdata_rcu->pdata; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; pdata->ib_dev = device; spin_lock_init(&pdata->pkey_list_lock); INIT_LIST_HEAD(&pdata->pkey_list); spin_lock_init(&pdata->netdev_lock); INIT_HLIST_NODE(&pdata->ndev_hash_link); } return 0; } static int verify_immutable(const struct ib_device *dev, u32 port) { return WARN_ON(!rdma_cap_ib_mad(dev, port) && rdma_max_mad_size(dev, port) != 0); } static int setup_port_data(struct ib_device *device) { u32 port; int ret; ret = alloc_port_data(device); if (ret) return ret; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; ret = device->ops.get_port_immutable(device, port, &pdata->immutable); if (ret) return ret; if (verify_immutable(device, port)) return -EINVAL; } return 0; } /** * ib_port_immutable_read() - Read rdma port's immutable data * @dev: IB device * @port: port number whose immutable data to read. It starts with index 1 and * valid upto including rdma_end_port(). */ const struct ib_port_immutable* ib_port_immutable_read(struct ib_device *dev, unsigned int port) { WARN_ON(!rdma_is_port_valid(dev, port)); return &dev->port_data[port].immutable; } EXPORT_SYMBOL(ib_port_immutable_read); void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->ops.get_dev_fw_str) dev->ops.get_dev_fw_str(dev, str); else str[0] = '\0'; } EXPORT_SYMBOL(ib_get_device_fw_str); static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned int i; rdma_for_each_port (dev, i) { u64 sp; ib_get_cached_subnet_prefix(dev, i, &sp); ib_security_cache_change(dev, i, sp); } } up_read(&devices_rwsem); } static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data) { if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; schedule_work(&ib_policy_change_work); ib_mad_agent_security_change(); return NOTIFY_OK; } static void compatdev_release(struct device *dev) { struct ib_core_device *cdev = container_of(dev, struct ib_core_device, dev); kfree(cdev); } static int add_one_compat_dev(struct ib_device *device, struct rdma_dev_net *rnet) { struct ib_core_device *cdev; int ret; lockdep_assert_held(&rdma_nets_rwsem); if (!ib_devices_shared_netns) return 0; /* * Create and add compat device in all namespaces other than where it * is currently bound to. */ if (net_eq(read_pnet(&rnet->net), read_pnet(&device->coredev.rdma_net))) return 0; /* * The first of init_net() or ib_register_device() to take the * compat_devs_mutex wins and gets to add the device. Others will wait * for completion here. */ mutex_lock(&device->compat_devs_mutex); cdev = xa_load(&device->compat_devs, rnet->id); if (cdev) { ret = 0; goto done; } ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); if (ret) goto done; cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) { ret = -ENOMEM; goto cdev_err; } cdev->dev.parent = device->dev.parent; rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); cdev->dev.release = compatdev_release; ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); if (ret) goto add_err; ret = device_add(&cdev->dev); if (ret) goto add_err; ret = ib_setup_port_attrs(cdev); if (ret) goto port_err; ret = xa_err(xa_store(&device->compat_devs, rnet->id, cdev, GFP_KERNEL)); if (ret) goto insert_err; mutex_unlock(&device->compat_devs_mutex); return 0; insert_err: ib_free_port_attrs(cdev); port_err: device_del(&cdev->dev); add_err: put_device(&cdev->dev); cdev_err: xa_release(&device->compat_devs, rnet->id); done: mutex_unlock(&device->compat_devs_mutex); return ret; } static void remove_one_compat_dev(struct ib_device *device, u32 id) { struct ib_core_device *cdev; mutex_lock(&device->compat_devs_mutex); cdev = xa_erase(&device->compat_devs, id); mutex_unlock(&device->compat_devs_mutex); if (cdev) { ib_free_port_attrs(cdev); device_del(&cdev->dev); put_device(&cdev->dev); } } static void remove_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; xa_for_each (&device->compat_devs, index, cdev) remove_one_compat_dev(device, index); } static int add_compat_devs(struct ib_device *device) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; lockdep_assert_held(&devices_rwsem); down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, index, rnet) { ret = add_one_compat_dev(device, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); return ret; } static void remove_all_compat_devs(void) { struct ib_compat_device *cdev; struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { unsigned long c_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&dev->compat_devs, c_index, cdev) remove_one_compat_dev(dev, c_index); up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); } static int add_all_compat_devs(void) { struct rdma_dev_net *rnet; struct ib_device *dev; unsigned long index; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned long net_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, net_index, rnet) { ret = add_one_compat_dev(dev, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); if (ret) remove_all_compat_devs(); return ret; } int rdma_compatdev_set(u8 enable) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; down_write(&rdma_nets_rwsem); if (ib_devices_shared_netns == enable) { up_write(&rdma_nets_rwsem); return 0; } /* enable/disable of compat devices is not supported * when more than default init_net exists. */ xa_for_each (&rdma_nets, index, rnet) { ret++; break; } if (!ret) ib_devices_shared_netns = enable; up_write(&rdma_nets_rwsem); if (ret) return -EBUSY; if (enable) ret = add_all_compat_devs(); else remove_all_compat_devs(); return ret; } static void rdma_dev_exit_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); struct ib_device *dev; unsigned long index; int ret; down_write(&rdma_nets_rwsem); /* * Prevent the ID from being re-used and hide the id from xa_for_each. */ ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); WARN_ON(ret); up_write(&rdma_nets_rwsem); down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { get_device(&dev->dev); /* * Release the devices_rwsem so that pontentially blocking * device_del, doesn't hold the devices_rwsem for too long. */ up_read(&devices_rwsem); remove_one_compat_dev(dev, rnet->id); /* * If the real device is in the NS then move it back to init. */ rdma_dev_change_netns(dev, net, &init_net); put_device(&dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); rdma_nl_net_exit(rnet); xa_erase(&rdma_nets, rnet->id); } static __net_init int rdma_dev_init_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); unsigned long index; struct ib_device *dev; int ret; write_pnet(&rnet->net, net); ret = rdma_nl_net_init(rnet); if (ret) return ret; /* No need to create any compat devices in default init_net. */ if (net_eq(net, &init_net)) return 0; ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); if (ret) { rdma_nl_net_exit(rnet); return ret; } down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { /* Hold nets_rwsem so that netlink command cannot change * system configuration for device sharing mode. */ down_read(&rdma_nets_rwsem); ret = add_one_compat_dev(dev, rnet); up_read(&rdma_nets_rwsem); if (ret) break; } up_read(&devices_rwsem); if (ret) rdma_dev_exit_net(net); return ret; } /* * Assign the unique string device name and the unique device index. This is * undone by ib_dealloc_device. */ static int assign_name(struct ib_device *device, const char *name) { static u32 last_id; int ret; down_write(&devices_rwsem); /* Assign a unique name to the device */ if (strchr(name, '%')) ret = alloc_name(device, name); else ret = dev_set_name(&device->dev, name); if (ret) goto out; if (__ib_device_get_by_name(dev_name(&device->dev))) { ret = -ENFILE; goto out; } strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, &last_id, GFP_KERNEL); if (ret > 0) ret = 0; out: up_write(&devices_rwsem); return ret; } /* * setup_device() allocates memory and sets up data that requires calling the * device ops, this is the only reason these actions are not done during * ib_alloc_device. It is undone by ib_dealloc_device(). */ static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; ib_device_check_mandatory(device); ret = setup_port_data(device); if (ret) { dev_warn(&device->dev, "Couldn't create per-port data\n"); return ret; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->ops.query_device(device, &device->attrs, &uhw); if (ret) { dev_warn(&device->dev, "Couldn't query the device attributes\n"); return ret; } return 0; } static void disable_device(struct ib_device *device) { u32 cid; WARN_ON(!refcount_read(&device->refcount)); down_write(&devices_rwsem); xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); up_write(&devices_rwsem); /* * Remove clients in LIFO order, see assign_client_id. This could be * more efficient if xarray learns to reverse iterate. Since no new * clients can be added to this ib_device past this point we only need * the maximum possible client_id value here. */ down_read(&clients_rwsem); cid = highest_client_id; up_read(&clients_rwsem); while (cid) { cid--; remove_client_context(device, cid); } ib_cq_pool_cleanup(device); /* Pairs with refcount_set in enable_device */ ib_device_put(device); wait_for_completion(&device->unreg_completion); /* * compat devices must be removed after device refcount drops to zero. * Otherwise init_net() may add more compatdevs after removing compat * devices and before device is disabled. */ remove_compat_devs(device); } /* * An enabled device is visible to all clients and to all the public facing * APIs that return a device pointer. This always returns with a new get, even * if it fails. */ static int enable_device_and_get(struct ib_device *device) { struct ib_client *client; unsigned long index; int ret = 0; /* * One ref belongs to the xa and the other belongs to this * thread. This is needed to guard against parallel unregistration. */ refcount_set(&device->refcount, 2); down_write(&devices_rwsem); xa_set_mark(&devices, device->index, DEVICE_REGISTERED); /* * By using downgrade_write() we ensure that no other thread can clear * DEVICE_REGISTERED while we are completing the client setup. */ downgrade_write(&devices_rwsem); if (device->ops.enable_driver) { ret = device->ops.enable_driver(device); if (ret) goto out; } down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { ret = add_client_context(device, client); if (ret) break; } up_read(&clients_rwsem); if (!ret) ret = add_compat_devs(device); out: up_read(&devices_rwsem); return ret; } static void prevent_dealloc_device(struct ib_device *ib_dev) { } static void ib_device_notify_register(struct ib_device *device) { struct net_device *netdev; u32 port; int ret; down_read(&devices_rwsem); /* Mark for userspace that device is ready */ kobject_uevent(&device->dev.kobj, KOBJ_ADD); ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); if (ret) goto out; rdma_for_each_port(device, port) { netdev = ib_device_get_netdev(device, port); if (!netdev) continue; ret = rdma_nl_notify_event(device, port, RDMA_NETDEV_ATTACH_EVENT); dev_put(netdev); if (ret) goto out; } out: up_read(&devices_rwsem); } /** * ib_register_device - Register an IB device with IB core * @device: Device to register * @name: unique string device name. This may include a '%' which will * cause a unique index to be added to the passed device name. * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB * device will be used. In this case the caller should fully * setup the ibdev for DMA. This usually means using dma_virt_ops. * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). * * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() * asynchronously then the device pointer may become freed as soon as this * function returns. */ int ib_register_device(struct ib_device *device, const char *name, struct device *dma_device) { int ret; ret = assign_name(device, name); if (ret) return ret; /* * If the caller does not provide a DMA capable device then the IB core * will set up ib_sge and scatterlist structures that stash the kernel * virtual address into the address field. */ WARN_ON(dma_device && !dma_device->dma_parms); device->dma_device = dma_device; ret = setup_device(device); if (ret) return ret; ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); return ret; } device->groups[0] = &ib_dev_attr_group; device->groups[1] = device->ops.device_group; ret = ib_setup_device_attrs(device); if (ret) goto cache_cleanup; ib_device_register_rdmacg(device); rdma_counter_init(device); /* * Ensure that ADD uevent is not fired because it * is too early amd device is not initialized yet. */ dev_set_uevent_suppress(&device->dev, true); ret = device_add(&device->dev); if (ret) goto cg_cleanup; ret = ib_setup_port_attrs(&device->coredev); if (ret) { dev_warn(&device->dev, "Couldn't register device with driver model\n"); goto dev_cleanup; } ret = enable_device_and_get(device); if (ret) { void (*dealloc_fn)(struct ib_device *); /* * If we hit this error flow then we don't want to * automatically dealloc the device since the caller is * expected to call ib_dealloc_device() after * ib_register_device() fails. This is tricky due to the * possibility for a parallel unregistration along with this * error flow. Since we have a refcount here we know any * parallel flow is stopped in disable_device and will see the * special dealloc_driver pointer, causing the responsibility to * ib_dealloc_device() to revert back to this thread. */ dealloc_fn = device->ops.dealloc_driver; device->ops.dealloc_driver = prevent_dealloc_device; ib_device_put(device); __ib_unregister_device(device); device->ops.dealloc_driver = dealloc_fn; dev_set_uevent_suppress(&device->dev, false); return ret; } dev_set_uevent_suppress(&device->dev, false); ib_device_notify_register(device); ib_device_put(device); return 0; dev_cleanup: device_del(&device->dev); cg_cleanup: dev_set_uevent_suppress(&device->dev, false); ib_device_unregister_rdmacg(device); cache_cleanup: ib_cache_cleanup_one(device); return ret; } EXPORT_SYMBOL(ib_register_device); /* Callers must hold a get on the device. */ static void __ib_unregister_device(struct ib_device *ib_dev) { struct ib_device *sub, *tmp; mutex_lock(&ib_dev->subdev_lock); list_for_each_entry_safe_reverse(sub, tmp, &ib_dev->subdev_list_head, subdev_list) { list_del(&sub->subdev_list); ib_dev->ops.del_sub_dev(sub); ib_device_put(ib_dev); } mutex_unlock(&ib_dev->subdev_lock); /* * We have a registration lock so that all the calls to unregister are * fully fenced, once any unregister returns the device is truely * unregistered even if multiple callers are unregistering it at the * same time. This also interacts with the registration flow and * provides sane semantics if register and unregister are racing. */ mutex_lock(&ib_dev->unregistration_lock); if (!refcount_read(&ib_dev->refcount)) goto out; disable_device(ib_dev); rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); /* Expedite removing unregistered pointers from the hash table */ free_netdevs(ib_dev); ib_free_port_attrs(&ib_dev->coredev); device_del(&ib_dev->dev); ib_device_unregister_rdmacg(ib_dev); ib_cache_cleanup_one(ib_dev); /* * Drivers using the new flow may not call ib_dealloc_device except * in error unwind prior to registration success. */ if (ib_dev->ops.dealloc_driver && ib_dev->ops.dealloc_driver != prevent_dealloc_device) { WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); ib_dealloc_device(ib_dev); } out: mutex_unlock(&ib_dev->unregistration_lock); } /** * ib_unregister_device - Unregister an IB device * @ib_dev: The device to unregister * * Unregister an IB device. All clients will receive a remove callback. * * Callers should call this routine only once, and protect against races with * registration. Typically it should only be called as part of a remove * callback in an implementation of driver core's struct device_driver and * related. * * If ops.dealloc_driver is used then ib_dev will be freed upon return from * this function. */ void ib_unregister_device(struct ib_device *ib_dev) { get_device(&ib_dev->dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device); /** * ib_unregister_device_and_put - Unregister a device while holding a 'get' * @ib_dev: The device to unregister * * This is the same as ib_unregister_device(), except it includes an internal * ib_device_put() that should match a 'get' obtained by the caller. * * It is safe to call this routine concurrently from multiple threads while * holding the 'get'. When the function returns the device is fully * unregistered. * * Drivers using this flow MUST use the driver_unregister callback to clean up * their resources associated with the device and dealloc it. */ void ib_unregister_device_and_put(struct ib_device *ib_dev) { WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); ib_device_put(ib_dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_and_put); /** * ib_unregister_driver - Unregister all IB devices for a driver * @driver_id: The driver to unregister * * This implements a fence for device unregistration. It only returns once all * devices associated with the driver_id have fully completed their * unregistration and returned from ib_unregister_device*(). * * If device's are not yet unregistered it goes ahead and starts unregistering * them. * * This does not block creation of new devices with the given driver_id, that * is the responsibility of the caller. */ void ib_unregister_driver(enum rdma_driver_id driver_id) { struct ib_device *ib_dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, ib_dev) { if (ib_dev->ops.driver_id != driver_id) continue; get_device(&ib_dev->dev); up_read(&devices_rwsem); WARN_ON(!ib_dev->ops.dealloc_driver); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); } EXPORT_SYMBOL(ib_unregister_driver); static void ib_unregister_work(struct work_struct *work) { struct ib_device *ib_dev = container_of(work, struct ib_device, unregistration_work); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } /** * ib_unregister_device_queued - Unregister a device using a work queue * @ib_dev: The device to unregister * * This schedules an asynchronous unregistration using a WQ for the device. A * driver should use this to avoid holding locks while doing unregistration, * such as holding the RTNL lock. * * Drivers using this API must use ib_unregister_driver before module unload * to ensure that all scheduled unregistrations have completed. */ void ib_unregister_device_queued(struct ib_device *ib_dev) { WARN_ON(!refcount_read(&ib_dev->refcount)); WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_queued); /* * The caller must pass in a device that has the kref held and the refcount * released. If the device is in cur_net and still registered then it is moved * into net. */ static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net) { int ret2 = -EINVAL; int ret; mutex_lock(&device->unregistration_lock); /* * If a device not under ib_device_get() or if the unregistration_lock * is not held, the namespace can be changed, or it can be unregistered. * Check again under the lock. */ if (refcount_read(&device->refcount) == 0 || !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { ret = -ENODEV; goto out; } kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); disable_device(device); /* * At this point no one can be using the device, so it is safe to * change the namespace. */ write_pnet(&device->coredev.rdma_net, net); down_read(&devices_rwsem); /* * Currently rdma devices are system wide unique. So the device name * is guaranteed free in the new namespace. Publish the new namespace * at the sysfs level. */ ret = device_rename(&device->dev, dev_name(&device->dev)); up_read(&devices_rwsem); if (ret) { dev_warn(&device->dev, "%s: Couldn't rename device after namespace change\n", __func__); /* Try and put things back and re-enable the device */ write_pnet(&device->coredev.rdma_net, cur_net); } ret2 = enable_device_and_get(device); if (ret2) { /* * This shouldn't really happen, but if it does, let the user * retry at later point. So don't disable the device. */ dev_warn(&device->dev, "%s: Couldn't re-enable device after namespace change\n", __func__); } kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_put(device); out: mutex_unlock(&device->unregistration_lock); if (ret) return ret; return ret2; } int ib_device_set_netns_put(struct sk_buff *skb, struct ib_device *dev, u32 ns_fd) { struct net *net; int ret; net = get_net_ns_by_fd(ns_fd); if (IS_ERR(net)) { ret = PTR_ERR(net); goto net_err; } if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; goto ns_err; } /* * All the ib_clients, including uverbs, are reset when the namespace is * changed and this cannot be blocked waiting for userspace to do * something, so disassociation is mandatory. */ if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { ret = -EOPNOTSUPP; goto ns_err; } get_device(&dev->dev); ib_device_put(dev); ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); put_device(&dev->dev); put_net(net); return ret; ns_err: put_net(net); net_err: ib_device_put(dev); return ret; } static struct pernet_operations rdma_dev_net_ops = { .init = rdma_dev_init_net, .exit = rdma_dev_exit_net, .id = &rdma_dev_net_id, .size = sizeof(struct rdma_dev_net), }; static int assign_client_id(struct ib_client *client) { int ret; lockdep_assert_held(&clients_rwsem); /* * The add/remove callbacks must be called in FIFO/LIFO order. To * achieve this we assign client_ids so they are sorted in * registration order. */ client->client_id = highest_client_id; ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); if (ret) return ret; highest_client_id++; xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); return 0; } static void remove_client_id(struct ib_client *client) { down_write(&clients_rwsem); xa_erase(&clients, client->client_id); for (; highest_client_id; highest_client_id--) if (xa_load(&clients, highest_client_id - 1)) break; up_write(&clients_rwsem); } /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; unsigned long index; bool need_unreg = false; int ret; refcount_set(&client->uses, 1); init_completion(&client->uses_zero); /* * The devices_rwsem is held in write mode to ensure that a racing * ib_register_device() sees a consisent view of clients and devices. */ down_write(&devices_rwsem); down_write(&clients_rwsem); ret = assign_client_id(client); if (ret) goto out; need_unreg = true; xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { ret = add_client_context(device, client); if (ret) goto out; } ret = 0; out: up_write(&clients_rwsem); up_write(&devices_rwsem); if (need_unreg && ret) ib_unregister_client(client); return ret; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. * * This is a full fence, once it returns no client callbacks will be called, * or are running in another thread. */ void ib_unregister_client(struct ib_client *client) { struct ib_device *device; unsigned long index; down_write(&clients_rwsem); ib_client_put(client); xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&clients_rwsem); /* We do not want to have locks while calling client->remove() */ rcu_read_lock(); xa_for_each (&devices, index, device) { if (!ib_device_try_get(device)) continue; rcu_read_unlock(); remove_client_context(device, client->client_id); ib_device_put(device); rcu_read_lock(); } rcu_read_unlock(); /* * remove_client_context() is not a fence, it can return even though a * removal is ongoing. Wait until all removals are completed. */ wait_for_completion(&client->uses_zero); remove_client_id(client); } EXPORT_SYMBOL(ib_unregister_client); static int __ib_get_global_client_nl_info(const char *client_name, struct ib_client_nl_info *res) { struct ib_client *client; unsigned long index; int ret = -ENOENT; down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { if (strcmp(client->name, client_name) != 0) continue; if (!client->get_global_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_global_nl_info(res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&clients_rwsem); return ret; } static int __ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { unsigned long index; void *client_data; int ret = -ENOENT; down_read(&ibdev->client_data_rwsem); xan_for_each_marked (&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || strcmp(client->name, client_name) != 0) continue; if (!client->get_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_nl_info(ibdev, client_data, res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; /* * The cdev is guaranteed valid as long as we are inside the * client_data_rwsem as remove_one can't be called. Keep it * valid for the caller. */ if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&ibdev->client_data_rwsem); return ret; } /** * ib_get_client_nl_info - Fetch the nl_info from a client * @ibdev: IB device * @client_name: Name of the client * @res: Result of the query */ int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { int ret; if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); #ifdef CONFIG_MODULES if (ret == -ENOENT) { request_module("rdma-client-%s", client_name); if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); } #endif if (ret) { if (ret == -ENOENT) return -EOPNOTSUPP; return ret; } if (WARN_ON(!res->cdev)) return -EINVAL; return 0; } /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context data that can be retrieved with * ib_get_client_data(). This can only be called while the client is * registered to the device, once the ib_client remove() callback returns this * cannot be called. */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { void *rc; if (WARN_ON(IS_ERR(data))) data = NULL; rc = xa_store(&device->client_data, client->client_id, data, GFP_KERNEL); WARN_ON(xa_is_err(rc)); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback occurs in workqueue context. */ void ib_register_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). */ void ib_unregister_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_del(&event_handler->list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_unregister_event_handler); void ib_dispatch_event_clients(struct ib_event *event) { struct ib_event_handler *handler; down_read(&event->device->event_handler_rwsem); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); up_read(&event->device->event_handler_rwsem); } static int iw_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { struct in_device *inetdev; struct net_device *netdev; memset(port_attr, 0, sizeof(*port_attr)); netdev = ib_device_get_netdev(device, port_num); if (!netdev) return -ENODEV; port_attr->max_mtu = IB_MTU_4096; port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); if (!netif_carrier_ok(netdev)) { port_attr->state = IB_PORT_DOWN; port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; } else { rcu_read_lock(); inetdev = __in_dev_get_rcu(netdev); if (inetdev && inetdev->ifa_list) { port_attr->state = IB_PORT_ACTIVE; port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; } else { port_attr->state = IB_PORT_INIT; port_attr->phys_state = IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; } rcu_read_unlock(); } dev_put(netdev); return device->ops.query_port(device, port_num, port_attr); } static int __ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { int err; memset(port_attr, 0, sizeof(*port_attr)); err = device->ops.query_port(device, port_num, port_attr); if (err || port_attr->subnet_prefix) return err; if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) return 0; ib_get_cached_subnet_prefix(device, port_num, &port_attr->subnet_prefix); return 0; } /** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. */ int ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (rdma_protocol_iwarp(device, port_num)) return iw_query_port(device, port_num, port_attr); else return __ib_query_port(device, port_num, port_attr); } EXPORT_SYMBOL(ib_query_port); static void add_ndev_hash(struct ib_port_data *pdata) { unsigned long flags; might_sleep(); spin_lock_irqsave(&ndev_hash_lock, flags); if (hash_hashed(&pdata->ndev_hash_link)) { hash_del_rcu(&pdata->ndev_hash_link); spin_unlock_irqrestore(&ndev_hash_lock, flags); /* * We cannot do hash_add_rcu after a hash_del_rcu until the * grace period */ synchronize_rcu(); spin_lock_irqsave(&ndev_hash_lock, flags); } if (pdata->netdev) hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, (uintptr_t)pdata->netdev); spin_unlock_irqrestore(&ndev_hash_lock, flags); } /** * ib_device_set_netdev - Associate the ib_dev with an underlying net_device * @ib_dev: Device to modify * @ndev: net_device to affiliate, may be NULL * @port: IB port the net_device is connected to * * Drivers should use this to link the ib_device to a netdev so the netdev * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be * affiliated with any port. * * The caller must ensure that the given ndev is not unregistered or * unregistering, and that either the ib_device is unregistered or * ib_device_set_netdev() is called with NULL when the ndev sends a * NETDEV_UNREGISTER event. */ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, u32 port) { enum rdma_nl_notify_event_type etype; struct net_device *old_ndev; struct ib_port_data *pdata; unsigned long flags; int ret; if (!rdma_is_port_valid(ib_dev, port)) return -EINVAL; /* * Drivers wish to call this before ib_register_driver, so we have to * setup the port data early. */ ret = alloc_port_data(ib_dev); if (ret) return ret; pdata = &ib_dev->port_data[port]; spin_lock_irqsave(&pdata->netdev_lock, flags); old_ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (old_ndev == ndev) { spin_unlock_irqrestore(&pdata->netdev_lock, flags); return 0; } rcu_assign_pointer(pdata->netdev, ndev); netdev_put(old_ndev, &pdata->netdev_tracker); netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); /* Make sure that the device is registered before we send events */ if (xa_load(&devices, ib_dev->index) != ib_dev) return 0; etype = ndev ? RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; rdma_nl_notify_event(ib_dev, port, etype); return 0; } EXPORT_SYMBOL(ib_device_set_netdev); static void free_netdevs(struct ib_device *ib_dev) { unsigned long flags; u32 port; if (!ib_dev->port_data) return; rdma_for_each_port (ib_dev, port) { struct ib_port_data *pdata = &ib_dev->port_data[port]; struct net_device *ndev; spin_lock_irqsave(&pdata->netdev_lock, flags); ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (ndev) { spin_lock(&ndev_hash_lock); hash_del_rcu(&pdata->ndev_hash_link); spin_unlock(&ndev_hash_lock); /* * If this is the last dev_put there is still a * synchronize_rcu before the netdev is kfreed, so we * can continue to rely on unlocked pointer * comparisons after the put */ rcu_assign_pointer(pdata->netdev, NULL); netdev_put(ndev, &pdata->netdev_tracker); } spin_unlock_irqrestore(&pdata->netdev_lock, flags); } } struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port) { struct ib_port_data *pdata; struct net_device *res; if (!rdma_is_port_valid(ib_dev, port)) return NULL; if (!ib_dev->port_data) return NULL; pdata = &ib_dev->port_data[port]; /* * New drivers should use ib_device_set_netdev() not the legacy * get_netdev(). */ if (ib_dev->ops.get_netdev) res = ib_dev->ops.get_netdev(ib_dev, port); else { spin_lock(&pdata->netdev_lock); res = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); dev_hold(res); spin_unlock(&pdata->netdev_lock); } return res; } EXPORT_SYMBOL(ib_device_get_netdev); /** * ib_query_netdev_port - Query the port number of a net_device * associated with an ibdev * @ibdev: IB device * @ndev: Network device * @port: IB port the net_device is connected to */ int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, u32 *port) { struct net_device *ib_ndev; u32 port_num; rdma_for_each_port(ibdev, port_num) { ib_ndev = ib_device_get_netdev(ibdev, port_num); if (ndev == ib_ndev) { *port = port_num; dev_put(ib_ndev); return 0; } dev_put(ib_ndev); } return -ENOENT; } EXPORT_SYMBOL(ib_query_netdev_port); /** * ib_device_get_by_netdev - Find an IB device associated with a netdev * @ndev: netdev to locate * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device that is associated with a netdev via * ib_device_set_netdev(). The caller must call ib_device_put() on the * returned pointer. */ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id) { struct ib_device *res = NULL; struct ib_port_data *cur; rcu_read_lock(); hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, (uintptr_t)ndev) { if (rcu_access_pointer(cur->netdev) == ndev && (driver_id == RDMA_DRIVER_UNKNOWN || cur->ib_dev->ops.driver_id == driver_id) && ib_device_try_get(cur->ib_dev)) { res = cur->ib_dev; break; } } rcu_read_unlock(); return res; } EXPORT_SYMBOL(ib_device_get_by_netdev); /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all of the physical RoCE ports of ib_dev * which are related to netdevice and calls callback() on each * device for which filter() function returns non zero. */ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { u32 port; rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { struct net_device *idev = ib_device_get_netdev(ib_dev, port); if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); dev_put(idev); } } /** * ib_enum_all_roce_netdevs - enumerate all RoCE devices * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all RoCE devices' physical ports which are related * to netdevices and calls callback() on each device for which * filter() function returns non zero. */ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); up_read(&devices_rwsem); } /* * ib_enum_all_devs - enumerate all ib_devices * @cb: Callback to call for each found ib_device * * Enumerates all ib_devices and calls callback() on each device. */ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb) { unsigned long index; struct ib_device *dev; unsigned int idx = 0; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) continue; ret = nldev_cb(dev, skb, cb, idx); if (ret) break; idx++; } up_read(&devices_rwsem); return ret; } /** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, u32 port_num, u16 index, u16 *pkey) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (!device->ops.query_pkey) return -EOPNOTSUPP; return device->ops.query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { if (!device->ops.modify_device) return -EOPNOTSUPP; return device->ops.modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. */ int ib_modify_port(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { int rc; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (device->ops.modify_port) rc = device->ops.modify_port(device, port_num, port_modify_mask, port_modify); else if (rdma_protocol_roce(device, port_num) && ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) rc = 0; else rc = -EOPNOTSUPP; return rc; } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. Its searches only for IB link layer. * @device: The device to query. * @gid: The GID value to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u32 *port_num, u16 *index) { union ib_gid tmp_gid; u32 port; int ret, i; rdma_for_each_port (device, port) { if (!rdma_protocol_ib(device, port)) continue; for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) continue; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; int partial_ix = -1; for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { /* if there is full-member pkey take it.*/ if (tmp_pkey & 0x8000) { *index = i; return 0; } if (partial_ix < 0) partial_ix = i; } } /*no full-member, if exists take the limited*/ if (partial_ix >= 0) { *index = partial_ix; return 0; } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); /** * ib_get_net_dev_by_params() - Return the appropriate net_dev * for a received CM request * @dev: An RDMA device on which the request has been received. * @port: Port number on the RDMA device. * @pkey: The Pkey the request came on. * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. * */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr) { struct net_device *net_dev = NULL; unsigned long index; void *client_data; if (!rdma_protocol_ib(dev, port)) return NULL; /* * Holding the read side guarantees that the client will not become * unregistered while we are calling get_net_dev_by_params() */ down_read(&dev->client_data_rwsem); xan_for_each_marked (&dev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->get_net_dev_by_params) continue; net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, addr, client_data); if (net_dev) break; } up_read(&dev->client_data_rwsem); return net_dev; } EXPORT_SYMBOL(ib_get_net_dev_by_params); void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) { struct ib_device_ops *dev_ops = &dev->ops; #define SET_DEVICE_OP(ptr, name) \ do { \ if (ops->name) \ if (!((ptr)->name)) \ (ptr)->name = ops->name; \ } while (0) #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && dev_ops->driver_id != ops->driver_id); dev_ops->driver_id = ops->driver_id; } if (ops->owner) { WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); dev_ops->owner = ops->owner; } if (ops->uverbs_abi_ver) dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; dev_ops->uverbs_no_driver_id_binding |= ops->uverbs_no_driver_id_binding; SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, add_sub_dev); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); SET_DEVICE_OP(dev_ops, alloc_mr); SET_DEVICE_OP(dev_ops, alloc_mr_integrity); SET_DEVICE_OP(dev_ops, alloc_mw); SET_DEVICE_OP(dev_ops, alloc_pd); SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); SET_DEVICE_OP(dev_ops, alloc_ucontext); SET_DEVICE_OP(dev_ops, alloc_xrcd); SET_DEVICE_OP(dev_ops, attach_mcast); SET_DEVICE_OP(dev_ops, check_mr_status); SET_DEVICE_OP(dev_ops, counter_alloc_stats); SET_DEVICE_OP(dev_ops, counter_bind_qp); SET_DEVICE_OP(dev_ops, counter_dealloc); SET_DEVICE_OP(dev_ops, counter_init); SET_DEVICE_OP(dev_ops, counter_unbind_qp); SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); SET_DEVICE_OP(dev_ops, create_flow); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); SET_DEVICE_OP(dev_ops, create_srq); SET_DEVICE_OP(dev_ops, create_user_ah); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); SET_DEVICE_OP(dev_ops, dealloc_ucontext); SET_DEVICE_OP(dev_ops, dealloc_xrcd); SET_DEVICE_OP(dev_ops, del_gid); SET_DEVICE_OP(dev_ops, del_sub_dev); SET_DEVICE_OP(dev_ops, dereg_mr); SET_DEVICE_OP(dev_ops, destroy_ah); SET_DEVICE_OP(dev_ops, destroy_counters); SET_DEVICE_OP(dev_ops, destroy_cq); SET_DEVICE_OP(dev_ops, destroy_flow); SET_DEVICE_OP(dev_ops, destroy_flow_action); SET_DEVICE_OP(dev_ops, destroy_qp); SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); SET_DEVICE_OP(dev_ops, destroy_srq); SET_DEVICE_OP(dev_ops, destroy_wq); SET_DEVICE_OP(dev_ops, device_group); SET_DEVICE_OP(dev_ops, detach_mcast); SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); SET_DEVICE_OP(dev_ops, drain_sq); SET_DEVICE_OP(dev_ops, enable_driver); SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_mr_entry); SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_qp_entry); SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_srq_entry); SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); SET_DEVICE_OP(dev_ops, get_hw_stats); SET_DEVICE_OP(dev_ops, get_link_layer); SET_DEVICE_OP(dev_ops, get_netdev); SET_DEVICE_OP(dev_ops, get_numa_node); SET_DEVICE_OP(dev_ops, get_port_immutable); SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_guid); SET_DEVICE_OP(dev_ops, get_vf_stats); SET_DEVICE_OP(dev_ops, iw_accept); SET_DEVICE_OP(dev_ops, iw_add_ref); SET_DEVICE_OP(dev_ops, iw_connect); SET_DEVICE_OP(dev_ops, iw_create_listen); SET_DEVICE_OP(dev_ops, iw_destroy_listen); SET_DEVICE_OP(dev_ops, iw_get_qp); SET_DEVICE_OP(dev_ops, iw_reject); SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_mr_sg_pi); SET_DEVICE_OP(dev_ops, mmap); SET_DEVICE_OP(dev_ops, mmap_free); SET_DEVICE_OP(dev_ops, modify_ah); SET_DEVICE_OP(dev_ops, modify_cq); SET_DEVICE_OP(dev_ops, modify_device); SET_DEVICE_OP(dev_ops, modify_hw_stat); SET_DEVICE_OP(dev_ops, modify_port); SET_DEVICE_OP(dev_ops, modify_qp); SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); SET_DEVICE_OP(dev_ops, poll_cq); SET_DEVICE_OP(dev_ops, port_groups); SET_DEVICE_OP(dev_ops, post_recv); SET_DEVICE_OP(dev_ops, post_send); SET_DEVICE_OP(dev_ops, post_srq_recv); SET_DEVICE_OP(dev_ops, process_mad); SET_DEVICE_OP(dev_ops, query_ah); SET_DEVICE_OP(dev_ops, query_device); SET_DEVICE_OP(dev_ops, query_gid); SET_DEVICE_OP(dev_ops, query_pkey); SET_DEVICE_OP(dev_ops, query_port); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); SET_DEVICE_OP(dev_ops, query_ucontext); SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); SET_DEVICE_OP(dev_ops, read_counters); SET_DEVICE_OP(dev_ops, reg_dm_mr); SET_DEVICE_OP(dev_ops, reg_user_mr); SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); SET_DEVICE_OP(dev_ops, req_notify_cq); SET_DEVICE_OP(dev_ops, rereg_user_mr); SET_DEVICE_OP(dev_ops, resize_cq); SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); SET_DEVICE_OP(dev_ops, report_port_event); SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); SET_OBJ_SIZE(dev_ops, ib_cq); SET_OBJ_SIZE(dev_ops, ib_mw); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_qp); SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); SET_OBJ_SIZE(dev_ops, ib_xrcd); SET_OBJ_SIZE(dev_ops, rdma_counter); } EXPORT_SYMBOL(ib_set_device_ops); int ib_add_sub_device(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name) { struct ib_device *sub; int ret = 0; if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) return -EOPNOTSUPP; if (!ib_device_try_get(parent)) return -EINVAL; sub = parent->ops.add_sub_dev(parent, type, name); if (IS_ERR(sub)) { ib_device_put(parent); return PTR_ERR(sub); } sub->type = type; sub->parent = parent; mutex_lock(&parent->subdev_lock); list_add_tail(&parent->subdev_list_head, &sub->subdev_list); mutex_unlock(&parent->subdev_lock); return ret; } EXPORT_SYMBOL(ib_add_sub_device); int ib_del_sub_device_and_put(struct ib_device *sub) { struct ib_device *parent = sub->parent; if (!parent) return -EOPNOTSUPP; mutex_lock(&parent->subdev_lock); list_del(&sub->subdev_list); mutex_unlock(&parent->subdev_lock); ib_device_put(sub); parent->ops.del_sub_dev(sub); ib_device_put(parent); return 0; } EXPORT_SYMBOL(ib_del_sub_device_and_put); #ifdef CONFIG_INFINIBAND_VIRT_DMA int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) { struct scatterlist *s; int i; for_each_sg(sg, s, nents, i) { sg_dma_address(s) = (uintptr_t)sg_virt(s); sg_dma_len(s) = s->length; } return nents; } EXPORT_SYMBOL(ib_dma_virt_map_sg); #endif /* CONFIG_INFINIBAND_VIRT_DMA */ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { [RDMA_NL_LS_OP_RESOLVE] = { .doit = ib_nl_handle_resolve_resp, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_SET_TIMEOUT] = { .doit = ib_nl_handle_set_timeout, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_IP_RESOLVE] = { .doit = ib_nl_handle_ip_res_resp, .flags = RDMA_NL_ADMIN_PERM, }, }; void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) { enum ib_port_state curr_state; struct ib_event ibevent = {}; u32 port; if (ib_query_netdev_port(ibdev, ndev, &port)) return; curr_state = ib_get_curr_port_state(ndev); write_lock_irq(&ibdev->cache_lock); if (ibdev->port_data[port].cache.last_port_state == curr_state) { write_unlock_irq(&ibdev->cache_lock); return; } ibdev->port_data[port].cache.last_port_state = curr_state; write_unlock_irq(&ibdev->cache_lock); ibevent.event = (curr_state == IB_PORT_DOWN) ? IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; ibevent.device = ibdev; ibevent.element.port_num = port; ib_dispatch_event(&ibevent); } EXPORT_SYMBOL(ib_dispatch_port_state_event); static void handle_port_event(struct net_device *ndev, unsigned long event) { struct ib_device *ibdev; /* Currently, link events in bonding scenarios are still * reported by drivers that support bonding. */ if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) return; ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); if (!ibdev) return; if (ibdev->ops.report_port_event) { ibdev->ops.report_port_event(ibdev, ndev, event); goto put_ibdev; } ib_dispatch_port_state_event(ibdev, ndev); put_ibdev: ib_device_put(ibdev); }; static int ib_netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct ib_device *ibdev; u32 port; switch (event) { case NETDEV_CHANGENAME: ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); if (!ibdev) return NOTIFY_DONE; if (ib_query_netdev_port(ibdev, ndev, &port)) { ib_device_put(ibdev); break; } rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); ib_device_put(ibdev); break; case NETDEV_UP: case NETDEV_CHANGE: case NETDEV_DOWN: handle_port_event(ndev, event); break; default: break; } return NOTIFY_DONE; } static struct notifier_block nb_netdevice = { .notifier_call = ib_netdevice_event, }; static int __init ib_core_init(void) { int ret = -ENOMEM; ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); if (!ib_unreg_wq) goto err; ib_comp_wq = alloc_workqueue("ib-comp-wq", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); if (!ib_comp_wq) goto err_unbound; ib_comp_unbound_wq = alloc_workqueue("ib-comp-unb-wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); if (!ib_comp_unbound_wq) goto err_comp; ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); goto err_comp_unbound; } rdma_nl_init(); ret = addr_init(); if (ret) { pr_warn("Couldn't init IB address resolution\n"); goto err_ibnl; } ret = ib_mad_init(); if (ret) { pr_warn("Couldn't init IB MAD\n"); goto err_addr; } ret = ib_sa_init(); if (ret) { pr_warn("Couldn't init SA\n"); goto err_mad; } ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. ret %d\n", ret); goto err_sa; } ret = register_pernet_device(&rdma_dev_net_ops); if (ret) { pr_warn("Couldn't init compat dev. ret %d\n", ret); goto err_compat; } nldev_init(); rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); ret = roce_gid_mgmt_init(); if (ret) { pr_warn("Couldn't init RoCE GID management\n"); goto err_parent; } register_netdevice_notifier(&nb_netdevice); return 0; err_parent: rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); err_compat: unregister_blocking_lsm_notifier(&ibdev_lsm_nb); err_sa: ib_sa_cleanup(); err_mad: ib_mad_cleanup(); err_addr: addr_cleanup(); err_ibnl: class_unregister(&ib_class); err_comp_unbound: destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); err_unbound: destroy_workqueue(ib_unreg_wq); err: destroy_workqueue(ib_wq); return ret; } static void __exit ib_core_cleanup(void) { unregister_netdevice_notifier(&nb_netdevice); roce_gid_mgmt_cleanup(); rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); unregister_blocking_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); rdma_nl_exit(); class_unregister(&ib_class); destroy_workqueue(ib_comp_unbound_wq); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); destroy_workqueue(ib_unreg_wq); WARN_ON(!xa_empty(&clients)); WARN_ON(!xa_empty(&devices)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); /* ib core relies on netdev stack to first register net_ns_type_operations * ns kobject type before ib_core initialization. */ fs_initcall(ib_core_init); module_exit(ib_core_cleanup);
1 6 1 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 // SPDX-License-Identifier: GPL-2.0 /* Generic nexthop implementation * * Copyright (c) 2017-19 Cumulus Networks * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> */ #include <linux/nexthop.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/arp.h> #include <net/ipv6_stubs.h> #include <net/lwtunnel.h> #include <net/ndisc.h> #include <net/nexthop.h> #include <net/route.h> #include <net/sock.h> #define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ) #define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. */ static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo); #define NH_DEV_HASHBITS 8 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) #define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS | \ NHA_OP_FLAG_DUMP_HW_STATS) static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, [NHA_BLACKHOLE] = { .type = NLA_FLAG }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_GATEWAY] = { .type = NLA_BINARY }, [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_RES_GROUP] = { .type = NLA_NESTED }, [NHA_HW_STATS_ENABLE] = NLA_POLICY_MAX(NLA_U32, true), }; static const struct nla_policy rtm_nh_policy_get[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32, NHA_OP_FLAGS_DUMP_ALL), }; static const struct nla_policy rtm_nh_policy_del[] = { [NHA_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump[] = { [NHA_OIF] = { .type = NLA_U32 }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32, NHA_OP_FLAGS_DUMP_ALL), }; static const struct nla_policy rtm_nh_res_policy_new[] = { [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 }, [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 }, [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = { [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_get_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_get[] = { [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 }, }; static bool nexthop_notifiers_is_empty(struct net *net) { return !net->nexthop.notifier_chain.head; } static void __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, const struct nh_info *nhi) { nh_info->dev = nhi->fib_nhc.nhc_dev; nh_info->gw_family = nhi->fib_nhc.nhc_gw_family; if (nh_info->gw_family == AF_INET) nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4; else if (nh_info->gw_family == AF_INET6) nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6; nh_info->id = nhi->nh_parent->id; nh_info->is_reject = nhi->reject_nh; nh_info->is_fdb = nhi->fdb_nh; nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate; } static int nh_notifier_single_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); if (!info->nh) return -ENOMEM; __nh_notifier_single_info_init(info->nh, nhi); return 0; } static void nh_notifier_single_info_fini(struct nh_notifier_info *info) { kfree(info->nh); } static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { u16 num_nh = nhg->num_nh; int i; info->type = NH_NOTIFIER_INFO_TYPE_GRP; info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh), GFP_KERNEL); if (!info->nh_grp) return -ENOMEM; info->nh_grp->num_nh = num_nh; info->nh_grp->is_fdb = nhg->fdb_nh; info->nh_grp->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; nhi = rtnl_dereference(nhge->nh->nh_info); info->nh_grp->nh_entries[i].weight = nhge->weight; __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, nhi); } return 0; } static int nh_notifier_res_table_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); u16 num_nh_buckets = res_table->num_nh_buckets; unsigned long size; u16 i; info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE; size = struct_size(info->nh_res_table, nhs, num_nh_buckets); info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!info->nh_res_table) return -ENOMEM; info->nh_res_table->num_nh_buckets = num_nh_buckets; info->nh_res_table->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; struct nh_info *nhi; nhge = rtnl_dereference(bucket->nh_entry); nhi = rtnl_dereference(nhge->nh->nh_info); __nh_notifier_single_info_init(&info->nh_res_table->nhs[i], nhi); } return 0; } static int nh_notifier_grp_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) return nh_notifier_mpath_info_init(info, nhg); else if (nhg->resilient) return nh_notifier_res_table_info_init(info, nhg); return -EINVAL; } static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) kfree(info->nh_grp); else if (nhg->resilient) vfree(info->nh_res_table); } static int nh_notifier_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { info->id = nh->id; if (nh->is_group) return nh_notifier_grp_info_init(info, nh); else return nh_notifier_single_info_init(info, nh); } static void nh_notifier_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { if (nh->is_group) nh_notifier_grp_info_fini(info, nh); else nh_notifier_single_info_fini(info); } static int call_nexthop_notifiers(struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_info_init(&info, nh); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static int nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info, bool force, unsigned int *p_idle_timer_ms) { struct nh_res_table *res_table; struct nh_group *nhg; struct nexthop *nh; int err = 0; /* When 'force' is false, nexthop bucket replacement is performed * because the bucket was deemed to be idle. In this case, capable * listeners can choose to perform an atomic replacement: The bucket is * only replaced if it is inactive. However, if the idle timer interval * is smaller than the interval in which a listener is querying * buckets' activity from the device, then atomic replacement should * not be tried. Pass the idle timer value to listeners, so that they * could determine which type of replacement to perform. */ if (force) { *p_idle_timer_ms = 0; return 0; } rcu_read_lock(); nh = nexthop_find_by_id(info->net, info->id); if (!nh) { err = -EINVAL; goto out; } nhg = rcu_dereference(nh->nh_grp); res_table = rcu_dereference(nhg->res_table); *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer); out: rcu_read_unlock(); return err; } static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi) { unsigned int idle_timer_ms; int err; err = nh_notifier_res_bucket_idle_timer_get(info, force, &idle_timer_ms); if (err) return err; info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET; info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket), GFP_KERNEL); if (!info->nh_res_bucket) return -ENOMEM; info->nh_res_bucket->bucket_index = bucket_index; info->nh_res_bucket->idle_timer_ms = idle_timer_ms; info->nh_res_bucket->force = force; __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi); __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi); return 0; } static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info) { kfree(info->nh_res_bucket); } static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nhg_id, }; int err; if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_res_bucket_info_init(&info, bucket_index, force, oldi, newi); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_BUCKET_REPLACE, &info); nh_notifier_res_bucket_info_fini(&info); return notifier_to_errno(err); } /* There are three users of RES_TABLE, and NHs etc. referenced from there: * * 1) a collection of callbacks for NH maintenance. This operates under * RTNL, * 2) the delayed work that gradually balances the resilient table, * 3) and nexthop_select_path(), operating under RCU. * * Both the delayed work and the RTNL block are writers, and need to * maintain mutual exclusion. Since there are only two and well-known * writers for each table, the RTNL code can make sure it has exclusive * access thus: * * - Have the DW operate without locking; * - synchronously cancel the DW; * - do the writing; * - if the write was not actually a delete, call upkeep, which schedules * DW again if necessary. * * The functions that are always called from the RTNL context use * rtnl_dereference(). The functions that can also be called from the DW do * a raw dereference and rely on the above mutual exclusion scheme. */ #define nh_res_dereference(p) (rcu_dereference_raw(p)) static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nexthop *old_nh, struct nexthop *new_nh, struct netlink_ext_ack *extack) { struct nh_info *oldi = nh_res_dereference(old_nh->nh_info); struct nh_info *newi = nh_res_dereference(new_nh->nh_info); return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index, force, oldi, newi, extack); } static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nh->id, }; struct nh_group *nhg; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; /* At this point, the nexthop buckets are still not populated. Only * emit a notification with the logical nexthops, so that a listener * could potentially veto it in case of unsupported configuration. */ nhg = rtnl_dereference(nh->nh_grp); err = nh_notifier_mpath_info_init(&info, nhg); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, &info); kfree(info.nh_grp); return notifier_to_errno(err); } static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; err = nh_notifier_info_init(&info, nh); if (err) return err; err = nb->notifier_call(nb, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static unsigned int nh_dev_hashfn(unsigned int val) { unsigned int mask = NH_DEV_HASHSIZE - 1; return (val ^ (val >> NH_DEV_HASHBITS) ^ (val >> (NH_DEV_HASHBITS * 2))) & mask; } static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) { struct net_device *dev = nhi->fib_nhc.nhc_dev; struct hlist_head *head; unsigned int hash; WARN_ON(!dev); hash = nh_dev_hashfn(dev->ifindex); head = &net->nexthop.devhash[hash]; hlist_add_head(&nhi->dev_hash, head); } static void nexthop_free_group(struct nexthop *nh) { struct nh_group *nhg; int i; nhg = rcu_dereference_raw(nh->nh_grp); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; WARN_ON(!list_empty(&nhge->nh_list)); free_percpu(nhge->stats); nexthop_put(nhge->nh); } WARN_ON(nhg->spare == nhg); if (nhg->resilient) vfree(rcu_dereference_raw(nhg->res_table)); kfree(nhg->spare); kfree(nhg); } static void nexthop_free_single(struct nexthop *nh) { struct nh_info *nhi; nhi = rcu_dereference_raw(nh->nh_info); switch (nhi->family) { case AF_INET: fib_nh_release(nh->net, &nhi->fib_nh); break; case AF_INET6: ipv6_stub->fib6_nh_release(&nhi->fib6_nh); break; } kfree(nhi); } void nexthop_free_rcu(struct rcu_head *head) { struct nexthop *nh = container_of(head, struct nexthop, rcu); if (nh->is_group) nexthop_free_group(nh); else nexthop_free_single(nh); kfree(nh); } EXPORT_SYMBOL_GPL(nexthop_free_rcu); static struct nexthop *nexthop_alloc(void) { struct nexthop *nh; nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); if (nh) { INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fdb_list); spin_lock_init(&nh->lock); } return nh; } static struct nh_group *nexthop_grp_alloc(u16 num_nh) { struct nh_group *nhg; nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL); if (nhg) nhg->num_nh = num_nh; return nhg; } static void nh_res_table_upkeep_dw(struct work_struct *work); static struct nh_res_table * nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg) { const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets; struct nh_res_table *res_table; unsigned long size; size = struct_size(res_table, nh_buckets, num_nh_buckets); res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!res_table) return NULL; res_table->net = net; res_table->nhg_id = nhg_id; INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw); INIT_LIST_HEAD(&res_table->uw_nh_entries); res_table->idle_timer = cfg->nh_grp_res_idle_timer; res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; res_table->num_nh_buckets = num_nh_buckets; return res_table; } static void nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) ; } /* no reference taken; rcu lock or rtnl must be held */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id) { struct rb_node **pp, *parent = NULL, *next; pp = &net->nexthop.rb_root.rb_node; while (1) { struct nexthop *nh; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (id < nh->id) pp = &next->rb_left; else if (id > nh->id) pp = &next->rb_right; else return nh; } return NULL; } EXPORT_SYMBOL_GPL(nexthop_find_by_id); /* used for auto id allocation; called with rtnl held */ static u32 nh_find_unused_id(struct net *net) { u32 id_start = net->nexthop.last_id_allocated; while (1) { net->nexthop.last_id_allocated++; if (net->nexthop.last_id_allocated == id_start) break; if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) return net->nexthop.last_id_allocated; } return 0; } static void nh_res_time_set_deadline(unsigned long next_time, unsigned long *deadline) { if (time_before(next_time, *deadline)) *deadline = next_time; } static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table) { if (list_empty(&res_table->uw_nh_entries)) return 0; return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since); } static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); struct nlattr *nest; nest = nla_nest_start(skb, NHA_RES_GROUP); if (!nest) return -EMSGSIZE; if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS, res_table->num_nh_buckets) || nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER, jiffies_to_clock_t(res_table->idle_timer)) || nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER, jiffies_to_clock_t(res_table->unbalanced_timer)) || nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME, nh_res_table_unbalanced_time(res_table), NHA_RES_GROUP_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge) { struct nh_grp_entry_stats *cpu_stats; cpu_stats = get_cpu_ptr(nhge->stats); u64_stats_update_begin(&cpu_stats->syncp); u64_stats_inc(&cpu_stats->packets); u64_stats_update_end(&cpu_stats->syncp); put_cpu_ptr(cpu_stats); } static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge, u64 *ret_packets) { int i; *ret_packets = 0; for_each_possible_cpu(i) { struct nh_grp_entry_stats *cpu_stats; unsigned int start; u64 packets; cpu_stats = per_cpu_ptr(nhge->stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); packets = u64_stats_read(&cpu_stats->packets); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); *ret_packets += packets; } } static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg; int i; ASSERT_RTNL(); nhg = rtnl_dereference(nh->nh_grp); info->id = nh->id; info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS; info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats, stats, nhg->num_nh), GFP_KERNEL); if (!info->nh_grp_hw_stats) return -ENOMEM; info->nh_grp_hw_stats->num_nh = nhg->num_nh; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; info->nh_grp_hw_stats->stats[i].id = nhge->nh->id; } return 0; } static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info) { kfree(info->nh_grp_hw_stats); } void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info, unsigned int nh_idx, u64 delta_packets) { info->hw_stats_used = true; info->stats[nh_idx].packets += delta_packets; } EXPORT_SYMBOL(nh_grp_hw_stats_report_delta); static void nh_grp_hw_stats_apply_update(struct nexthop *nh, struct nh_notifier_info *info) { struct nh_group *nhg; int i; ASSERT_RTNL(); nhg = rtnl_dereference(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhge->packets_hw += info->nh_grp_hw_stats->stats[i].packets; } } static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used) { struct nh_notifier_info info = { .net = nh->net, }; struct net *net = nh->net; int err; if (nexthop_notifiers_is_empty(net)) { *hw_stats_used = false; return 0; } err = nh_notifier_grp_hw_stats_init(&info, nh); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_HW_STATS_REPORT_DELTA, &info); /* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost. */ nh_grp_hw_stats_apply_update(nh, &info); *hw_stats_used = info.nh_grp_hw_stats->hw_stats_used; nh_notifier_grp_hw_stats_fini(&info); return notifier_to_errno(err); } static int nla_put_nh_group_stats_entry(struct sk_buff *skb, struct nh_grp_entry *nhge, u32 op_flags) { struct nlattr *nest; u64 packets; nh_grp_entry_stats_read(nhge, &packets); nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id) || nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS, packets + nhge->packets_hw)) goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW, nhge->packets_hw)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh, u32 op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nlattr *nest; bool hw_stats_used; int err; int i; if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats)) goto err_out; if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && nhg->hw_stats) { err = nh_grp_hw_stats_update(nh, &hw_stats_used); if (err) goto out; if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used)) goto err_out; } nest = nla_nest_start(skb, NHA_GROUP_STATS); if (!nest) goto err_out; for (i = 0; i < nhg->num_nh; i++) if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i], op_flags)) goto cancel_out; nla_nest_end(skb, nest); return 0; cancel_out: nla_nest_cancel(skb, nest); err_out: err = -EMSGSIZE; out: return err; } static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh, u32 op_flags, u32 *resp_op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nexthop_grp *p; size_t len = nhg->num_nh * sizeof(*p); struct nlattr *nla; u16 group_type = 0; u16 weight; int i; *resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0; if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; else if (nhg->resilient) group_type = NEXTHOP_GRP_TYPE_RES; if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) goto nla_put_failure; nla = nla_reserve(skb, NHA_GROUP, len); if (!nla) goto nla_put_failure; p = nla_data(nla); for (i = 0; i < nhg->num_nh; ++i) { weight = nhg->nh_entries[i].weight - 1; *p++ = (struct nexthop_grp) { .id = nhg->nh_entries[i].nh->id, .weight = weight, .weight_high = weight >> 8, }; } if (nhg->resilient && nla_put_nh_group_res(skb, nhg)) goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_STATS && (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) || nla_put_nh_group_stats(skb, nh, op_flags))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, int event, u32 portid, u32 seq, unsigned int nlflags, u32 op_flags) { struct fib6_nh *fib6_nh; struct fib_nh *fib_nh; struct nlmsghdr *nlh; struct nh_info *nhi; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = nh->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); u32 resp_op_flags = 0; if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) || nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags)) goto nla_put_failure; goto out; } nhi = rtnl_dereference(nh->nh_info); nhm->nh_family = nhi->family; if (nhi->reject_nh) { if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; } else if (nhi->fdb_nh) { if (nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; } else { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) goto nla_put_failure; } nhm->nh_scope = nhi->fib_nhc.nhc_scope; switch (nhi->family) { case AF_INET: fib_nh = &nhi->fib_nh; if (fib_nh->fib_nh_gw_family && nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) goto nla_put_failure; break; case AF_INET6: fib6_nh = &nhi->fib6_nh; if (fib6_nh->fib_nh_gw_family && nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) goto nla_put_failure; break; } if (nhi->fib_nhc.nhc_lwtstate && lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, NHA_ENCAP, NHA_ENCAP_TYPE) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg) { return nla_total_size(0) + /* NHA_RES_GROUP */ nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */ nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */ nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */ nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */ } static size_t nh_nlmsg_size_grp(struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; size_t tot = nla_total_size(sz) + nla_total_size(2); /* NHA_GROUP_TYPE */ if (nhg->resilient) tot += nh_nlmsg_size_grp_res(nhg); return tot; } static size_t nh_nlmsg_size_single(struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); size_t sz; /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE * are mutually exclusive */ sz = nla_total_size(4); /* NHA_OIF */ switch (nhi->family) { case AF_INET: if (nhi->fib_nh.fib_nh_gw_family) sz += nla_total_size(4); /* NHA_GATEWAY */ break; case AF_INET6: /* NHA_GATEWAY */ if (nhi->fib6_nh.fib_nh_gw_family) sz += nla_total_size(sizeof(const struct in6_addr)); break; } if (nhi->fib_nhc.nhc_lwtstate) { sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ } return sz; } static size_t nh_nlmsg_size(struct nexthop *nh) { size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) sz += nh_nlmsg_size_grp(nh) + nla_total_size(4) + /* NHA_OP_FLAGS */ 0; else sz += nh_nlmsg_size_single(nh); return sz; } static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) { unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); if (!skb) goto errout; err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags, 0); if (err < 0) { /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, info->nlh, gfp_any()); return; errout: rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) { return (unsigned long)atomic_long_read(&bucket->used_time); } static unsigned long nh_res_bucket_idle_point(const struct nh_res_table *res_table, const struct nh_res_bucket *bucket, unsigned long now) { unsigned long time = nh_res_bucket_used_time(bucket); /* Bucket was not used since it was migrated. The idle time is now. */ if (time == bucket->migrated_time) return now; return time + res_table->idle_timer; } static unsigned long nh_res_table_unb_point(const struct nh_res_table *res_table) { return res_table->unbalanced_since + res_table->unbalanced_timer; } static void nh_res_bucket_set_idle(const struct nh_res_table *res_table, struct nh_res_bucket *bucket) { unsigned long now = jiffies; atomic_long_set(&bucket->used_time, (long)now); bucket->migrated_time = now; } static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket) { atomic_long_set(&bucket->used_time, (long)jiffies); } static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket) { unsigned long used_time = nh_res_bucket_used_time(bucket); return jiffies_delta_to_clock_t(jiffies - used_time); } static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh, struct nh_res_bucket *bucket, u16 bucket_index, int event, u32 portid, u32 seq, unsigned int nlflags, struct netlink_ext_ack *extack) { struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nlmsghdr *nlh; struct nlattr *nest; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = bucket->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; nest = nla_nest_start(skb, NHA_RES_BUCKET); if (!nest) goto nla_put_failure; if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) || nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) || nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME, nh_res_bucket_idle_time(bucket), NHA_RES_BUCKET_PAD)) goto nla_put_failure_nest; nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; nla_put_failure_nest: nla_nest_cancel(skb, nest); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void nexthop_bucket_notify(struct nh_res_table *res_table, u16 bucket_index) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nexthop *nh = nhge->nh_parent; struct sk_buff *skb; int err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto errout; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE, NULL); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); } static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); /* Nesting groups within groups is not supported. */ if (nhg->hash_threshold) { NL_SET_ERR_MSG(extack, "Hash-threshold group can not be a nexthop within a group"); return false; } if (nhg->resilient) { NL_SET_ERR_MSG(extack, "Resilient group can not be a nexthop within a group"); return false; } *is_fdb = nhg->fdb_nh; } else { struct nh_info *nhi = rtnl_dereference(nh->nh_info); if (nhi->reject_nh && npaths > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be used in a group with more than 1 path"); return false; } *is_fdb = nhi->fdb_nh; } return true; } static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, struct netlink_ext_ack *extack) { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (!nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); return -EINVAL; } if (*nh_family == AF_UNSPEC) { *nh_family = nhi->family; } else if (*nh_family != nhi->family) { NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); return -EINVAL; } return 0; } static int nh_check_attr_group(struct net *net, struct nlattr *tb[], size_t tb_size, u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); struct nexthop_grp *nhg; unsigned int i, j; if (!len || len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, "Invalid length for nexthop group attribute"); return -EINVAL; } /* convert len to number of nexthop ids */ len /= sizeof(*nhg); nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { if (nhg[i].resvd2) { NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0"); return -EINVAL; } if (nexthop_grp_weight(&nhg[i]) == 0) { /* 0xffff got passed in, representing weight of 0x10000, * which is too heavy. */ NL_SET_ERR_MSG(extack, "Invalid value for weight"); return -EINVAL; } for (j = i + 1; j < len; ++j) { if (nhg[i].id == nhg[j].id) { NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); return -EINVAL; } } } nhg = nla_data(tb[NHA_GROUP]); for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; switch (i) { case NHA_HW_STATS_ENABLE: case NHA_FDB: continue; case NHA_RES_GROUP: if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) continue; break; } NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; } return 0; } static int nh_check_attr_group_rtnl(struct net *net, struct nlattr *tb[], struct netlink_ext_ack *extack) { u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int len; unsigned int i; u8 nhg_fdb; len = nla_len(tb[NHA_GROUP]) / sizeof(*nhg); nhg = nla_data(tb[NHA_GROUP]); nhg_fdb = !!tb[NHA_FDB]; for (i = 0; i < len; i++) { struct nexthop *nh; bool is_fdb_nh; nh = nexthop_find_by_id(net, nhg[i].id); if (!nh) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (!valid_group_nh(nh, len, &is_fdb_nh, extack)) return -EINVAL; if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) return -EINVAL; if (!nhg_fdb && is_fdb_nh) { NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); return -EINVAL; } } return 0; } static bool ipv6_good_nh(const struct fib6_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool ipv4_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, (__force u32)nh->fib_nh_gw4); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool nexthop_is_good_nh(const struct nexthop *nh) { struct nh_info *nhi = rcu_dereference(nh->nh_info); switch (nhi->family) { case AF_INET: return ipv4_good_nh(&nhi->fib_nh); case AF_INET6: return ipv6_good_nh(&nhi->fib6_nh); } return false; } static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) { int i; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nh_grp_entry_stats_inc(nhge); return nhge->nh; } WARN_ON_ONCE(1); return NULL; } static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nh_grp_entry *nhge0 = NULL; int i; if (nhg->fdb_nh) return nexthop_select_path_fdb(nhg, hash); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ if (!nexthop_is_good_nh(nhge->nh)) continue; if (!nhge0) nhge0 = nhge; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nh_grp_entry_stats_inc(nhge); return nhge->nh; } if (!nhge0) nhge0 = &nhg->nh_entries[0]; nh_grp_entry_stats_inc(nhge0); return nhge0->nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) { struct nh_res_table *res_table = rcu_dereference(nhg->res_table); u16 bucket_index = hash % res_table->num_nh_buckets; struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; /* nexthop_select_path() is expected to return a non-NULL value, so * skip protocol validation and just hand out whatever there is. */ bucket = &res_table->nh_buckets[bucket_index]; nh_res_bucket_set_busy(bucket); nhge = rcu_dereference(bucket->nh_entry); nh_grp_entry_stats_inc(nhge); return nhge->nh; } struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) { struct nh_group *nhg; if (!nh->is_group) return nh; nhg = rcu_dereference(nh->nh_grp); if (nhg->hash_threshold) return nexthop_select_path_hthr(nhg, hash); else if (nhg->resilient) return nexthop_select_path_res(nhg, hash); /* Unreachable. */ return NULL; } EXPORT_SYMBOL_GPL(nexthop_select_path); int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg) { struct nh_info *nhi; int err; if (nh->is_group) { struct nh_group *nhg; int i; nhg = rcu_dereference_rtnl(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhi = rcu_dereference_rtnl(nhge->nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } } else { nhi = rcu_dereference_rtnl(nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); static int check_src_addr(const struct in6_addr *saddr, struct netlink_ext_ack *extack) { if (!ipv6_addr_any(saddr)) { NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); return -EINVAL; } return 0; } int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; bool is_fdb_nh; /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source * routing it can not use nexthop objects. mlxsw also does not allow * fib6_src on routes. */ if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) return -EINVAL; if (nh->is_group) { struct nh_group *nhg; nhg = rcu_dereference_rtnl(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; is_fdb_nh = nhg->fdb_nh; } else { nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; is_fdb_nh = nhi->fdb_nh; } if (is_fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); return -EINVAL; } return 0; no_v4_nh: NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); return -EINVAL; } EXPORT_SYMBOL_GPL(fib6_check_nexthop); /* if existing nexthop has ipv6 routes linked to it, need * to verify this new spec works with ipv6 */ static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib6_info *f6i; if (list_empty(&old->f6i_list)) return 0; list_for_each_entry(f6i, &old->f6i_list, nh_list) { if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) return -EINVAL; } return fib6_check_nexthop(new, NULL, extack); } static int nexthop_check_scope(struct nh_info *nhi, u8 scope, struct netlink_ext_ack *extack) { if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); return -EINVAL; } if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); return -EINVAL; } return 0; } /* Invoked by fib add code to verify nexthop by id is ok with * config for prefix; parts of fib_check_nh not done when nexthop * object is used. */ int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack) { struct nh_info *nhi; int err = 0; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } if (scope == RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); err = -EINVAL; goto out; } /* all nexthops in a group have the same scope */ nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info); err = nexthop_check_scope(nhi, scope, extack); } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } err = nexthop_check_scope(nhi, scope, extack); } out: return err; } static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib_info *fi; list_for_each_entry(fi, &old->fi_list, nh_list) { int err; err = fib_check_nexthop(new, fi->fib_scope, extack); if (err) return err; } return 0; } static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets == nhge->res.wants_buckets; } static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets > nhge->res.wants_buckets; } static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets < nhge->res.wants_buckets; } static bool nh_res_table_is_balanced(const struct nh_res_table *res_table) { return list_empty(&res_table->uw_nh_entries); } static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket) { struct nh_grp_entry *nhge; if (bucket->occupied) { nhge = nh_res_dereference(bucket->nh_entry); nhge->res.count_buckets--; bucket->occupied = false; } } static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket, struct nh_grp_entry *nhge) { nh_res_bucket_unset_nh(bucket); bucket->occupied = true; rcu_assign_pointer(bucket->nh_entry, nhge); nhge->res.count_buckets++; } static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table, struct nh_res_bucket *bucket, unsigned long *deadline, bool *force) { unsigned long now = jiffies; struct nh_grp_entry *nhge; unsigned long idle_point; if (!bucket->occupied) { /* The bucket is not occupied, its NHGE pointer is either * NULL or obsolete. We _have to_ migrate: set force. */ *force = true; return true; } nhge = nh_res_dereference(bucket->nh_entry); /* If the bucket is populated by an underweight or balanced * nexthop, do not migrate. */ if (!nh_res_nhge_is_ow(nhge)) return false; /* At this point we know that the bucket is populated with an * overweight nexthop. It needs to be migrated to a new nexthop if * the idle timer of unbalanced timer expired. */ idle_point = nh_res_bucket_idle_point(res_table, bucket, now); if (time_after_eq(now, idle_point)) { /* The bucket is idle. We _can_ migrate: unset force. */ *force = false; return true; } /* Unbalanced timer of 0 means "never force". */ if (res_table->unbalanced_timer) { unsigned long unb_point; unb_point = nh_res_table_unb_point(res_table); if (time_after(now, unb_point)) { /* The bucket is not idle, but the unbalanced timer * expired. We _can_ migrate, but set force anyway, * so that drivers know to ignore activity reports * from the HW. */ *force = true; return true; } nh_res_time_set_deadline(unb_point, deadline); } nh_res_time_set_deadline(idle_point, deadline); return false; } static bool nh_res_bucket_migrate(struct nh_res_table *res_table, u16 bucket_index, bool notify, bool notify_nl, bool force) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *new_nhge; struct netlink_ext_ack extack; int err; new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, struct nh_grp_entry, res.uw_nh_entry); if (WARN_ON_ONCE(!new_nhge)) /* If this function is called, "bucket" is either not * occupied, or it belongs to a next hop that is * overweight. In either case, there ought to be a * corresponding underweight next hop. */ return false; if (notify) { struct nh_grp_entry *old_nhge; old_nhge = nh_res_dereference(bucket->nh_entry); err = call_nexthop_res_bucket_notifiers(res_table->net, res_table->nhg_id, bucket_index, force, old_nhge->nh, new_nhge->nh, &extack); if (err) { pr_err_ratelimited("%s\n", extack._msg); if (!force) return false; /* It is not possible to veto a forced replacement, so * just clear the hardware flags from the nexthop * bucket to indicate to user space that this bucket is * not correctly populated in hardware. */ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); } } nh_res_bucket_set_nh(bucket, new_nhge); nh_res_bucket_set_idle(res_table, bucket); if (notify_nl) nexthop_bucket_notify(res_table, bucket_index); if (nh_res_nhge_is_balanced(new_nhge)) list_del(&new_nhge->res.uw_nh_entry); return true; } #define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) static void nh_res_table_upkeep(struct nh_res_table *res_table, bool notify, bool notify_nl) { unsigned long now = jiffies; unsigned long deadline; u16 i; /* Deadline is the next time that upkeep should be run. It is the * earliest time at which one of the buckets might be migrated. * Start at the most pessimistic estimate: either unbalanced_timer * from now, or if there is none, idle_timer from now. For each * encountered time point, call nh_res_time_set_deadline() to * refine the estimate. */ if (res_table->unbalanced_timer) deadline = now + res_table->unbalanced_timer; else deadline = now + res_table->idle_timer; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; bool force; if (nh_res_bucket_should_migrate(res_table, bucket, &deadline, &force)) { if (!nh_res_bucket_migrate(res_table, i, notify, notify_nl, force)) { unsigned long idle_point; /* A driver can override the migration * decision if the HW reports that the * bucket is actually not idle. Therefore * remark the bucket as busy again and * update the deadline. */ nh_res_bucket_set_busy(bucket); idle_point = nh_res_bucket_idle_point(res_table, bucket, now); nh_res_time_set_deadline(idle_point, &deadline); } } } /* If the group is still unbalanced, schedule the next upkeep to * either the deadline computed above, or the minimum deadline, * whichever comes later. */ if (!nh_res_table_is_balanced(res_table)) { unsigned long now = jiffies; unsigned long min_deadline; min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; if (time_before(deadline, min_deadline)) deadline = min_deadline; queue_delayed_work(system_power_efficient_wq, &res_table->upkeep_dw, deadline - now); } } static void nh_res_table_upkeep_dw(struct work_struct *work) { struct delayed_work *dw = to_delayed_work(work); struct nh_res_table *res_table; res_table = container_of(dw, struct nh_res_table, upkeep_dw); nh_res_table_upkeep(res_table, true, true); } static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) { cancel_delayed_work_sync(&res_table->upkeep_dw); } static void nh_res_group_rebalance(struct nh_group *nhg, struct nh_res_table *res_table) { u16 prev_upper_bound = 0; u32 total = 0; u32 w = 0; int i; INIT_LIST_HEAD(&res_table->uw_nh_entries); for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; u16 upper_bound; u64 btw; w += nhge->weight; btw = ((u64)res_table->num_nh_buckets) * w; upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total); nhge->res.wants_buckets = upper_bound - prev_upper_bound; prev_upper_bound = upper_bound; if (nh_res_nhge_is_uw(nhge)) { if (list_empty(&res_table->uw_nh_entries)) res_table->unbalanced_since = jiffies; list_add(&nhge->res.uw_nh_entry, &res_table->uw_nh_entries); } } } /* Migrate buckets in res_table so that they reference NHGE's from NHG with * the right NH ID. Set those buckets that do not have a corresponding NHGE * entry in NHG as not occupied. */ static void nh_res_table_migrate_buckets(struct nh_res_table *res_table, struct nh_group *nhg) { u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; bool found = false; int j; for (j = 0; j < nhg->num_nh; j++) { struct nh_grp_entry *nhge = &nhg->nh_entries[j]; if (nhge->nh->id == id) { nh_res_bucket_set_nh(bucket, nhge); found = true; break; } } if (!found) nh_res_bucket_unset_nh(bucket); } } static void replace_nexthop_grp_res(struct nh_group *oldg, struct nh_group *newg) { /* For NH group replacement, the new NHG might only have a stub * hash table with 0 buckets, because the number of buckets was not * specified. For NH removal, oldg and newg both reference the same * res_table. So in any case, in the following, we want to work * with oldg->res_table. */ struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); unsigned long prev_unbalanced_since = old_res_table->unbalanced_since; bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries); nh_res_table_cancel_upkeep(old_res_table); nh_res_table_migrate_buckets(old_res_table, newg); nh_res_group_rebalance(newg, old_res_table); if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries)) old_res_table->unbalanced_since = prev_unbalanced_since; nh_res_table_upkeep(old_res_table, true, false); } static void nh_hthr_group_rebalance(struct nh_group *nhg) { u32 total = 0; u32 w = 0; int i; for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; u32 upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; atomic_set(&nhge->hthr.upper_bound, upper_bound); } } static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, struct nl_info *nlinfo) { struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; struct netlink_ext_ack extack; struct nexthop *nh = nhge->nh; struct nh_group *nhg, *newg; int i, j, err; WARN_ON(!nh); nhg = rtnl_dereference(nhp->nh_grp); newg = nhg->spare; /* last entry, keep it visible and remove the parent */ if (nhg->num_nh == 1) { remove_nexthop(net, nhp, nlinfo); return; } newg->has_v4 = false; newg->is_multipath = nhg->is_multipath; newg->hash_threshold = nhg->hash_threshold; newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; /* copy old entries to new except the one getting removed */ nhges = nhg->nh_entries; new_nhges = newg->nh_entries; for (i = 0, j = 0; i < nhg->num_nh; ++i) { struct nh_info *nhi; /* current nexthop getting removed */ if (nhg->nh_entries[i].nh == nh) { newg->num_nh--; continue; } nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) newg->has_v4 = true; list_del(&nhges[i].nh_list); new_nhges[j].stats = nhges[i].stats; new_nhges[j].nh_parent = nhges[i].nh_parent; new_nhges[j].nh = nhges[i].nh; new_nhges[j].weight = nhges[i].weight; list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list); j++; } if (newg->hash_threshold) nh_hthr_group_rebalance(newg); else if (newg->resilient) replace_nexthop_grp_res(nhg, newg); rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); free_percpu(nhge->stats); nexthop_put(nhge->nh); /* Removal of a NH from a resilient group is notified through * bucket notifications. */ if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); if (err) pr_err("%s\n", extack._msg); } if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); } static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { struct nh_grp_entry *nhge, *tmp; list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) remove_nh_grp_entry(net, nhge, nlinfo); /* make sure all see the newly published array before releasing rtnl */ synchronize_net(); } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); struct nh_res_table *res_table; int i, num_nh = nhg->num_nh; for (i = 0; i < num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (WARN_ON(!nhge->nh)) continue; list_del_init(&nhge->nh_list); } if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); nh_res_table_cancel_upkeep(res_table); } } /* not called for nexthop replace */ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { struct fib6_info *f6i; bool do_flush = false; struct fib_info *fi; list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; } if (do_flush) fib_flush(net); spin_lock_bh(&nh->lock); nh->dead = true; while (!list_empty(&nh->f6i_list)) { f6i = list_first_entry(&nh->f6i_list, typeof(*f6i), nh_list); /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); spin_unlock_bh(&nh->lock); ipv6_stub->ip6_del_rt(net, f6i, !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); spin_lock_bh(&nh->lock); } spin_unlock_bh(&nh->lock); } static void __remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { __remove_nexthop_fib(net, nh); if (nh->is_group) { remove_nexthop_group(nh, nlinfo); } else { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (nhi->fib_nhc.nhc_dev) hlist_del(&nhi->dev_hash); remove_nexthop_from_groups(net, nh, nlinfo); } } static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL); /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); if (nlinfo) nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); __remove_nexthop(net, nh, nlinfo); nh_base_seq_inc(net); nexthop_put(nh); } /* if any FIB entries reference this nexthop, any dst entries * need to be regenerated */ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh, struct nexthop *replaced_nh) { struct fib6_info *f6i; struct nh_group *nhg; int i; if (!list_empty(&nh->fi_list)) rt_cache_flush(net); list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_update_sernum(net, f6i); /* if an IPv6 group was replaced, we have to release all old * dsts to make sure all refcounts are released */ if (!replaced_nh->is_group) return; nhg = rtnl_dereference(replaced_nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info); if (nhi->family == AF_INET6) ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh); } } static int replace_nexthop_grp(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_res_table *tmp_table = NULL; struct nh_res_table *new_res_table; struct nh_res_table *old_res_table; struct nh_group *oldg, *newg; int i, err; if (!new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); return -EINVAL; } oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); if (newg->hash_threshold != oldg->hash_threshold) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); return -EINVAL; } if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; } else if (newg->resilient) { new_res_table = rtnl_dereference(newg->res_table); old_res_table = rtnl_dereference(oldg->res_table); /* Accept if num_nh_buckets was not given, but if it was * given, demand that the value be correct. */ if (cfg->nh_grp_res_has_num_buckets && cfg->nh_grp_res_num_buckets != old_res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); return -EINVAL; } /* Emit a pre-replace notification so that listeners could veto * a potentially unsupported configuration. Otherwise, * individual bucket replacement notifications would need to be * vetoed, which is something that should only happen if the * bucket is currently active. */ err = call_nexthop_res_table_notifiers(net, new, extack); if (err) return err; if (cfg->nh_grp_res_has_idle_timer) old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; if (cfg->nh_grp_res_has_unbalanced_timer) old_res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; replace_nexthop_grp_res(oldg, newg); tmp_table = new_res_table; rcu_assign_pointer(newg->res_table, old_res_table); rcu_assign_pointer(newg->spare->res_table, old_res_table); } /* update parents - used by nexthop code for cleanup */ for (i = 0; i < newg->num_nh; i++) newg->nh_entries[i].nh_parent = old; rcu_assign_pointer(old->nh_grp, newg); /* Make sure concurrent readers are not using 'oldg' anymore. */ synchronize_net(); if (newg->resilient) { rcu_assign_pointer(oldg->res_table, tmp_table); rcu_assign_pointer(oldg->spare->res_table, tmp_table); } for (i = 0; i < oldg->num_nh; i++) oldg->nh_entries[i].nh_parent = new; rcu_assign_pointer(new->nh_grp, oldg); return 0; } static void nh_group_v4_update(struct nh_group *nhg) { struct nh_grp_entry *nhges; bool has_v4 = false; int i; nhges = nhg->nh_entries; for (i = 0; i < nhg->num_nh; i++) { struct nh_info *nhi; nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) has_v4 = true; } nhg->has_v4 = has_v4; } static int replace_nexthop_single_notify_res(struct net *net, struct nh_res_table *res_table, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { u32 nhg_id = res_table->nhg_id; int err; u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) { err = __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, oldi, newi, extack); if (err) goto err_notify; } } return 0; err_notify: while (i-- > 0) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, newi, oldi, extack); } return err; } static int replace_nexthop_single_notify(struct net *net, struct nexthop *group_nh, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); struct nh_res_table *res_table; if (nhg->hash_threshold) { return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, group_nh, extack); } else if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); return replace_nexthop_single_notify_res(net, res_table, old, oldi, newi, extack); } return -EINVAL; } static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { u8 old_protocol, old_nh_flags; struct nh_info *oldi, *newi; struct nh_grp_entry *nhge; int err; if (new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); return -EINVAL; } err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; /* Hardware flags were set on 'old' as 'new' is not in the red-black * tree. Therefore, inherit the flags from 'old' to 'new'. */ new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP); oldi = rtnl_dereference(old->nh_info); newi = rtnl_dereference(new->nh_info); newi->nh_parent = old; oldi->nh_parent = new; old_protocol = old->protocol; old_nh_flags = old->nh_flags; old->protocol = new->protocol; old->nh_flags = new->nh_flags; rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); /* Send a replace notification for all the groups using the nexthop. */ list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; err = replace_nexthop_single_notify(net, nhp, old, oldi, newi, extack); if (err) goto err_notify; } /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially * update IPv4 indication in all the groups using the nexthop. */ if (oldi->family == AF_INET && newi->family == AF_INET6) { list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; struct nh_group *nhg; nhg = rtnl_dereference(nhp->nh_grp); nh_group_v4_update(nhg); } } return 0; err_notify: rcu_assign_pointer(new->nh_info, newi); rcu_assign_pointer(old->nh_info, oldi); old->nh_flags = old_nh_flags; old->protocol = old_protocol; oldi->nh_parent = old; newi->nh_parent = new; list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL); } call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); return err; } static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct fib6_info *f6i; if (!list_empty(&nh->fi_list)) { struct fib_info *fi; /* expectation is a few fib_info per nexthop and then * a lot of routes per fib_info. So mark the fib_info * and then walk the fib tables once */ list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = true; fib_info_notify_update(net, info); list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = false; } list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_rt_update(net, f6i, info); } /* send RTM_NEWROUTE with REPLACE flag set for all FIB entries * linked to this nexthop and for all groups that the nexthop * is a member of */ static void nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct nh_grp_entry *nhge; __nexthop_replace_notify(net, nh, info); list_for_each_entry(nhge, &nh->grp_list, nh_list) __nexthop_replace_notify(net, nhge->nh_parent, info); } static int replace_nexthop(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { bool new_is_reject = false; struct nh_grp_entry *nhge; int err; /* check that existing FIB entries are ok with the * new nexthop definition */ err = fib_check_nh_list(old, new, extack); if (err) return err; err = fib6_check_nh_list(old, new, extack); if (err) return err; if (!new->is_group) { struct nh_info *nhi = rtnl_dereference(new->nh_info); new_is_reject = nhi->reject_nh; } list_for_each_entry(nhge, &old->grp_list, nh_list) { /* if new nexthop is a blackhole, any groups using this * nexthop cannot have more than 1 path */ if (new_is_reject && nexthop_num_path(nhge->nh_parent) > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); return -EINVAL; } err = fib_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; err = fib6_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; } if (old->is_group) err = replace_nexthop_grp(net, old, new, cfg, extack); else err = replace_nexthop_single(net, old, new, extack); if (!err) { nh_rt_cache_flush(net, old, new); __remove_nexthop(net, new, NULL); nexthop_put(new); } return err; } /* called with rtnl_lock held */ static int insert_nexthop(struct net *net, struct nexthop *new_nh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct rb_node **pp, *parent = NULL, *next; struct rb_root *root = &net->nexthop.rb_root; bool replace = !!(cfg->nlflags & NLM_F_REPLACE); bool create = !!(cfg->nlflags & NLM_F_CREATE); u32 new_id = new_nh->id; int replace_notify = 0; int rc = -EEXIST; pp = &root->rb_node; while (1) { struct nexthop *nh; next = *pp; if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (new_id < nh->id) { pp = &next->rb_left; } else if (new_id > nh->id) { pp = &next->rb_right; } else if (replace) { rc = replace_nexthop(net, nh, new_nh, cfg, extack); if (!rc) { new_nh = nh; /* send notification with old nh */ replace_notify = 1; } goto out; } else { /* id already exists and not a replace */ goto out; } } if (replace && !create) { NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); rc = -ENOENT; goto out; } if (new_nh->is_group) { struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp); struct nh_res_table *res_table; if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); /* Not passing the number of buckets is OK when * replacing, but not when creating a new group. */ if (!cfg->nh_grp_res_has_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion"); rc = -EINVAL; goto out; } nh_res_group_rebalance(nhg, res_table); /* Do not send bucket notifications, we do full * notification below. */ nh_res_table_upkeep(res_table, false, false); } } rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); /* The initial insertion is a full notification for hash-threshold as * well as resilient groups. */ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); if (rc) rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); if (replace_notify && READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } return rc; } /* rtnl */ /* remove all nexthops tied to a device being deleted */ static void nexthop_flush_dev(struct net_device *dev, unsigned long event) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev != dev) continue; if (nhi->reject_nh && (event == NETDEV_DOWN || event == NETDEV_CHANGE)) continue; remove_nexthop(net, nhi->nh_parent, NULL); } } /* rtnl; called when net namespace is deleted */ static void flush_all_nexthops(struct net *net) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; struct nexthop *nh; while ((node = rb_first(root))) { nh = rb_entry(node, struct nexthop, rb_node); remove_nexthop(net, nh, NULL); cond_resched(); } } static struct nexthop *nexthop_create_group(struct net *net, struct nh_config *cfg) { struct nlattr *grps_attr = cfg->nh_grp; struct nexthop_grp *entry = nla_data(grps_attr); u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; int err; int i; nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nh->is_group = 1; nhg = nexthop_grp_alloc(num_nh); if (!nhg) { kfree(nh); return ERR_PTR(-ENOMEM); } /* spare group used for removals */ nhg->spare = nexthop_grp_alloc(num_nh); if (!nhg->spare) { kfree(nhg); kfree(nh); return ERR_PTR(-ENOMEM); } nhg->spare->spare = nhg; for (i = 0; i < nhg->num_nh; ++i) { struct nexthop *nhe; struct nh_info *nhi; nhe = nexthop_find_by_id(net, entry[i].id); if (!nexthop_get(nhe)) { err = -ENOENT; goto out_no_nh; } nhi = rtnl_dereference(nhe->nh_info); if (nhi->family == AF_INET) nhg->has_v4 = true; nhg->nh_entries[i].stats = netdev_alloc_pcpu_stats(struct nh_grp_entry_stats); if (!nhg->nh_entries[i].stats) { err = -ENOMEM; nexthop_put(nhe); goto out_no_nh; } nhg->nh_entries[i].nh = nhe; nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]); list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); nhg->nh_entries[i].nh_parent = nh; } if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { nhg->hash_threshold = 1; nhg->is_multipath = true; } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { struct nh_res_table *res_table; res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg); if (!res_table) { err = -ENOMEM; goto out_no_nh; } rcu_assign_pointer(nhg->spare->res_table, res_table); rcu_assign_pointer(nhg->res_table, res_table); nhg->resilient = true; nhg->is_multipath = true; } WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); if (nhg->hash_threshold) nh_hthr_group_rebalance(nhg); if (cfg->nh_fdb) nhg->fdb_nh = 1; if (cfg->nh_hw_stats) nhg->hw_stats = true; rcu_assign_pointer(nh->nh_grp, nhg); return nh; out_no_nh: for (i--; i >= 0; --i) { list_del(&nhg->nh_entries[i].nh_list); free_percpu(nhg->nh_entries[i].stats); nexthop_put(nhg->nh_entries[i].nh); } kfree(nhg->spare); kfree(nhg); kfree(nh); return ERR_PTR(err); } static int nh_create_ipv4(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib_nh *fib_nh = &nhi->fib_nh; struct fib_config fib_cfg = { .fc_oif = cfg->nh_ifindex, .fc_gw4 = cfg->gw.ipv4, .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); if (err) { fib_nh_release(net, fib_nh); goto out; } if (nhi->fdb_nh) goto out; /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { nh->nh_flags = fib_nh->fib_nh_flags; fib_info_update_nhc_saddr(net, &fib_nh->nh_common, !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1); } else { fib_nh_release(net, fib_nh); } out: return err; } static int nh_create_ipv6(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib6_nh *fib6_nh = &nhi->fib6_nh; struct fib6_config fib6_cfg = { .fc_table = l3mdev_fib_table(cfg->dev), .fc_ifindex = cfg->nh_ifindex, .fc_gateway = cfg->gw.ipv6, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, .fc_is_fdb = cfg->nh_fdb, }; int err; if (!ipv6_addr_any(&cfg->gw.ipv6)) fib6_cfg.fc_flags |= RTF_GATEWAY; /* sets nh_dev if successful */ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, extack); if (err) { /* IPv6 is not enabled, don't call fib6_nh_release */ if (err == -EAFNOSUPPORT) goto out; ipv6_stub->fib6_nh_release(fib6_nh); } else { nh->nh_flags = fib6_nh->fib_nh_flags; } out: return err; } static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; struct nexthop *nh; int err = 0; nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); if (!nhi) { kfree(nh); return ERR_PTR(-ENOMEM); } nh->nh_flags = cfg->nh_flags; nh->net = net; nhi->nh_parent = nh; nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; if (cfg->nh_fdb) nhi->fdb_nh = 1; if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; } switch (cfg->nh_family) { case AF_INET: err = nh_create_ipv4(net, nh, nhi, cfg, extack); break; case AF_INET6: err = nh_create_ipv6(net, nh, nhi, cfg, extack); break; } if (err) { kfree(nhi); kfree(nh); return ERR_PTR(err); } /* add the entry to the device based hash */ if (!nhi->fdb_nh) nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); return nh; } /* called with rtnl lock held */ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nexthop *nh; int err; if (!cfg->nh_id) { cfg->nh_id = nh_find_unused_id(net); if (!cfg->nh_id) { NL_SET_ERR_MSG(extack, "No unused id"); return ERR_PTR(-EINVAL); } } if (cfg->nh_grp) nh = nexthop_create_group(net, cfg); else nh = nexthop_create(net, cfg, extack); if (IS_ERR(nh)) return nh; refcount_set(&nh->refcnt, 1); nh->id = cfg->nh_id; nh->protocol = cfg->nh_protocol; nh->net = net; err = insert_nexthop(net, nh, cfg, extack); if (err) { __remove_nexthop(net, nh, NULL); nexthop_put(nh); nh = ERR_PTR(err); } return nh; } static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback, unsigned long *timer_p, bool *has_p, struct netlink_ext_ack *extack) { unsigned long timer; u32 value; if (!attr) { *timer_p = fallback; *has_p = false; return 0; } value = nla_get_u32(attr); timer = clock_t_to_jiffies(value); if (timer == ~0UL) { NL_SET_ERR_MSG(extack, "Timer value too large"); return -EINVAL; } *timer_p = timer; *has_p = true; return 0; } static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; int err; if (res) { err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_policy_new) - 1, res, rtm_nh_res_policy_new, extack); if (err < 0) return err; } if (tb[NHA_RES_GROUP_BUCKETS]) { cfg->nh_grp_res_num_buckets = nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]); cfg->nh_grp_res_has_num_buckets = true; if (!cfg->nh_grp_res_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0"); return -EINVAL; } } err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER], NH_RES_DEFAULT_IDLE_TIMER, &cfg->nh_grp_res_idle_timer, &cfg->nh_grp_res_has_idle_timer, extack); if (err) return err; return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER], NH_RES_DEFAULT_UNBALANCED_TIMER, &cfg->nh_grp_res_unbalanced_timer, &cfg->nh_grp_res_has_unbalanced_timer, extack); } static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **tb, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); int err; err = -EINVAL; if (nhm->resvd || nhm->nh_scope) { NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); goto out; } if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); goto out; } switch (nhm->nh_family) { case AF_INET: case AF_INET6: break; case AF_UNSPEC: if (tb[NHA_GROUP]) break; fallthrough; default: NL_SET_ERR_MSG(extack, "Invalid address family"); goto out; } memset(cfg, 0, sizeof(*cfg)); cfg->nlflags = nlh->nlmsg_flags; cfg->nlinfo.portid = NETLINK_CB(skb).portid; cfg->nlinfo.nlh = nlh; cfg->nlinfo.nl_net = net; cfg->nh_family = nhm->nh_family; cfg->nh_protocol = nhm->nh_protocol; cfg->nh_flags = nhm->nh_flags; if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); if (tb[NHA_FDB]) { if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); goto out; } if (nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); goto out; } cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); } if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); goto out; } cfg->nh_grp = tb[NHA_GROUP]; cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; if (tb[NHA_GROUP_TYPE]) cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } err = nh_check_attr_group(net, tb, ARRAY_SIZE(rtm_nh_policy_new), cfg->nh_grp_type, extack); if (err) goto out; if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP], cfg, extack); if (tb[NHA_HW_STATS_ENABLE]) cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]); /* no other attributes should be set */ goto out; } if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); goto out; } cfg->nh_blackhole = 1; err = 0; goto out; } if (!cfg->nh_fdb && !tb[NHA_OIF]) { NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } err = -EINVAL; if (tb[NHA_GATEWAY]) { struct nlattr *gwa = tb[NHA_GATEWAY]; switch (cfg->nh_family) { case AF_INET: if (nla_len(gwa) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv4 = nla_get_be32(gwa); break; case AF_INET6: if (nla_len(gwa) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv6 = nla_get_in6_addr(gwa); break; default: NL_SET_ERR_MSG(extack, "Unknown address family for gateway"); goto out; } } else { /* device only nexthop (no gateway) */ if (cfg->nh_flags & RTNH_F_ONLINK) { NL_SET_ERR_MSG(extack, "ONLINK flag can not be set for nexthop without a gateway"); goto out; } } if (tb[NHA_ENCAP]) { cfg->nh_encap = tb[NHA_ENCAP]; if (!tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); goto out; } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); if (err < 0) goto out; } else if (tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); goto out; } if (tb[NHA_HW_STATS_ENABLE]) { NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops"); goto out; } err = 0; out: return err; } static int rtm_to_nh_config_rtnl(struct net *net, struct nlattr **tb, struct nh_config *cfg, struct netlink_ext_ack *extack) { if (tb[NHA_GROUP]) return nh_check_attr_group_rtnl(net, tb, extack); if (tb[NHA_OIF]) { cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); if (cfg->nh_ifindex) cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); if (!cfg->dev) { NL_SET_ERR_MSG(extack, "Invalid device index"); return -EINVAL; } if (!(cfg->dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); return -ENETDOWN; } if (!netif_carrier_ok(cfg->dev)) { NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); return -ENETDOWN; } } return 0; } /* rtnl */ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; struct net *net = sock_net(skb->sk); struct nh_config cfg; struct nexthop *nh; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_new) - 1, rtm_nh_policy_new, extack); if (err < 0) goto out; err = rtm_to_nh_config(net, skb, nlh, tb, &cfg, extack); if (err) goto out; if (cfg.nlflags & NLM_F_REPLACE && !cfg.nh_id) { NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); err = -EINVAL; goto out; } rtnl_net_lock(net); err = rtm_to_nh_config_rtnl(net, tb, &cfg, extack); if (err) goto unlock; nh = nexthop_add(net, &cfg, extack); if (IS_ERR(nh)) err = PTR_ERR(nh); unlock: rtnl_net_unlock(net); out: return err; } static int nh_valid_get_del_req(const struct nlmsghdr *nlh, struct nlattr **tb, u32 *id, u32 *op_flags, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header"); return -EINVAL; } if (!tb[NHA_ID]) { NL_SET_ERR_MSG(extack, "Nexthop id is missing"); return -EINVAL; } *id = nla_get_u32(tb[NHA_ID]); if (!(*id)) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (op_flags) *op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0); return 0; } /* rtnl */ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_del)]; struct net *net = sock_net(skb->sk); struct nl_info nlinfo = { .nlh = nlh, .nl_net = net, .portid = NETLINK_CB(skb).portid, }; struct nexthop *nh; int err; u32 id; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_del) - 1, rtm_nh_policy_del, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, &id, NULL, extack); if (err) return err; rtnl_net_lock(net); nh = nexthop_find_by_id(net, id); if (nh) remove_nexthop(net, nh, &nlinfo); else err = -ENOENT; rtnl_net_unlock(net); return err; } /* rtnl */ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)]; struct net *net = sock_net(in_skb->sk); struct sk_buff *skb = NULL; struct nexthop *nh; u32 op_flags; int err; u32 id; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack); if (err) return err; err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto out; err = -ENOENT; nh = nexthop_find_by_id(net, id); if (!nh) goto errout_free; err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, op_flags); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: return err; errout_free: kfree_skb(skb); goto out; } struct nh_dump_filter { u32 nh_id; int dev_idx; int master_idx; bool group_filter; bool fdb_filter; u32 res_bucket_nh_id; u32 op_flags; }; static bool nh_dump_filtered(struct nexthop *nh, struct nh_dump_filter *filter, u8 family) { const struct net_device *dev; const struct nh_info *nhi; if (filter->group_filter && !nh->is_group) return true; if (!filter->dev_idx && !filter->master_idx && !family) return false; if (nh->is_group) return true; nhi = rtnl_dereference(nh->nh_info); if (family && nhi->family != family) return true; dev = nhi->fib_nhc.nhc_dev; if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx)) return true; if (filter->master_idx) { struct net_device *master; if (!dev) return true; master = netdev_master_upper_dev_get((struct net_device *)dev); if (!master || master->ifindex != filter->master_idx) return true; } return false; } static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb, struct nh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nhmsg *nhm; u32 idx; if (tb[NHA_OIF]) { idx = nla_get_u32(tb[NHA_OIF]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid device index"); return -EINVAL; } filter->dev_idx = idx; } if (tb[NHA_MASTER]) { idx = nla_get_u32(tb[NHA_MASTER]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid master device index"); return -EINVAL; } filter->master_idx = idx; } filter->group_filter = nla_get_flag(tb[NHA_GROUPS]); filter->fdb_filter = nla_get_flag(tb[NHA_FDB]); nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); return -EINVAL; } return 0; } static int nh_valid_dump_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump) - 1, rtm_nh_policy_dump, cb->extack); if (err < 0) return err; filter->op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0); return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_nh_ctx { u32 idx; }; static struct rtm_dump_nh_ctx * rtm_dump_nh_ctx(struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } static int rtm_dump_walk_nexthops(struct sk_buff *skb, struct netlink_callback *cb, struct rb_root *root, struct rtm_dump_nh_ctx *ctx, int (*nh_cb)(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data), void *data) { struct rb_node *node; int s_idx; int err; s_idx = ctx->idx; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); if (nh->id < s_idx) continue; ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); if (err) return err; } return 0; } static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_dump_filter *filter = data; if (nh_dump_filtered(nh, filter, nhm->nh_family)) return 0; return nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags); } /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb); struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; struct nh_dump_filter filter = {}; int err; err = nh_valid_dump_req(cb->nlh, &filter, cb); if (err < 0) return err; err = rtm_dump_walk_nexthops(skb, cb, root, ctx, &rtm_dump_nexthop_cb, &filter); cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static struct nexthop * nexthop_find_group_resilient(struct net *net, u32 id, struct netlink_ext_ack *extack) { struct nh_group *nhg; struct nexthop *nh; nh = nexthop_find_by_id(net, id); if (!nh) return ERR_PTR(-ENOENT); if (!nh->is_group) { NL_SET_ERR_MSG(extack, "Not a nexthop group"); return ERR_PTR(-EINVAL); } nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) { NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient"); return ERR_PTR(-EINVAL); } return nh; } static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p, struct netlink_ext_ack *extack) { u32 idx; if (attr) { idx = nla_get_u32(attr); if (!idx) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } *nh_id_p = idx; } else { *nh_id_p = 0; } return 0; } static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1, rtm_nh_policy_dump_bucket, NULL); if (err < 0) return err; err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack); if (err) return err; if (tb[NHA_RES_BUCKET]) { size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1; err = nla_parse_nested(res_tb, max, tb[NHA_RES_BUCKET], rtm_nh_res_bucket_policy_dump, cb->extack); if (err < 0) return err; err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID], &filter->res_bucket_nh_id, cb->extack); if (err) return err; } return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_res_bucket_ctx { struct rtm_dump_nh_ctx nh; u16 bucket_index; }; static struct rtm_dump_res_bucket_ctx * rtm_dump_res_bucket_ctx(struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } struct rtm_dump_nexthop_bucket_data { struct rtm_dump_res_bucket_ctx *ctx; struct nh_dump_filter filter; }; static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, struct rtm_dump_nexthop_bucket_data *dd) { u32 portid = NETLINK_CB(cb->skb).portid; struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_res_table *res_table; struct nh_group *nhg; u16 bucket_index; int err; nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); for (bucket_index = dd->ctx->bucket_index; bucket_index < res_table->num_nh_buckets; bucket_index++) { struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; bucket = &res_table->nh_buckets[bucket_index]; nhge = rtnl_dereference(bucket->nh_entry); if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family)) continue; if (dd->filter.res_bucket_nh_id && dd->filter.res_bucket_nh_id != nhge->nh->id) continue; dd->ctx->bucket_index = bucket_index; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); if (err) return err; } dd->ctx->bucket_index = 0; return 0; } static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct rtm_dump_nexthop_bucket_data *dd = data; struct nh_group *nhg; if (!nh->is_group) return 0; nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) return 0; return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd); } /* rtnl */ static int rtm_dump_nexthop_bucket(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb); struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx }; struct net *net = sock_net(skb->sk); struct nexthop *nh; int err; err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb); if (err) return err; if (dd.filter.nh_id) { nh = nexthop_find_group_resilient(net, dd.filter.nh_id, cb->extack); if (IS_ERR(nh)) return PTR_ERR(nh); err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd); } else { struct rb_root *root = &net->nexthop.rb_root; err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh, &rtm_dump_nexthop_bucket_cb, &dd); } cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)]; int err; err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1, res, rtm_nh_res_bucket_policy_get, extack); if (err < 0) return err; if (!tb[NHA_RES_BUCKET_INDEX]) { NL_SET_ERR_MSG(extack, "Bucket index is missing"); return -EINVAL; } *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]); return 0; } static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh, u32 *id, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1, rtm_nh_policy_get_bucket, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, id, NULL, extack); if (err) return err; if (!tb[NHA_RES_BUCKET]) { NL_SET_ERR_MSG(extack, "Bucket information is missing"); return -EINVAL; } err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET], bucket_index, extack); if (err) return err; return 0; } /* rtnl */ static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nh_res_table *res_table; struct sk_buff *skb = NULL; struct nh_group *nhg; struct nexthop *nh; u16 bucket_index; int err; u32 id; err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack); if (err) return err; nh = nexthop_find_group_resilient(net, id, extack); if (IS_ERR(nh)) return PTR_ERR(nh); nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); if (bucket_index >= res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Bucket index out of bounds"); return -ENOENT; } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index], bucket_index, RTM_NEWNEXTHOPBUCKET, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, extack); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: kfree_skb(skb); return err; } static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev == dev) { if (nhi->family == AF_INET) fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, orig_mtu); } } } /* rtnl */ static int nh_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_info_ext *info_ext; switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nexthop_flush_dev(dev, event); break; case NETDEV_CHANGE: if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) nexthop_flush_dev(dev, event); break; case NETDEV_CHANGEMTU: info_ext = ptr; nexthop_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(dev_net(dev)); break; } return NOTIFY_DONE; } static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; static int nexthops_dump(struct net *net, struct notifier_block *nb, enum nexthop_event_type event_type, struct netlink_ext_ack *extack) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; int err = 0; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); err = call_nexthop_notifier(nb, net, event_type, nh, extack); if (err) break; } return err; } int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; rtnl_lock(); err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack); if (err) goto unlock; err = blocking_notifier_chain_register(&net->nexthop.notifier_chain, nb); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(register_nexthop_notifier); int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, nb); if (!err) nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); return err; } EXPORT_SYMBOL(__unregister_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; rtnl_lock(); err = __unregister_nexthop_notifier(net, nb); rtnl_unlock(); return err; } EXPORT_SYMBOL(unregister_nexthop_notifier); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap) { struct nexthop *nexthop; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop) goto out; nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) nexthop->nh_flags |= RTNH_F_OFFLOAD; if (trap) nexthop->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_set_hw_flags); void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, bool offload, bool trap) { struct nh_res_table *res_table; struct nh_res_bucket *bucket; struct nexthop *nexthop; struct nh_group *nhg; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; if (bucket_index >= nhg->res_table->num_nh_buckets) goto out; res_table = rcu_dereference(nhg->res_table); bucket = &res_table->nh_buckets[bucket_index]; bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) bucket->nh_flags |= RTNH_F_OFFLOAD; if (trap) bucket->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, unsigned long *activity) { struct nh_res_table *res_table; struct nexthop *nexthop; struct nh_group *nhg; u16 i; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; /* Instead of silently ignoring some buckets, demand that the sizes * be the same. */ res_table = rcu_dereference(nhg->res_table); if (num_buckets != res_table->num_nh_buckets) goto out; for (i = 0; i < num_buckets; i++) { if (test_bit(i, activity)) nh_res_bucket_set_busy(&res_table->nh_buckets[i]); } out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_res_grp_activity_update); static void __net_exit nexthop_net_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { ASSERT_RTNL_NET(net); flush_all_nexthops(net); } static void __net_exit nexthop_net_exit(struct net *net) { kfree(net->nexthop.devhash); net->nexthop.devhash = NULL; } static int __net_init nexthop_net_init(struct net *net) { size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; net->nexthop.rb_root = RB_ROOT; net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, .exit = nexthop_net_exit, .exit_rtnl = nexthop_net_exit_rtnl, }; static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop, .dumpit = rtm_dump_nexthop}, {.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket, .dumpit = rtm_dump_nexthop_bucket}, {.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP, .dumpit = rtm_dump_nexthop}, {.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP, .dumpit = rtm_dump_nexthop}, }; static int __init nexthop_init(void) { register_pernet_subsys(&nexthop_net_ops); register_netdevice_notifier(&nh_netdev_notifier); rtnl_register_many(nexthop_rtnl_msg_handlers); return 0; } subsys_initcall(nexthop_init);
603 606 398 398 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; static LIST_HEAD(kthreads_hotplug); static DEFINE_MUTEX(kthreads_hotplug_lock); struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; unsigned int node; int started; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; struct task_struct *task; struct list_head hotplug_node; struct cpumask *preferred_affinity; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } /* * Variant of to_kthread() that doesn't assume @p is a kthread. * * Per construction; when: * * (p->flags & PF_KTHREAD) && p->worker_private * * the task is both a kthread and struct kthread is persistent. However * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and * begin_new_exec()). */ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { strscpy(buf, tsk->comm, buf_size); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); INIT_LIST_HEAD(&kthread->hotplug_node); p->vfork_done = &kthread->exited; kthread->task = p; kthread->node = tsk_fork_get_node(current); p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. */ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changin from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; if (!list_empty(&kthread->hotplug_node)) { mutex_lock(&kthreads_hotplug_lock); list_del(&kthread->hotplug_node); mutex_unlock(&kthreads_hotplug_lock); if (kthread->preferred_affinity) { kfree(kthread->preferred_affinity); kthread->preferred_affinity = NULL; } } do_exit(0); } EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. */ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) { const struct cpumask *pref; if (kthread->preferred_affinity) { pref = kthread->preferred_affinity; } else { if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE)) return; pref = cpumask_of_node(kthread->node); } cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD)); if (cpumask_empty(cpumask)) cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); } static void kthread_affine_node(void) { struct kthread *kthread = to_kthread(current); cpumask_var_t affinity; WARN_ON_ONCE(kthread_is_per_cpu(current)); if (kthread->node == NUMA_NO_NODE) { housekeeping_affine(current, HK_TYPE_KTHREAD); } else { if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { WARN_ON_ONCE(1); return; } mutex_lock(&kthreads_hotplug_lock); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); /* * The node cpumask is racy when read from kthread() but: * - a racing CPU going down will either fail on the subsequent * call to set_cpus_allowed_ptr() or be migrated to housekeepers * afterwards by the scheduler. * - a racing CPU going up will be handled by kthreads_online_cpu() */ kthread_fetch_affinity(kthread, affinity); set_cpus_allowed_ptr(current, affinity); mutex_unlock(&kthreads_hotplug_lock); free_cpumask_var(affinity); } } static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); self->started = 1; if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) kthread_affine_node(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { unsigned long flags; if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { struct kthread *kthread = to_kthread(p); __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). * * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. */ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and put its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { static const char comm[TASK_COMM_LEN] = "kthreadd"; struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, comm); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; unsigned long flags; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { WARN_ON(1); return -EINVAL; } WARN_ON_ONCE(kthread->preferred_affinity); if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!kthread->preferred_affinity) { ret = -ENOMEM; goto out; } mutex_lock(&kthreads_hotplug_lock); cpumask_copy(kthread->preferred_affinity, mask); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, affinity); raw_spin_unlock_irqrestore(&p->pi_lock, flags); mutex_unlock(&kthreads_hotplug_lock); out: free_cpumask_var(affinity); return ret; } /* * Re-affine kthreads according to their preferences * and the newly online CPU. The CPU down part is handled * by select_fallback_rq() which default re-affines to * housekeepers from other nodes in case the preferred * affinity doesn't apply anymore. */ static int kthreads_online_cpu(unsigned int cpu) { cpumask_var_t affinity; struct kthread *k; int ret; guard(mutex)(&kthreads_hotplug_lock); if (list_empty(&kthreads_hotplug)) return 0; if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; ret = 0; list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || kthread_is_per_cpu(k->task))) { ret = -EINVAL; continue; } kthread_fetch_affinity(k, affinity); set_cpus_allowed_ptr(k->task, affinity); } free_cpumask_var(affinity); return ret; } static int kthreads_init(void) { return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", kthreads_online_cpu, NULL); } early_initcall(kthreads_init); void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. * * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) { schedule(); } else { /* * Handle the case where the current remains * TASK_INTERRUPTIBLE. try_to_freeze() expects * the current to be TASK_RUNNING. */ __set_current_state(TASK_RUNNING); } try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; worker->flags = flags; worker->task = task; return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_node); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { struct kthread_worker *worker; worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); if (!IS_ERR(worker)) kthread_bind(worker->task, cpu); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_create_worker(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = timer_container_of(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * timer_delete_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); timer_delete_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. This function stores * original thread's cgroup info in current kthread context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif
142 2 3 141 3 1 1 1 55 55 53 1 7 1 36 1 2 1 1 3 36 39 3 63 1 50 40 1 1 36 1 37 1 4 5 2 43 5 10 40 26 17 8 1 57 56 33 23 1 54 22 11 12 1 6 36 18 93 36 4 1 1 1 63 30 22 2 7 18 13 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 // SPDX-License-Identifier: GPL-2.0-only /* * VGIC: KVM DEVICE API * * Copyright (C) 2015 ARM Ltd. * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <linux/irqchip/arm-gic-v3.h> #include <linux/kvm_host.h> #include <kvm/arm_vgic.h> #include <linux/uaccess.h> #include <asm/kvm_mmu.h> #include <asm/cputype.h> #include "vgic.h" /* common helpers */ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, phys_addr_t addr, phys_addr_t alignment, phys_addr_t size) { if (!IS_VGIC_ADDR_UNDEF(ioaddr)) return -EEXIST; if (!IS_ALIGNED(addr, alignment) || !IS_ALIGNED(size, alignment)) return -EINVAL; if (addr + size < addr) return -EINVAL; if (addr & ~kvm_phys_mask(&kvm->arch.mmu) || (addr + size) > kvm_phys_size(&kvm->arch.mmu)) return -E2BIG; return 0; } static int vgic_check_type(struct kvm *kvm, int type_needed) { if (kvm->arch.vgic.vgic_model != type_needed) return -ENODEV; else return 0; } int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { struct vgic_dist *vgic = &kvm->arch.vgic; int r; mutex_lock(&kvm->arch.config_lock); switch (FIELD_GET(KVM_ARM_DEVICE_TYPE_MASK, dev_addr->id)) { case KVM_VGIC_V2_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); if (!r) r = vgic_check_iorange(kvm, vgic->vgic_dist_base, dev_addr->addr, SZ_4K, KVM_VGIC_V2_DIST_SIZE); if (!r) vgic->vgic_dist_base = dev_addr->addr; break; case KVM_VGIC_V2_ADDR_TYPE_CPU: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); if (!r) r = vgic_check_iorange(kvm, vgic->vgic_cpu_base, dev_addr->addr, SZ_4K, KVM_VGIC_V2_CPU_SIZE); if (!r) vgic->vgic_cpu_base = dev_addr->addr; break; default: r = -ENODEV; } mutex_unlock(&kvm->arch.config_lock); return r; } /** * kvm_vgic_addr - set or get vgic VM base addresses * @kvm: pointer to the vm struct * @attr: pointer to the attribute being retrieved/updated * @write: if true set the address in the VM address space, if false read the * address * * Set or get the vgic base addresses for the distributor and the virtual CPU * interface in the VM physical address space. These addresses are properties * of the emulated core/SoC and therefore user space initially knows this * information. * Check them for sanity (alignment, double assignment). We can't check for * overlapping regions in case of a virtual GICv3 here, since we don't know * the number of VCPUs yet, so we defer this check to map_resources(). */ static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool write) { u64 __user *uaddr = (u64 __user *)attr->addr; struct vgic_dist *vgic = &kvm->arch.vgic; phys_addr_t *addr_ptr, alignment, size; u64 undef_value = VGIC_ADDR_UNDEF; u64 addr; int r; /* Reading a redistributor region addr implies getting the index */ if (write || attr->attr == KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION) if (get_user(addr, uaddr)) return -EFAULT; /* * Since we can't hold config_lock while registering the redistributor * iodevs, take the slots_lock immediately. */ mutex_lock(&kvm->slots_lock); switch (attr->attr) { case KVM_VGIC_V2_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); addr_ptr = &vgic->vgic_dist_base; alignment = SZ_4K; size = KVM_VGIC_V2_DIST_SIZE; break; case KVM_VGIC_V2_ADDR_TYPE_CPU: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); addr_ptr = &vgic->vgic_cpu_base; alignment = SZ_4K; size = KVM_VGIC_V2_CPU_SIZE; break; case KVM_VGIC_V3_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); addr_ptr = &vgic->vgic_dist_base; alignment = SZ_64K; size = KVM_VGIC_V3_DIST_SIZE; break; case KVM_VGIC_V3_ADDR_TYPE_REDIST: { struct vgic_redist_region *rdreg; r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); if (r) break; if (write) { r = vgic_v3_set_redist_base(kvm, 0, addr, 0); goto out; } rdreg = list_first_entry_or_null(&vgic->rd_regions, struct vgic_redist_region, list); if (!rdreg) addr_ptr = &undef_value; else addr_ptr = &rdreg->base; break; } case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: { struct vgic_redist_region *rdreg; u8 index; r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); if (r) break; index = addr & KVM_VGIC_V3_RDIST_INDEX_MASK; if (write) { gpa_t base = addr & KVM_VGIC_V3_RDIST_BASE_MASK; u32 count = FIELD_GET(KVM_VGIC_V3_RDIST_COUNT_MASK, addr); u8 flags = FIELD_GET(KVM_VGIC_V3_RDIST_FLAGS_MASK, addr); if (!count || flags) r = -EINVAL; else r = vgic_v3_set_redist_base(kvm, index, base, count); goto out; } rdreg = vgic_v3_rdist_region_from_index(kvm, index); if (!rdreg) { r = -ENOENT; goto out; } addr = index; addr |= rdreg->base; addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; goto out; } default: r = -ENODEV; } if (r) goto out; mutex_lock(&kvm->arch.config_lock); if (write) { r = vgic_check_iorange(kvm, *addr_ptr, addr, alignment, size); if (!r) *addr_ptr = addr; } else { addr = *addr_ptr; } mutex_unlock(&kvm->arch.config_lock); out: mutex_unlock(&kvm->slots_lock); if (!r && !write) r = put_user(addr, uaddr); return r; } static int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r; switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_ADDR: r = kvm_vgic_addr(dev->kvm, attr, true); return (r == -ENODEV) ? -ENXIO : r; case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; u32 val; int ret = 0; if (get_user(val, uaddr)) return -EFAULT; /* * We require: * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs * - at most 1024 interrupts * - a multiple of 32 interrupts */ if (val < (VGIC_NR_PRIVATE_IRQS + 32) || val > VGIC_MAX_RESERVED || (val & 31)) return -EINVAL; mutex_lock(&dev->kvm->arch.config_lock); /* * Either userspace has already configured NR_IRQS or * the vgic has already been initialized and vgic_init() * supplied a default amount of SPIs. */ if (dev->kvm->arch.vgic.nr_spis) ret = -EBUSY; else dev->kvm->arch.vgic.nr_spis = val - VGIC_NR_PRIVATE_IRQS; mutex_unlock(&dev->kvm->arch.config_lock); return ret; } case KVM_DEV_ARM_VGIC_GRP_CTRL: { switch (attr->attr) { case KVM_DEV_ARM_VGIC_CTRL_INIT: mutex_lock(&dev->kvm->arch.config_lock); r = vgic_init(dev->kvm); mutex_unlock(&dev->kvm->arch.config_lock); return r; case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: /* * OK, this one isn't common at all, but we * want to handle all control group attributes * in a single place. */ if (vgic_check_type(dev->kvm, KVM_DEV_TYPE_ARM_VGIC_V3)) return -ENXIO; mutex_lock(&dev->kvm->lock); if (kvm_trylock_all_vcpus(dev->kvm)) { mutex_unlock(&dev->kvm->lock); return -EBUSY; } mutex_lock(&dev->kvm->arch.config_lock); r = vgic_v3_save_pending_tables(dev->kvm); mutex_unlock(&dev->kvm->arch.config_lock); kvm_unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); return r; } break; } } return -ENXIO; } static int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = -ENXIO; switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_ADDR: r = kvm_vgic_addr(dev->kvm, attr, false); return (r == -ENODEV) ? -ENXIO : r; case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; r = put_user(dev->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS, uaddr); break; } } return r; } static int vgic_create(struct kvm_device *dev, u32 type) { return kvm_vgic_create(dev->kvm, type); } static void vgic_destroy(struct kvm_device *dev) { kfree(dev); } int kvm_register_vgic_device(unsigned long type) { int ret = -ENODEV; switch (type) { case KVM_DEV_TYPE_ARM_VGIC_V2: ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2); break; case KVM_DEV_TYPE_ARM_VGIC_V3: ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3); if (ret) break; ret = kvm_vgic_register_its_device(); break; } return ret; } int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr) { int cpuid = FIELD_GET(KVM_DEV_ARM_VGIC_CPUID_MASK, attr->attr); reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; reg_attr->vcpu = kvm_get_vcpu_by_id(dev->kvm, cpuid); if (!reg_attr->vcpu) return -EINVAL; return 0; } /** * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state * * @dev: kvm device handle * @attr: kvm device attribute * @is_write: true if userspace is writing a register */ static int vgic_v2_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, bool is_write) { u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; struct vgic_reg_attr reg_attr; gpa_t addr; struct kvm_vcpu *vcpu; int ret; u32 val; ret = vgic_v2_parse_attr(dev, attr, &reg_attr); if (ret) return ret; vcpu = reg_attr.vcpu; addr = reg_attr.addr; if (is_write) if (get_user(val, uaddr)) return -EFAULT; mutex_lock(&dev->kvm->lock); if (kvm_trylock_all_vcpus(dev->kvm)) { mutex_unlock(&dev->kvm->lock); return -EBUSY; } mutex_lock(&dev->kvm->arch.config_lock); ret = vgic_init(dev->kvm); if (ret) goto out; switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, &val); break; default: ret = -EINVAL; break; } out: mutex_unlock(&dev->kvm->arch.config_lock); kvm_unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); if (!ret && !is_write) ret = put_user(val, uaddr); return ret; } static int vgic_v2_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: return vgic_v2_attr_regs_access(dev, attr, true); default: return vgic_set_common_attr(dev, attr); } } static int vgic_v2_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: return vgic_v2_attr_regs_access(dev, attr, false); default: return vgic_get_common_attr(dev, attr); } } static int vgic_v2_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_ADDR: switch (attr->attr) { case KVM_VGIC_V2_ADDR_TYPE_DIST: case KVM_VGIC_V2_ADDR_TYPE_CPU: return 0; } break; case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: return vgic_v2_has_attr_regs(dev, attr); case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: return 0; case KVM_DEV_ARM_VGIC_GRP_CTRL: switch (attr->attr) { case KVM_DEV_ARM_VGIC_CTRL_INIT: return 0; } } return -ENXIO; } struct kvm_device_ops kvm_arm_vgic_v2_ops = { .name = "kvm-arm-vgic-v2", .create = vgic_create, .destroy = vgic_destroy, .set_attr = vgic_v2_set_attr, .get_attr = vgic_v2_get_attr, .has_attr = vgic_v2_has_attr, }; int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr) { unsigned long vgic_mpidr, mpidr_reg; /* * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group, * attr might not hold MPIDR. Hence assume vcpu0. */ if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) { vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >> KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT; mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr); reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg); } else { reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0); } if (!reg_attr->vcpu) return -EINVAL; reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; return 0; } /* * Allow access to certain ID-like registers prior to VGIC initialization, * thereby allowing the VMM to provision the features / sizing of the VGIC. */ static bool reg_allowed_pre_init(struct kvm_device_attr *attr) { if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) return false; switch (attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK) { case GICD_IIDR: case GICD_TYPER2: return true; default: return false; } } /* * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state * * @dev: kvm device handle * @attr: kvm device attribute * @is_write: true if userspace is writing a register */ static int vgic_v3_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, bool is_write) { struct vgic_reg_attr reg_attr; gpa_t addr; struct kvm_vcpu *vcpu; bool uaccess; u32 val; int ret; ret = vgic_v3_parse_attr(dev, attr, &reg_attr); if (ret) return ret; vcpu = reg_attr.vcpu; addr = reg_attr.addr; switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: /* Sysregs uaccess is performed by the sysreg handling code */ uaccess = false; break; default: uaccess = true; } if (uaccess && is_write) { u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; if (get_user(val, uaddr)) return -EFAULT; } mutex_lock(&dev->kvm->lock); if (kvm_trylock_all_vcpus(dev->kvm)) { mutex_unlock(&dev->kvm->lock); return -EBUSY; } mutex_lock(&dev->kvm->arch.config_lock); if (!(vgic_initialized(dev->kvm) || reg_allowed_pre_init(attr))) { ret = -EBUSY; goto out; } switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: ret = vgic_v3_cpu_sysregs_uaccess(vcpu, attr, is_write); break; case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { unsigned int info, intid; info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT; if (info == VGIC_LEVEL_INFO_LINE_LEVEL) { intid = attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK; ret = vgic_v3_line_level_info_uaccess(vcpu, is_write, intid, &val); } else { ret = -EINVAL; } break; } default: ret = -EINVAL; break; } out: mutex_unlock(&dev->kvm->arch.config_lock); kvm_unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); if (!ret && uaccess && !is_write) { u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; ret = put_user(val, uaddr); } return ret; } static int vgic_v3_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: return vgic_v3_attr_regs_access(dev, attr, true); case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: { u32 __user *uaddr = (u32 __user *)attr->addr; u32 val; if (get_user(val, uaddr)) return -EFAULT; guard(mutex)(&dev->kvm->arch.config_lock); if (vgic_initialized(dev->kvm)) return -EBUSY; if (!irq_is_ppi(val)) return -EINVAL; dev->kvm->arch.vgic.mi_intid = val; return 0; } default: return vgic_set_common_attr(dev, attr); } } static int vgic_v3_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: return vgic_v3_attr_regs_access(dev, attr, false); case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; guard(mutex)(&dev->kvm->arch.config_lock); return put_user(dev->kvm->arch.vgic.mi_intid, uaddr); } default: return vgic_get_common_attr(dev, attr); } } static int vgic_v3_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_ADDR: switch (attr->attr) { case KVM_VGIC_V3_ADDR_TYPE_DIST: case KVM_VGIC_V3_ADDR_TYPE_REDIST: case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: return 0; } break; case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: return vgic_v3_has_attr_regs(dev, attr); case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: return 0; case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) == VGIC_LEVEL_INFO_LINE_LEVEL) return 0; break; } case KVM_DEV_ARM_VGIC_GRP_CTRL: switch (attr->attr) { case KVM_DEV_ARM_VGIC_CTRL_INIT: return 0; case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: return 0; } } return -ENXIO; } struct kvm_device_ops kvm_arm_vgic_v3_ops = { .name = "kvm-arm-vgic-v3", .create = vgic_create, .destroy = vgic_destroy, .set_attr = vgic_v3_set_attr, .get_attr = vgic_v3_get_attr, .has_attr = vgic_v3_has_attr, };
296 295 295 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006 Thomas Gleixner * * This file contains driver APIs to the irq subsystem. */ #define pr_fmt(fmt) "genirq: " fmt #include <linux/irq.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> #include <linux/irqdomain.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/sched/task.h> #include <linux/sched/isolation.h> #include <uapi/linux/sched/types.h> #include <linux/task_work.h> #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); static int __init setup_forced_irqthreads(char *arg) { static_branch_enable(&force_irqthreads_key); return 0; } early_param("threadirqs", setup_forced_irqthreads); #endif static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { struct irq_data *irqd = irq_desc_get_irq_data(desc); bool inprogress; do { /* * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ guard(raw_spinlock_irqsave)(&desc->lock); inprogress = irqd_irq_inprogress(&desc->irq_data); /* * If requested and supported, check at the chip whether it * is in flight at the hardware level, i.e. already pending * in a CPU and waiting for service and acknowledge. */ if (!inprogress && sync_chip) { /* * Ignore the return code. inprogress is only updated * when the chip supports it. */ __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, &inprogress); } /* Oops, that failed? */ } while (inprogress); } /** * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) * @irq: interrupt number to wait for * * This function waits for any pending hard IRQ handlers for this interrupt * to complete before returning. If you use this function while holding a * resource the IRQ handler may need you will deadlock. It does not take * associated threaded handlers into account. * * Do not use this for shutdown scenarios where you must be sure that all * parts (hardirq and threaded handler) have completed. * * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. * * It does not check whether there is an interrupt in flight at the * hardware level, but not serviced yet, as this might deadlock when called * with interrupts disabled and the target CPU of the interrupt is the * current CPU. */ bool synchronize_hardirq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { __synchronize_hardirq(desc, false); return !atomic_read(&desc->threads_active); } return true; } EXPORT_SYMBOL(synchronize_hardirq); static void __synchronize_irq(struct irq_desc *desc) { __synchronize_hardirq(desc, true); /* * We made sure that no hardirq handler is running. Now verify that no * threaded handlers are active. */ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); } /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for * * This function waits for any pending IRQ handlers for this interrupt to * complete before returning. If you use this function while holding a * resource the IRQ handler may need you will deadlock. * * Can only be called from preemptible code as it might sleep when * an interrupt thread is associated to @irq. * * It optionally makes sure (when the irq chip supports that method) * that the interrupt is not pending in any CPU and waiting for * service. */ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) __synchronize_irq(desc); } EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return false; return true; } /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check * */ int irq_can_set_affinity(unsigned int irq) { return __irq_can_set_affinity(irq_to_desc(irq)); } /** * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space * @irq: Interrupt to check * * Like irq_can_set_affinity() above, but additionally checks for the * AFFINITY_MANAGED flag. */ bool irq_can_set_affinity_usr(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); return __irq_can_set_affinity(desc) && !irqd_affinity_is_managed(&desc->irq_data); } /** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affinity changed * * Just set IRQTF_AFFINITY and delegate the affinity setting to the * interrupt thread itself. We can not call set_cpus_allowed_ptr() here as * we hold desc->lock and this code can be called from hard interrupt * context. */ static void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; for_each_action_of_desc(desc, action) { if (action->thread) { set_bit(IRQTF_AFFINITY, &action->thread_flags); wake_up_process(action->thread); } if (action->secondary && action->secondary->thread) { set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags); wake_up_process(action->secondary->thread); } } } #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static void irq_validate_effective_affinity(struct irq_data *data) { const struct cpumask *m = irq_data_get_effective_affinity_mask(data); struct irq_chip *chip = irq_data_get_irq_chip(data); if (!cpumask_empty(m)) return; pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", chip->name, data->irq); } #else static inline void irq_validate_effective_affinity(struct irq_data *data) { } #endif static DEFINE_PER_CPU(struct cpumask, __tmp_mask); int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask); struct irq_desc *desc = irq_data_to_desc(data); struct irq_chip *chip = irq_data_get_irq_chip(data); const struct cpumask *prog_mask; int ret; if (!chip || !chip->irq_set_affinity) return -EINVAL; /* * If this is a managed interrupt and housekeeping is enabled on * it check whether the requested affinity mask intersects with * a housekeeping CPU. If so, then remove the isolated CPUs from * the mask and just keep the housekeeping CPU(s). This prevents * the affinity setter from routing the interrupt to an isolated * CPU to avoid that I/O submitted from a housekeeping CPU causes * interrupts on an isolated one. * * If the masks do not intersect or include online CPU(s) then * keep the requested mask. The isolated target CPUs are only * receiving interrupts when the I/O operation was submitted * directly from them. * * If all housekeeping CPUs in the affinity mask are offline, the * interrupt will be migrated by the CPU hotplug code once a * housekeeping CPU which belongs to the affinity mask comes * online. */ if (irqd_affinity_is_managed(data) && housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) { const struct cpumask *hk_mask; hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); cpumask_and(tmp_mask, mask, hk_mask); if (!cpumask_intersects(tmp_mask, cpu_online_mask)) prog_mask = mask; else prog_mask = tmp_mask; } else { prog_mask = mask; } /* * Make sure we only provide online CPUs to the irqchip, * unless we are being asked to force the affinity (in which * case we do as we are told). */ cpumask_and(tmp_mask, prog_mask, cpu_online_mask); if (!force && !cpumask_empty(tmp_mask)) ret = chip->irq_set_affinity(data, tmp_mask, force); else if (force) ret = chip->irq_set_affinity(data, mask, force); else ret = -EINVAL; switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); fallthrough; case IRQ_SET_MASK_OK_NOCOPY: irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); ret = 0; } return ret; } #ifdef CONFIG_GENERIC_PENDING_IRQ static inline int irq_set_affinity_pending(struct irq_data *data, const struct cpumask *dest) { struct irq_desc *desc = irq_data_to_desc(data); irqd_set_move_pending(data); irq_copy_pending(desc, dest); return 0; } #else static inline int irq_set_affinity_pending(struct irq_data *data, const struct cpumask *dest) { return -EBUSY; } #endif static int irq_try_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { int ret = irq_do_set_affinity(data, dest, force); /* * In case that the underlying vector management is busy and the * architecture supports the generic pending mechanism then utilize * this to avoid returning an error to user space. */ if (ret == -EBUSY && !force) ret = irq_set_affinity_pending(data, dest); return ret; } static bool irq_set_affinity_deactivated(struct irq_data *data, const struct cpumask *mask) { struct irq_desc *desc = irq_data_to_desc(data); /* * Handle irq chips which can handle affinity only in activated * state correctly * * If the interrupt is not yet activated, just store the affinity * mask and do not call the chip driver at all. On activation the * driver has to make sure anyway that the interrupt is in a * usable state so startup works. */ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data) || !irqd_affinity_on_activate(data)) return false; cpumask_copy(desc->irq_common_data.affinity, mask); irq_data_update_effective_affinity(data, mask); irqd_set(data, IRQD_AFFINITY_SET); return true; } int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { struct irq_chip *chip = irq_data_get_irq_chip(data); struct irq_desc *desc = irq_data_to_desc(data); int ret = 0; if (!chip || !chip->irq_set_affinity) return -EINVAL; if (irq_set_affinity_deactivated(data, mask)) return 0; if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { ret = irq_try_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); } if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); if (!schedule_work(&desc->affinity_notify->work)) { /* Work was already scheduled, drop our extra ref */ kref_put(&desc->affinity_notify->kref, desc->affinity_notify->release); } } irqd_set(data, IRQD_AFFINITY_SET); return ret; } /** * irq_update_affinity_desc - Update affinity management for an interrupt * @irq: The interrupt number to update * @affinity: Pointer to the affinity descriptor * * This interface can be used to configure the affinity management of * interrupts which have been allocated already. * * There are certain limitations on when it may be used - attempts to use it * for when the kernel is configured for generic IRQ reservation mode (in * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with * managed/non-managed interrupt accounting. In addition, attempts to use it on * an interrupt which is already started or which has already been configured * as managed will also fail, as these mean invalid init state or double init. */ int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { /* * Supporting this with the reservation scheme used by x86 needs * some more thought. Fail it for now. */ if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) return -EOPNOTSUPP; scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_desc *desc = scoped_irqdesc; bool activated; /* Requires the interrupt to be shut down */ if (irqd_is_started(&desc->irq_data)) return -EBUSY; /* Interrupts which are already managed cannot be modified */ if (irqd_affinity_is_managed(&desc->irq_data)) return -EBUSY; /* * Deactivate the interrupt. That's required to undo * anything an earlier activation has established. */ activated = irqd_is_activated(&desc->irq_data); if (activated) irq_domain_deactivate_irq(&desc->irq_data); if (affinity->is_managed) { irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); } cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); /* Restore the activation state */ if (activated) irq_domain_activate_irq(&desc->irq_data, false); return 0; } return -EINVAL; } static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); if (!desc) return -EINVAL; guard(raw_spinlock_irqsave)(&desc->lock); return irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); } /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * * Fails if cpumask does not contain an online CPU */ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { return __irq_set_affinity(irq, cpumask, false); } EXPORT_SYMBOL_GPL(irq_set_affinity); /** * irq_force_affinity - Force the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * * Same as irq_set_affinity, but without checking the mask against * online cpus. * * Solely for low level cpu hotplug code, where we need to make per * cpu interrupts affine before the cpu becomes online. */ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) { return __irq_set_affinity(irq, cpumask, true); } EXPORT_SYMBOL_GPL(irq_force_affinity); int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity) { int ret = -EINVAL; scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { scoped_irqdesc->affinity_hint = m; ret = 0; } if (!ret && m && setaffinity) __irq_set_affinity(irq, m, false); return ret; } EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { struct irq_affinity_notify *notify = container_of(work, struct irq_affinity_notify, work); struct irq_desc *desc = irq_to_desc(notify->irq); cpumask_var_t cpumask; if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) goto out; scoped_guard(raw_spinlock_irqsave, &desc->lock) { if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else cpumask_copy(cpumask, desc->irq_common_data.affinity); } notify->notify(notify, cpumask); free_cpumask_var(cpumask); out: kref_put(&notify->kref, notify->release); } /** * irq_set_affinity_notifier - control notification of IRQ affinity changes * @irq: Interrupt for which to enable/disable notification * @notify: Context for notification, or %NULL to disable * notification. Function pointers must be initialised; * the other fields will be initialised by this function. * * Must be called in process context. Notification may only be enabled * after the IRQ is allocated and must be disabled before the IRQ is freed * using free_irq(). */ int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { struct irq_desc *desc = irq_to_desc(irq); struct irq_affinity_notify *old_notify; /* The release function is promised process context */ might_sleep(); if (!desc || irq_is_nmi(desc)) return -EINVAL; /* Complete initialisation of *notify */ if (notify) { notify->irq = irq; kref_init(&notify->kref); INIT_WORK(&notify->work, irq_affinity_notify); } scoped_guard(raw_spinlock_irqsave, &desc->lock) { old_notify = desc->affinity_notify; desc->affinity_notify = notify; } if (old_notify) { if (cancel_work_sync(&old_notify->work)) { /* Pending work had a ref, put that one too */ kref_put(&old_notify->kref, old_notify->release); } kref_put(&old_notify->kref, old_notify->release); } return 0; } EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. */ int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; int node = irq_desc_get_node(desc); static DEFINE_RAW_SPINLOCK(mask_lock); static struct cpumask mask; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!__irq_can_set_affinity(desc)) return 0; guard(raw_spinlock)(&mask_lock); /* * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online. */ if (irqd_affinity_is_managed(&desc->irq_data) || irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) set = desc->irq_common_data.affinity; else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } cpumask_and(&mask, cpu_online_mask, set); if (cpumask_empty(&mask)) cpumask_copy(&mask, cpu_online_mask); if (node != NUMA_NO_NODE) { const struct cpumask *nodemask = cpumask_of_node(node); /* make sure at least one of the cpus in nodemask is online */ if (cpumask_intersects(&mask, nodemask)) cpumask_and(&mask, &mask, nodemask); } return irq_do_set_affinity(&desc->irq_data, &mask, false); } #else /* Wrapper for ALPHA specific affinity selector magic */ int irq_setup_affinity(struct irq_desc *desc) { return irq_select_affinity(irq_desc_get_irq(desc)); } #endif /* CONFIG_AUTO_IRQ_AFFINITY */ #endif /* CONFIG_SMP */ /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt * @irq: interrupt number to set affinity * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU * specific data for percpu_devid interrupts * * This function uses the vCPU specific data to set the vCPU affinity for * an irq. The vCPU specific data is passed from outside, such as KVM. One * example code path is as below: KVM -> IOMMU -> irq_set_vcpu_affinity(). */ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) { scoped_irqdesc_get_and_lock(irq, 0) { struct irq_desc *desc = scoped_irqdesc; struct irq_data *data; struct irq_chip *chip; data = irq_desc_get_irq_data(desc); do { chip = irq_data_get_irq_chip(data); if (chip && chip->irq_set_vcpu_affinity) break; data = irqd_get_parent_data(data); } while (data); if (!data) return -ENOSYS; return chip->irq_set_vcpu_affinity(data, vcpu_info); } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); void __disable_irq(struct irq_desc *desc) { if (!desc->depth++) irq_disable(desc); } static int __disable_irq_nosync(unsigned int irq) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { __disable_irq(scoped_irqdesc); return 0; } return -EINVAL; } /** * disable_irq_nosync - disable an irq without waiting * @irq: Interrupt to disable * * Disable the selected interrupt line. Disables and Enables are * nested. * Unlike disable_irq(), this function does not ensure existing * instances of the IRQ handler have completed before returning. * * This function may be called from IRQ context. */ void disable_irq_nosync(unsigned int irq) { __disable_irq_nosync(irq); } EXPORT_SYMBOL(disable_irq_nosync); /** * disable_irq - disable an irq and wait for completion * @irq: Interrupt to disable * * Disable the selected interrupt line. Enables and Disables are nested. * * This function waits for any pending IRQ handlers for this interrupt to * complete before returning. If you use this function while holding a * resource the IRQ handler may need you will deadlock. * * Can only be called from preemptible code as it might sleep when an * interrupt thread is associated to @irq. * */ void disable_irq(unsigned int irq) { might_sleep(); if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } EXPORT_SYMBOL(disable_irq); /** * disable_hardirq - disables an irq and waits for hardirq completion * @irq: Interrupt to disable * * Disable the selected interrupt line. Enables and Disables are nested. * * This function waits for any pending hard IRQ handlers for this interrupt * to complete before returning. If you use this function while holding a * resource the hard IRQ handler may need you will deadlock. * * When used to optimistically disable an interrupt from atomic context the * return value must be checked. * * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. */ bool disable_hardirq(unsigned int irq) { if (!__disable_irq_nosync(irq)) return synchronize_hardirq(irq); return false; } EXPORT_SYMBOL_GPL(disable_hardirq); /** * disable_nmi_nosync - disable an nmi without waiting * @irq: Interrupt to disable * * Disable the selected interrupt line. Disables and enables are nested. * * The interrupt to disable must have been requested through request_nmi. * Unlike disable_nmi(), this function does not ensure existing * instances of the IRQ handler have completed before returning. */ void disable_nmi_nosync(unsigned int irq) { disable_irq_nosync(irq); } void __enable_irq(struct irq_desc *desc) { switch (desc->depth) { case 0: err_out: WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq_desc_get_irq(desc)); break; case 1: { if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ irq_settings_set_noprobe(desc); /* * Call irq_startup() not irq_enable() here because the * interrupt might be marked NOAUTOEN so irq_startup() * needs to be invoked when it gets enabled the first time. * This is also required when __enable_irq() is invoked for * a managed and shutdown interrupt from the S3 resume * path. * * If it was already started up, then irq_startup() will * invoke irq_enable() under the hood. */ irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); break; } default: desc->depth--; } } /** * enable_irq - enable handling of an irq * @irq: Interrupt to enable * * Undoes the effect of one call to disable_irq(). If this matches the * last disable, processing of interrupts on this IRQ line is re-enabled. * * This function may be called from IRQ context only when * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq)) return; __enable_irq(desc); } } EXPORT_SYMBOL(enable_irq); /** * enable_nmi - enable handling of an nmi * @irq: Interrupt to enable * * The interrupt to enable must have been requested through request_nmi. * Undoes the effect of one call to disable_nmi(). If this matches the last * disable, processing of interrupts on this IRQ line is re-enabled. */ void enable_nmi(unsigned int irq) { enable_irq(irq); } static int set_irq_wake_real(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE) return 0; if (desc->irq_data.chip->irq_set_wake) ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); return ret; } /** * irq_set_irq_wake - control irq power management wakeup * @irq: interrupt to control * @on: enable/disable power management wakeup * * Enable/disable power management wakeup mode, which is disabled by * default. Enables and disables must match, just as they match for * non-wakeup mode support. * * Wakeup mode lets this IRQ wake the system from sleep states like * "suspend to RAM". * * Note: irq enable/disable state is completely orthogonal to the * enable/disable state of irq wake. An irq can be disabled with * disable_irq() and still wake the system as long as the irq has wake * enabled. If this does not hold, then the underlying irq chip and the * related driver need to be investigated. */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; int ret = 0; /* Don't use NMIs as wake up interrupts please */ if (irq_is_nmi(desc)) return -EINVAL; /* * wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 0; else irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); } } else { if (desc->wake_depth == 0) { WARN(1, "Unbalanced IRQ %d wake disable\n", irq); } else if (--desc->wake_depth == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 1; else irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } return ret; } return -EINVAL; } EXPORT_SYMBOL(irq_set_irq_wake); /* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available * for driver use. */ bool can_request_irq(unsigned int irq, unsigned long irqflags) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; if (irq_settings_can_request(desc)) { if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED) return true; } } return false; } int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) { struct irq_chip *chip = desc->irq_data.chip; int ret, unmask = 0; if (!chip || !chip->irq_set_type) { /* * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ pr_debug("No set_type function for IRQ %d (%s)\n", irq_desc_get_irq(desc), chip ? (chip->name ? : "unknown") : "unknown"); return 0; } if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); if (!irqd_irq_disabled(&desc->irq_data)) unmask = 1; } /* Mask all flags except trigger mode */ flags &= IRQ_TYPE_SENSE_MASK; ret = chip->irq_set_type(&desc->irq_data, flags); switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); fallthrough; case IRQ_SET_MASK_OK_NOCOPY: flags = irqd_get_trigger_type(&desc->irq_data); irq_settings_set_trigger_mask(desc, flags); irqd_clear(&desc->irq_data, IRQD_LEVEL); irq_settings_clr_level(desc); if (flags & IRQ_TYPE_LEVEL_MASK) { irq_settings_set_level(desc); irqd_set(&desc->irq_data, IRQD_LEVEL); } ret = 0; break; default: pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) unmask_irq(desc); return ret; } #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq) { scoped_irqdesc_get_and_lock(irq, 0) { scoped_irqdesc->parent_irq = parent_irq; return 0; } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_parent); #endif /* * Default primary interrupt handler for threaded interrupts. Is * assigned as primary handler when request_threaded_irq is called * with handler == NULL. Useful for oneshot interrupts. */ static irqreturn_t irq_default_primary_handler(int irq, void *dev_id) { return IRQ_WAKE_THREAD; } /* * Primary handler for nested threaded interrupts. Should never be * called. */ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) { WARN(1, "Primary handler called for nested irq %d\n", irq); return IRQ_NONE; } static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) { WARN(1, "Secondary action handler called for irq %d\n", irq); return IRQ_NONE; } #ifdef CONFIG_SMP /* * Check whether we need to change the affinity of the interrupt thread. */ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; bool valid = false; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; __set_current_state(TASK_RUNNING); /* * In case we are out of memory we set IRQTF_AFFINITY again and * try again next time */ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { set_bit(IRQTF_AFFINITY, &action->thread_flags); return; } scoped_guard(raw_spinlock_irq, &desc->lock) { /* * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. */ if (cpumask_available(desc->irq_common_data.affinity)) { const struct cpumask *m; m = irq_data_get_effective_affinity_mask(&desc->irq_data); cpumask_copy(mask, m); valid = true; } } if (valid) set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif static int irq_wait_for_interrupt(struct irq_desc *desc, struct irqaction *action) { for (;;) { set_current_state(TASK_INTERRUPTIBLE); irq_thread_check_affinity(desc, action); if (kthread_should_stop()) { /* may need to run one last time */ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; } __set_current_state(TASK_RUNNING); return -1; } if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; } schedule(); } } /* * Oneshot interrupts keep the irq line masked until the threaded * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. */ static void irq_finalize_oneshot(struct irq_desc *desc, struct irqaction *action) { if (!(desc->istate & IRQS_ONESHOT) || action->handler == irq_forced_secondary_handler) return; again: chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); /* * Implausible though it may be we need to protect us against * the following scenario: * * The thread is faster done than the hard interrupt handler * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due * to IRQS_INPROGRESS and the irq line is masked forever. * * This also serializes the state of shared oneshot handlers * versus "desc->threads_oneshot |= action->thread_mask;" in * irq_wake_thread(). See the comment there which explains the * serialization. */ if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); goto again; } /* * Now check again, whether the thread should run. Otherwise * we would clear the threads_oneshot bit of this thread which * was just set. */ if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) goto out_unlock; desc->threads_oneshot &= ~action->thread_mask; if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data)) unmask_threaded_irq(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); } /* * Interrupts explicitly requested as threaded interrupts want to be * preemptible - many of them need to sleep and wait for slow busses to * complete. */ static irqreturn_t irq_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret = action->thread_fn(action->irq, action->dev_id); if (ret == IRQ_HANDLED) atomic_inc(&desc->threads_handled); irq_finalize_oneshot(desc, action); return ret; } /* * Interrupts which are not explicitly requested as threaded * interrupts rely on the implicit bh/preempt disable of the hard irq * context. So we need to disable bh here to avoid deadlocks and other * side effects. */ static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret; local_bh_disable(); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_disable(); ret = irq_thread_fn(desc, action); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_irq_enable(); local_bh_enable(); return ret; } void wake_threads_waitq(struct irq_desc *desc) { if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } static void irq_thread_dtor(struct callback_head *unused) { struct task_struct *tsk = current; struct irq_desc *desc; struct irqaction *action; if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) return; action = kthread_data(tsk); pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm, tsk->pid, action->irq); desc = irq_to_desc(action->irq); /* * If IRQTF_RUNTHREAD is set, we need to decrement * desc->threads_active and wake possible waiters. */ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) wake_threads_waitq(desc); /* Prevent a stale desc->threads_oneshot */ irq_finalize_oneshot(desc, action); } static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) { struct irqaction *secondary = action->secondary; if (WARN_ON_ONCE(!secondary)) return; guard(raw_spinlock_irq)(&desc->lock); __irq_wake_thread(desc, secondary); } /* * Internal function to notify that a interrupt thread is ready. */ static void irq_thread_set_ready(struct irq_desc *desc, struct irqaction *action) { set_bit(IRQTF_READY, &action->thread_flags); wake_up(&desc->wait_for_threads); } /* * Internal function to wake up a interrupt thread and wait until it is * ready. */ static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc, struct irqaction *action) { if (!action || !action->thread) return; wake_up_process(action->thread); wait_event(desc->wait_for_threads, test_bit(IRQTF_READY, &action->thread_flags)); } /* * Interrupt handler thread */ static int irq_thread(void *data) { struct callback_head on_exit_work; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); irq_thread_set_ready(desc, action); sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) handler_fn = irq_forced_thread_fn; else handler_fn = irq_thread_fn; init_task_work(&on_exit_work, irq_thread_dtor); task_work_add(current, &on_exit_work, TWA_NONE); while (!irq_wait_for_interrupt(desc, action)) { irqreturn_t action_ret; action_ret = handler_fn(desc, action); if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); wake_threads_waitq(desc); } /* * This is the regular exit path. __free_irq() is stopping the * thread via kthread_stop() after calling * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the * oneshot mask bit can be set. */ task_work_cancel_func(current, irq_thread_dtor); return 0; } /** * irq_wake_thread - wake the irq thread for the action identified by dev_id * @irq: Interrupt line * @dev_id: Device identity for which the thread should be woken */ void irq_wake_thread(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return; guard(raw_spinlock_irqsave)(&desc->lock); for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); break; } } } EXPORT_SYMBOL_GPL(irq_wake_thread); static int irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads()) return 0; if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) return 0; /* * No further action required for interrupts which are requested as * threaded interrupts already */ if (new->handler == irq_default_primary_handler) return 0; new->flags |= IRQF_ONESHOT; /* * Handle the case where we have a real primary handler and a * thread handler. We force thread them as well by creating a * secondary action. */ if (new->handler && new->thread_fn) { /* Allocate the secondary action */ new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!new->secondary) return -ENOMEM; new->secondary->handler = irq_forced_secondary_handler; new->secondary->thread_fn = new->thread_fn; new->secondary->dev_id = new->dev_id; new->secondary->irq = new->irq; new->secondary->name = new->name; } /* Deal with the primary handler */ set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); new->thread_fn = new->handler; new->handler = irq_default_primary_handler; return 0; } static int irq_request_resources(struct irq_desc *desc) { struct irq_data *d = &desc->irq_data; struct irq_chip *c = d->chip; return c->irq_request_resources ? c->irq_request_resources(d) : 0; } static void irq_release_resources(struct irq_desc *desc) { struct irq_data *d = &desc->irq_data; struct irq_chip *c = d->chip; if (c->irq_release_resources) c->irq_release_resources(d); } static bool irq_supports_nmi(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /* Only IRQs directly managed by the root irqchip can be set as NMI */ if (d->parent_data) return false; #endif /* Don't support NMIs for chips behind a slow bus */ if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock) return false; return d->chip->flags & IRQCHIP_SUPPORTS_NMI; } static int irq_nmi_setup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *c = d->chip; return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL; } static void irq_nmi_teardown(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *c = d->chip; if (c->irq_nmi_teardown) c->irq_nmi_teardown(d); } static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { struct task_struct *t; if (!secondary) { t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name); } else { t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, new->name); } if (IS_ERR(t)) return PTR_ERR(t); /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code * references an already freed task_struct. */ new->thread = get_task_struct(t); /* * Tell the thread to set its affinity. This is * important for shared interrupt handlers as we do * not invoke setup_affinity() for the secondary * handlers as everything is already set up. Even for * interrupts marked with IRQF_NO_BALANCE this is * correct as we want the thread to move to the cpu(s) * on which the requesting code placed the interrupt. */ set_bit(IRQTF_AFFINITY, &new->thread_flags); return 0; } /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. * * Locking rules: * * desc->request_mutex Provides serialization against a concurrent free_irq() * chip_bus_lock Provides serialization for slow bus operations * desc->lock Provides serialization against hard interrupts * * chip_bus_lock and desc->lock are sufficient for all other management and * interrupt related functions. desc->request_mutex solely serializes * request/free_irq(). */ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; if (!desc) return -EINVAL; if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; if (!try_module_get(desc->owner)) return -ENODEV; new->irq = irq; /* * If the trigger type is not specified by the caller, * then use the default for this interrupt. */ if (!(new->flags & IRQF_TRIGGER_MASK)) new->flags |= irqd_get_trigger_type(&desc->irq_data); /* * Check whether the interrupt nests into another interrupt * thread. */ nested = irq_settings_is_nested_thread(desc); if (nested) { if (!new->thread_fn) { ret = -EINVAL; goto out_mput; } /* * Replace the primary handler which was provided from * the driver for non nested interrupt handling by the * dummy function which warns when called. */ new->handler = irq_nested_primary_handler; } else { if (irq_settings_can_thread(desc)) { ret = irq_setup_forced_threading(new); if (ret) goto out_mput; } } /* * Create a handler thread when a thread function is supplied * and the interrupt does not nest into another interrupt * thread. */ if (new->thread_fn && !nested) { ret = setup_irq_thread(new, irq, false); if (ret) goto out_mput; if (new->secondary) { ret = setup_irq_thread(new->secondary, irq, true); if (ret) goto out_thread; } } /* * Drivers are often written to work w/o knowledge about the * underlying irq chip implementation, so a request for a * threaded irq without a primary hard irq context handler * requires the ONESHOT flag to be set. Some irq chips like * MSI based interrupts are per se one shot safe. Check the * chip flags, so we can avoid the unmask dance at the end of * the threaded handler for those. */ if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; /* * Protects against a concurrent __free_irq() call which might wait * for synchronize_hardirq() to complete without holding the optional * chip bus lock and desc->lock. Also protects against handing out * a recycled oneshot thread_mask bit while it's still in use by * its previous owner. */ mutex_lock(&desc->request_mutex); /* * Acquire bus lock as the irq_request_resources() callback below * might rely on the serialization or the magic power management * functions which are abusing the irq_bus_lock() callback, */ chip_bus_lock(desc); /* First installed action requests resources. */ if (!desc->action) { ret = irq_request_resources(desc); if (ret) { pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", new->name, irq, desc->irq_data.chip->name); goto out_bus_unlock; } } /* * The following block of code has to be executed atomically * protected against a concurrent interrupt and any of the other * management calls which are not serialized via * desc->request_mutex or the optional bus lock. */ raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; old = *old_ptr; if (old) { /* * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag * fields must have IRQF_SHARED set and the bits which * set the trigger type must match. Also all must * agree on ONESHOT. * Interrupt lines used for NMIs cannot be shared. */ unsigned int oldtype; if (irq_is_nmi(desc)) { pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", new->name, irq, desc->irq_data.chip->name); ret = -EINVAL; goto out_unlock; } /* * If nobody did set the configuration before, inherit * the one provided by the requester. */ if (irqd_trigger_type_was_set(&desc->irq_data)) { oldtype = irqd_get_trigger_type(&desc->irq_data); } else { oldtype = new->flags & IRQF_TRIGGER_MASK; irqd_set_trigger_type(&desc->irq_data, oldtype); } if (!((old->flags & new->flags) & IRQF_SHARED) || (oldtype != (new->flags & IRQF_TRIGGER_MASK))) goto mismatch; if ((old->flags & IRQF_ONESHOT) && (new->flags & IRQF_COND_ONESHOT)) new->flags |= IRQF_ONESHOT; else if ((old->flags ^ new->flags) & IRQF_ONESHOT) goto mismatch; /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) goto mismatch; /* add new interrupt at end of irq queue */ do { /* * Or all existing action->thread_mask bits, * so we can find the next zero bit for this * new action. */ thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } /* * Setup the thread mask for this irqaction for ONESHOT. For * !ONESHOT irqs the thread mask is 0 so we can avoid a * conditional in irq_wake_thread(). */ if (new->flags & IRQF_ONESHOT) { /* * Unlikely to have 32 resp 64 irqs sharing one line, * but who knows. */ if (thread_mask == ~0UL) { ret = -EBUSY; goto out_unlock; } /* * The thread_mask for the action is or'ed to * desc->thread_active to indicate that the * IRQF_ONESHOT thread handler has been woken, but not * yet finished. The bit is cleared when a thread * completes. When all threads of a shared interrupt * line have completed desc->threads_active becomes * zero and the interrupt line is unmasked. See * handle.c:irq_wake_thread() for further information. * * If no thread is woken by primary (hard irq context) * interrupt handlers, then desc->threads_active is * also checked for zero to unmask the irq line in the * affected hard irq flow handlers * (handle_[fasteoi|level]_irq). * * The new action gets the first zero bit of * thread_mask assigned. See the loop above which or's * all existing action->thread_mask bits. */ new->thread_mask = 1UL << ffz(thread_mask); } else if (new->handler == irq_default_primary_handler && !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { /* * The interrupt was requested with handler = NULL, so * we use the default primary handler for it. But it * does not have the oneshot flag set. In combination * with level interrupts this is deadly, because the * default primary handler just wakes the thread, then * the irq lines is reenabled, but the device still * has the level irq asserted. Rinse and repeat.... * * While this works for edge type interrupts, we play * it safe and reject unconditionally because we can't * say for sure which type this interrupt really * has. The type flags are unreliable as the * underlying chip implementation can override them. */ pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n", new->name, irq); ret = -EINVAL; goto out_unlock; } if (!shared) { /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); if (ret) goto out_unlock; } /* * Activate the interrupt. That activation must happen * independently of IRQ_NOAUTOEN. request_irq() can fail * and the callers are supposed to handle * that. enable_irq() of an interrupt requested with * IRQ_NOAUTOEN is not supposed to fail. The activation * keeps it in shutdown mode, it merily associates * resources if necessary and if that's not possible it * fails. Interrupts which are in managed shutdown mode * will simply ignore that activation request. */ ret = irq_activate(desc); if (ret) goto out_unlock; desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_ONESHOT | IRQS_WAITING); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); if (new->flags & IRQF_PERCPU) { irqd_set(&desc->irq_data, IRQD_PER_CPU); irq_settings_set_per_cpu(desc); if (new->flags & IRQF_NO_DEBUG) irq_settings_set_no_debug(desc); } if (noirqdebug) irq_settings_set_no_debug(desc); if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; /* Exclude IRQ from balancing if requested */ if (new->flags & IRQF_NOBALANCING) { irq_settings_set_no_balancing(desc); irqd_set(&desc->irq_data, IRQD_NO_BALANCING); } if (!(new->flags & IRQF_NO_AUTOEN) && irq_settings_can_autoenable(desc)) { irq_startup(desc, IRQ_RESEND, IRQ_START_COND); } else { /* * Shared interrupts do not go well with disabling * auto enable. The sharing interrupt might request * it while it's still disabled and then wait for * interrupts forever. */ WARN_ON_ONCE(new->flags & IRQF_SHARED); /* Undo nested disables: */ desc->depth = 1; } } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); if (nmsk != omsk) /* hope the handler works with current trigger mode */ pr_warn("irq %d uses trigger mode %u; requested %u\n", irq, omsk, nmsk); } *old_ptr = new; irq_pm_install_action(desc, new); /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; /* * Check whether we disabled the irq via the spurious handler * before. Reenable it and give it another chance. */ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { desc->istate &= ~IRQS_SPURIOUS_DISABLED; __enable_irq(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); irq_setup_timings(desc, new); wake_up_and_wait_for_irq_thread_ready(desc, new); wake_up_and_wait_for_irq_thread_ready(desc, new->secondary); register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); return 0; mismatch: if (!(new->flags & IRQF_PROBE_SHARED)) { pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", irq, new->flags, new->name, old->flags, old->name); #ifdef CONFIG_DEBUG_SHIRQ dump_stack(); #endif } ret = -EBUSY; out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); if (!desc->action) irq_release_resources(desc); out_bus_unlock: chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); out_thread: if (new->thread) { struct task_struct *t = new->thread; new->thread = NULL; kthread_stop_put(t); } if (new->secondary && new->secondary->thread) { struct task_struct *t = new->secondary->thread; new->secondary->thread = NULL; kthread_stop_put(t); } out_mput: module_put(desc->owner); return ret; } /* * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. */ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) { unsigned irq = desc->irq_data.irq; struct irqaction *action, **action_ptr; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* * There can be multiple actions per IRQ descriptor, find the right * one based on the dev_id: */ action_ptr = &desc->action; for (;;) { action = *action_ptr; if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); return NULL; } if (action->dev_id == dev_id) break; action_ptr = &action->next; } /* Found it - now remove it from the list of entries: */ *action_ptr = action->next; irq_pm_remove_action(desc, action); /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_settings_clr_disable_unlazy(desc); /* Only shutdown. Deactivate after synchronize_hardirq() */ irq_shutdown(desc); } #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ if (WARN_ON_ONCE(desc->affinity_hint)) desc->affinity_hint = NULL; #endif raw_spin_unlock_irqrestore(&desc->lock, flags); /* * Drop bus_lock here so the changes which were done in the chip * callbacks above are synced out to the irq chips which hang * behind a slow bus (I2C, SPI) before calling synchronize_hardirq(). * * Aside of that the bus_lock can also be taken from the threaded * handler in irq_finalize_oneshot() which results in a deadlock * because kthread_stop() would wait forever for the thread to * complete, which is blocked on the bus lock. * * The still held desc->request_mutex() protects against a * concurrent request_irq() of this irq so the release of resources * and timing data is properly serialized. */ chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); /* * Make sure it's not being used on another CPU and if the chip * supports it also make sure that there is no (not yet serviced) * interrupt in flight at the hardware level. */ __synchronize_irq(desc); #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ * event to happen even now it's being freed, so let's make sure that * is so by doing an extra call to the handler .... * * ( We do this after actually deregistering it, to make sure that a * 'real' IRQ doesn't run in parallel with our fake. ) */ if (action->flags & IRQF_SHARED) { local_irq_save(flags); action->handler(irq, dev_id); local_irq_restore(flags); } #endif /* * The action has already been removed above, but the thread writes * its oneshot mask bit when it completes. Though request_mutex is * held across this which prevents __setup_irq() from handing out * the same bit to a newly requested action. */ if (action->thread) { kthread_stop_put(action->thread); if (action->secondary && action->secondary->thread) kthread_stop_put(action->secondary->thread); } /* Last action releases resources */ if (!desc->action) { /* * Reacquire bus lock as irq_release_resources() might * require it to deallocate resources over the slow bus. */ chip_bus_lock(desc); /* * There is no interrupt on the fly anymore. Deactivate it * completely. */ scoped_guard(raw_spinlock_irqsave, &desc->lock) irq_domain_deactivate_irq(&desc->irq_data); irq_release_resources(desc); chip_bus_sync_unlock(desc); irq_remove_timings(desc); } mutex_unlock(&desc->request_mutex); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); return action; } /** * free_irq - free an interrupt allocated with request_irq * @irq: Interrupt line to free * @dev_id: Device identity to free * * Remove an interrupt handler. The handler is removed and if the interrupt * line is no longer in use by any driver it is disabled. On a shared IRQ * the caller must ensure the interrupt is disabled on the card it drives * before calling this function. The function does not return until any * executing interrupts for this IRQ have completed. * * This function must not be called from interrupt context. * * Returns the devname argument passed to request_irq. */ const void *free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; const char *devname; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return NULL; #ifdef CONFIG_SMP if (WARN_ON(desc->affinity_notify)) desc->affinity_notify = NULL; #endif action = __free_irq(desc, dev_id); if (!action) return NULL; devname = action->name; kfree(action); return devname; } EXPORT_SYMBOL(free_irq); /* This function must be called with desc->lock held */ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) { const char *devname = NULL; desc->istate &= ~IRQS_NMI; if (!WARN_ON(desc->action == NULL)) { irq_pm_remove_action(desc, desc->action); devname = desc->action->name; unregister_handler_proc(irq, desc->action); kfree(desc->action); desc->action = NULL; } irq_settings_clr_disable_unlazy(desc); irq_shutdown_and_deactivate(desc); irq_release_resources(desc); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return devname; } const void *free_nmi(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || WARN_ON(!irq_is_nmi(desc))) return NULL; if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) return NULL; /* NMI still enabled */ if (WARN_ON(desc->depth == 0)) disable_nmi_nosync(irq); guard(raw_spinlock_irqsave)(&desc->lock); irq_nmi_teardown(desc); return __cleanup_nmi(irq, desc); } /** * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts. * If handler is NULL and thread_fn != NULL * the default primary handler is installed. * @thread_fn: Function called from the irq handler thread * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt line * and IRQ handling. From the point this call is made your handler function * may be invoked. Since your handler function must clear any interrupt the * board raises, you must take care both to initialise your hardware and to * set up the interrupt handler in the right order. * * If you want to set up a threaded irq handler for your device then you * need to supply @handler and @thread_fn. @handler is still called in hard * interrupt context and has to check whether the interrupt originates from * the device. If yes it needs to disable the interrupt on the device and * return IRQ_WAKE_THREAD which will wake up the handler thread and run * @thread_fn. This split handler design is necessary to support shared * interrupts. * * @dev_id must be globally unique. Normally the address of the device data * structure is used as the cookie. Since the handler receives this value * it makes sense to use it. * * If your interrupt is shared you must pass a non NULL dev_id as this is * required when freeing the interrupt. * * Flags: * * IRQF_SHARED Interrupt is shared * IRQF_TRIGGER_* Specify active edge(s) or level * IRQF_ONESHOT Run thread_fn with interrupt line masked */ int request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, const char *devname, void *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). * * Also shared interrupts do not go well with disabling auto enable. * The sharing interrupt might request it while it's still disabled * and then wait for interrupts forever. * * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and * it cannot be set along with IRQF_NO_SUSPEND. */ if (((irqflags & IRQF_SHARED) && !dev_id) || ((irqflags & IRQF_SHARED) && (irqflags & IRQF_NO_AUTOEN)) || (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) || ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND))) return -EINVAL; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (!irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; if (!handler) { if (!thread_fn) return -EINVAL; handler = irq_default_primary_handler; } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->thread_fn = thread_fn; action->flags = irqflags; action->name = devname; action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); return retval; } retval = __setup_irq(irq, desc, action); if (retval) { irq_chip_pm_put(&desc->irq_data); kfree(action->secondary); kfree(action); } #ifdef CONFIG_DEBUG_SHIRQ_FIXME if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... * We disable the irq to make sure that a 'real' IRQ doesn't * run in parallel with our fake. */ unsigned long flags; disable_irq(irq); local_irq_save(flags); handler(irq, dev_id); local_irq_restore(flags); enable_irq(irq); } #endif return retval; } EXPORT_SYMBOL(request_threaded_irq); /** * request_any_context_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Threaded handler for threaded interrupts. * @flags: Interrupt type flags * @name: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt line * and IRQ handling. It selects either a hardirq or threaded handling * method depending on the context. * * Returns: On failure, it returns a negative value. On success, it returns either * IRQC_IS_HARDIRQ or IRQC_IS_NESTED. */ int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { struct irq_desc *desc; int ret; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (irq_settings_is_nested_thread(desc)) { ret = request_threaded_irq(irq, NULL, handler, flags, name, dev_id); return !ret ? IRQC_IS_NESTED : ret; } ret = request_irq(irq, handler, flags, name, dev_id); return !ret ? IRQC_IS_HARDIRQ : ret; } EXPORT_SYMBOL_GPL(request_any_context_irq); /** * request_nmi - allocate an interrupt line for NMI delivery * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Threaded handler for threaded interrupts. * @irqflags: Interrupt type flags * @name: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt line * and IRQ handling. It sets up the IRQ line to be handled as an NMI. * * An interrupt line delivering NMIs cannot be shared and IRQ handling * cannot be threaded. * * Interrupt lines requested for NMI delivering must produce per cpu * interrupts and have auto enabling setting disabled. * * @dev_id must be globally unique. Normally the address of the device data * structure is used as the cookie. Since the handler receives this value * it makes sense to use it. * * If the interrupt line cannot be used to deliver NMIs, function will fail * and return a negative value. */ int request_nmi(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *name, void *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; /* NMI cannot be shared, used for Polling */ if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL)) return -EINVAL; if (!(irqflags & IRQF_PERCPU)) return -EINVAL; if (!handler) return -EINVAL; desc = irq_to_desc(irq); if (!desc || (irq_settings_can_autoenable(desc) && !(irqflags & IRQF_NO_AUTOEN)) || !irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc)) || !irq_supports_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING; action->name = name; action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; retval = __setup_irq(irq, desc, action); if (retval) goto err_irq_setup; scoped_guard(raw_spinlock_irqsave, &desc->lock) { /* Setup NMI state */ desc->istate |= IRQS_NMI; retval = irq_nmi_setup(desc); if (retval) { __cleanup_nmi(irq, desc); return -EINVAL; } return 0; } err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: kfree(action); return retval; } void enable_percpu_irq(unsigned int irq, unsigned int type) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { struct irq_desc *desc = scoped_irqdesc; /* * If the trigger type is not specified by the caller, then * use the default for this interrupt. */ type &= IRQ_TYPE_SENSE_MASK; if (type == IRQ_TYPE_NONE) type = irqd_get_trigger_type(&desc->irq_data); if (type != IRQ_TYPE_NONE) { if (__irq_set_trigger(desc, type)) { WARN(1, "failed to set type for IRQ%d\n", irq); return; } } irq_percpu_enable(desc, smp_processor_id()); } } EXPORT_SYMBOL_GPL(enable_percpu_irq); void enable_percpu_nmi(unsigned int irq, unsigned int type) { enable_percpu_irq(irq, type); } /** * irq_percpu_is_enabled - Check whether the per cpu irq is enabled * @irq: Linux irq number to check for * * Must be called from a non migratable context. Returns the enable * state of a per cpu interrupt on the current cpu. */ bool irq_percpu_is_enabled(unsigned int irq) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) return cpumask_test_cpu(smp_processor_id(), scoped_irqdesc->percpu_enabled); return false; } EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); void disable_percpu_irq(unsigned int irq) { scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) irq_percpu_disable(scoped_irqdesc, smp_processor_id()); } EXPORT_SYMBOL_GPL(disable_percpu_irq); void disable_percpu_nmi(unsigned int irq) { disable_percpu_irq(irq); } /* * Internal function to unregister a percpu irqaction. */ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) return NULL; scoped_guard(raw_spinlock_irqsave, &desc->lock) { action = desc->action; if (!action || action->percpu_dev_id != dev_id) { WARN(1, "Trying to free already-free IRQ %d\n", irq); return NULL; } if (!cpumask_empty(desc->percpu_enabled)) { WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq, cpumask_first(desc->percpu_enabled)); return NULL; } /* Found it - now remove it from the list of entries: */ desc->action = NULL; desc->istate &= ~IRQS_NMI; } unregister_handler_proc(irq, action); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; } /** * free_percpu_irq - free an interrupt allocated with request_percpu_irq * @irq: Interrupt line to free * @dev_id: Device identity to free * * Remove a percpu interrupt handler. The handler is removed, but the * interrupt line is not disabled. This must be done on each CPU before * calling this function. The function does not return until any executing * interrupts for this IRQ have completed. * * This function must not be called from interrupt context. */ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; chip_bus_lock(desc); kfree(__free_percpu_irq(irq, dev_id)); chip_bus_sync_unlock(desc); } EXPORT_SYMBOL_GPL(free_percpu_irq); void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; if (WARN_ON(!irq_is_nmi(desc))) return; kfree(__free_percpu_irq(irq, dev_id)); } /** * setup_percpu_irq - setup a per-cpu interrupt * @irq: Interrupt line to setup * @act: irqaction for the interrupt * * Used to statically setup per-cpu interrupts in the early boot process. */ int setup_percpu_irq(unsigned int irq, struct irqaction *act) { struct irq_desc *desc = irq_to_desc(irq); int retval; if (!desc || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) return retval; retval = __setup_irq(irq, desc, act); if (retval) irq_chip_pm_put(&desc->irq_data); return retval; } /** * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources and enables the interrupt on the * local CPU. If the interrupt is supposed to be enabled on other CPUs, it * has to be done on each CPU using enable_percpu_irq(). * * @dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of * that variable. */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (!dev_id) return -EINVAL; desc = irq_to_desc(irq); if (!desc || !irq_settings_can_request(desc) || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; if (flags && flags != IRQF_TIMER) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; action->name = devname; action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); return retval; } retval = __setup_irq(irq, desc, action); if (retval) { irq_chip_pm_put(&desc->irq_data); kfree(action); } return retval; } EXPORT_SYMBOL_GPL(__request_percpu_irq); /** * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @name: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs * have to be setup on each CPU by calling prepare_percpu_nmi() before * being enabled on the same CPU by using enable_percpu_nmi(). * * @dev_id must be globally unique. It is a per-cpu variable, and the * handler gets called with the interrupted CPU's instance of that * variable. * * Interrupt lines requested for NMI delivering should have auto enabling * setting disabled. * * If the interrupt line cannot be used to deliver NMIs, function * will fail returning a negative value. */ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (!handler) return -EINVAL; desc = irq_to_desc(irq); if (!desc || !irq_settings_can_request(desc) || !irq_settings_is_per_cpu_devid(desc) || irq_settings_can_autoenable(desc) || !irq_supports_nmi(desc)) return -EINVAL; /* The line cannot already be NMI */ if (irq_is_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD | IRQF_NOBALANCING; action->name = name; action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; retval = __setup_irq(irq, desc, action); if (retval) goto err_irq_setup; scoped_guard(raw_spinlock_irqsave, &desc->lock) desc->istate |= IRQS_NMI; return 0; err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: kfree(action); return retval; } /** * prepare_percpu_nmi - performs CPU local setup for NMI delivery * @irq: Interrupt line to prepare for NMI delivery * * This call prepares an interrupt line to deliver NMI on the current CPU, * before that interrupt line gets enabled with enable_percpu_nmi(). * * As a CPU local operation, this should be called from non-preemptible * context. * * If the interrupt line cannot be used to deliver NMIs, function will fail * returning a negative value. */ int prepare_percpu_nmi(unsigned int irq) { int ret = -EINVAL; WARN_ON(preemptible()); scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { if (WARN(!irq_is_nmi(scoped_irqdesc), "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq)) return -EINVAL; ret = irq_nmi_setup(scoped_irqdesc); if (ret) pr_err("Failed to setup NMI delivery: irq %u\n", irq); } return ret; } /** * teardown_percpu_nmi - undoes NMI setup of IRQ line * @irq: Interrupt line from which CPU local NMI configuration should be removed * * This call undoes the setup done by prepare_percpu_nmi(). * * IRQ line should not be enabled for the current CPU. * As a CPU local operation, this should be called from non-preemptible * context. */ void teardown_percpu_nmi(unsigned int irq) { WARN_ON(preemptible()); scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { if (WARN_ON(!irq_is_nmi(scoped_irqdesc))) return; irq_nmi_teardown(scoped_irqdesc); } } static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct irq_chip *chip; int err = -EINVAL; do { chip = irq_data_get_irq_chip(data); if (WARN_ON_ONCE(!chip)) return -ENODEV; if (chip->irq_get_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY data = data->parent_data; #else data = NULL; #endif } while (data); if (data) err = chip->irq_get_irqchip_state(data, which, state); return err; } /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: One of IRQCHIP_STATE_* the caller wants to know about * @state: a pointer to a boolean where the state is to be stored * * This call snapshots the internal irqchip state of an interrupt, * returning into @state the bit corresponding to stage @which * * This function should be called with preemption disabled if the interrupt * controller has per-cpu registers. */ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state) { scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); return __irq_get_irqchip_state(data, which, state); } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_get_irqchip_state); /** * irq_set_irqchip_state - set the state of a forwarded interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: State to be restored (one of IRQCHIP_STATE_*) * @val: Value corresponding to @which * * This call sets the internal irqchip state of an interrupt, depending on * the value of @which. * * This function should be called with migration disabled if the interrupt * controller has per-cpu registers. */ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool val) { scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); struct irq_chip *chip; do { chip = irq_data_get_irq_chip(data); if (WARN_ON_ONCE(!chip)) return -ENODEV; if (chip->irq_set_irqchip_state) break; data = irqd_get_parent_data(data); } while (data); if (data) return chip->irq_set_irqchip_state(data, which, val); } return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); /** * irq_has_action - Check whether an interrupt is requested * @irq: The linux irq number * * Returns: A snapshot of the current state */ bool irq_has_action(unsigned int irq) { bool res; rcu_read_lock(); res = irq_desc_has_action(irq_to_desc(irq)); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(irq_has_action); /** * irq_check_status_bit - Check whether bits in the irq descriptor status are set * @irq: The linux irq number * @bitmask: The bitmask to evaluate * * Returns: True if one of the bits in @bitmask is set */ bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) { struct irq_desc *desc; bool res = false; rcu_read_lock(); desc = irq_to_desc(irq); if (desc) res = !!(desc->status_use_accessors & bitmask); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(irq_check_status_bit);
2 797 177 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 /* SPDX-License-Identifier: GPL-2.0 */ /* * This header provides generic wrappers for memory access instrumentation that * the compiler cannot emit for: KASAN, KCSAN, KMSAN. */ #ifndef _LINUX_INSTRUMENTED_H #define _LINUX_INSTRUMENTED_H #include <linux/compiler.h> #include <linux/kasan-checks.h> #include <linux/kcsan-checks.h> #include <linux/kmsan-checks.h> #include <linux/types.h> /** * instrument_read - instrument regular read access * @v: address of access * @size: size of access * * Instrument a regular read access. The instrumentation should be inserted * before the actual read happens. */ static __always_inline void instrument_read(const volatile void *v, size_t size) { kasan_check_read(v, size); kcsan_check_read(v, size); } /** * instrument_write - instrument regular write access * @v: address of access * @size: size of access * * Instrument a regular write access. The instrumentation should be inserted * before the actual write happens. */ static __always_inline void instrument_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_write(v, size); } /** * instrument_read_write - instrument regular read-write access * @v: address of access * @size: size of access * * Instrument a regular write access. The instrumentation should be inserted * before the actual write happens. */ static __always_inline void instrument_read_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_read_write(v, size); } /** * instrument_atomic_read - instrument atomic read access * @v: address of access * @size: size of access * * Instrument an atomic read access. The instrumentation should be inserted * before the actual read happens. */ static __always_inline void instrument_atomic_read(const volatile void *v, size_t size) { kasan_check_read(v, size); kcsan_check_atomic_read(v, size); } /** * instrument_atomic_write - instrument atomic write access * @v: address of access * @size: size of access * * Instrument an atomic write access. The instrumentation should be inserted * before the actual write happens. */ static __always_inline void instrument_atomic_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_atomic_write(v, size); } /** * instrument_atomic_read_write - instrument atomic read-write access * @v: address of access * @size: size of access * * Instrument an atomic read-write access. The instrumentation should be * inserted before the actual write happens. */ static __always_inline void instrument_atomic_read_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_atomic_read_write(v, size); } /** * instrument_copy_to_user - instrument reads of copy_to_user * @to: destination address * @from: source address * @n: number of bytes to copy * * Instrument reads from kernel memory, that are due to copy_to_user (and * variants). The instrumentation must be inserted before the accesses. */ static __always_inline void instrument_copy_to_user(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); kcsan_check_read(from, n); kmsan_copy_to_user(to, from, n, 0); } /** * instrument_copy_from_user_before - add instrumentation before copy_from_user * @to: destination address * @from: source address * @n: number of bytes to copy * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted before the accesses. */ static __always_inline void instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); kcsan_check_write(to, n); } /** * instrument_copy_from_user_after - add instrumentation after copy_from_user * @to: destination address * @from: source address * @n: number of bytes to copy * @left: number of bytes not copied (as returned by copy_from_user) * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted after the accesses. */ static __always_inline void instrument_copy_from_user_after(const void *to, const void __user *from, unsigned long n, unsigned long left) { kmsan_unpoison_memory(to, n - left); } /** * instrument_memcpy_before - add instrumentation before non-instrumented memcpy * @to: destination address * @from: source address * @n: number of bytes to copy * * Instrument memory accesses that happen in custom memcpy implementations. The * instrumentation should be inserted before the memcpy call. */ static __always_inline void instrument_memcpy_before(void *to, const void *from, unsigned long n) { kasan_check_write(to, n); kasan_check_read(from, n); kcsan_check_write(to, n); kcsan_check_read(from, n); } /** * instrument_memcpy_after - add instrumentation after non-instrumented memcpy * @to: destination address * @from: source address * @n: number of bytes to copy * @left: number of bytes not copied (if known) * * Instrument memory accesses that happen in custom memcpy implementations. The * instrumentation should be inserted after the memcpy call. */ static __always_inline void instrument_memcpy_after(void *to, const void *from, unsigned long n, unsigned long left) { kmsan_memmove(to, from, n - left); } /** * instrument_get_user() - add instrumentation to get_user()-like macros * @to: destination variable, may not be address-taken * * get_user() and friends are fragile, so it may depend on the implementation * whether the instrumentation happens before or after the data is copied from * the userspace. */ #define instrument_get_user(to) \ ({ \ u64 __tmp = (u64)(to); \ kmsan_unpoison_memory(&__tmp, sizeof(__tmp)); \ to = __tmp; \ }) /** * instrument_put_user() - add instrumentation to put_user()-like macros * @from: source address * @ptr: userspace pointer to copy to * @size: number of bytes to copy * * put_user() and friends are fragile, so it may depend on the implementation * whether the instrumentation happens before or after the data is copied from * the userspace. */ #define instrument_put_user(from, ptr, size) \ ({ \ kmsan_copy_to_user(ptr, &from, sizeof(from), 0); \ }) #endif /* _LINUX_INSTRUMENTED_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2020 ARM Ltd. */ #ifndef __ASM_MTE_H #define __ASM_MTE_H #include <asm/compiler.h> #include <asm/mte-def.h> #ifndef __ASSEMBLY__ #include <linux/bitfield.h> #include <linux/kasan-enabled.h> #include <linux/page-flags.h> #include <linux/sched.h> #include <linux/types.h> #include <asm/pgtable-types.h> void mte_clear_page_tags(void *addr); unsigned long mte_copy_tags_from_user(void *to, const void __user *from, unsigned long n); unsigned long mte_copy_tags_to_user(void __user *to, void *from, unsigned long n); int mte_save_tags(struct page *page); void mte_save_page_tags(const void *page_addr, void *tag_storage); void mte_restore_tags(swp_entry_t entry, struct page *page); void mte_restore_page_tags(void *page_addr, const void *tag_storage); void mte_invalidate_tags(int type, pgoff_t offset); void mte_invalidate_tags_area(int type); void *mte_allocate_tag_storage(void); void mte_free_tag_storage(char *storage); #ifdef CONFIG_ARM64_MTE /* track which pages have valid allocation tags */ #define PG_mte_tagged PG_arch_2 /* simple lock to avoid multiple threads tagging the same page */ #define PG_mte_lock PG_arch_3 static inline void set_page_mte_tagged(struct page *page) { VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); /* * Ensure that the tags written prior to this function are visible * before the page flags update. */ smp_wmb(); set_bit(PG_mte_tagged, &page->flags); } static inline bool page_mte_tagged(struct page *page) { bool ret = test_bit(PG_mte_tagged, &page->flags); VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); /* * If the page is tagged, ensure ordering with a likely subsequent * read of the tags. */ if (ret) smp_rmb(); return ret; } /* * Lock the page for tagging and return 'true' if the page can be tagged, * 'false' if already tagged. PG_mte_tagged is never cleared and therefore the * locking only happens once for page initialisation. * * The page MTE lock state: * * Locked: PG_mte_lock && !PG_mte_tagged * Unlocked: !PG_mte_lock || PG_mte_tagged * * Acquire semantics only if the page is tagged (returning 'false'). */ static inline bool try_page_mte_tagging(struct page *page) { VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); if (!test_and_set_bit(PG_mte_lock, &page->flags)) return true; /* * The tags are either being initialised or may have been initialised * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. */ smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged)); return false; } void mte_zero_clear_page_tags(void *addr); void mte_sync_tags(pte_t pte, unsigned int nr_pages); void mte_copy_page_tags(void *kto, const void *kfrom); void mte_thread_init_user(void); void mte_thread_switch(struct task_struct *next); void mte_cpu_setup(void); void mte_suspend_enter(void); void mte_suspend_exit(void); long set_mte_ctrl(struct task_struct *task, unsigned long arg); long get_mte_ctrl(struct task_struct *task); int mte_ptrace_copy_tags(struct task_struct *child, long request, unsigned long addr, unsigned long data); size_t mte_probe_user_range(const char __user *uaddr, size_t size); #else /* CONFIG_ARM64_MTE */ /* unused if !CONFIG_ARM64_MTE, silence the compiler */ #define PG_mte_tagged 0 static inline void set_page_mte_tagged(struct page *page) { } static inline bool page_mte_tagged(struct page *page) { return false; } static inline bool try_page_mte_tagging(struct page *page) { return false; } static inline void mte_zero_clear_page_tags(void *addr) { } static inline void mte_sync_tags(pte_t pte, unsigned int nr_pages) { } static inline void mte_copy_page_tags(void *kto, const void *kfrom) { } static inline void mte_thread_init_user(void) { } static inline void mte_thread_switch(struct task_struct *next) { } static inline void mte_suspend_enter(void) { } static inline void mte_suspend_exit(void) { } static inline long set_mte_ctrl(struct task_struct *task, unsigned long arg) { return 0; } static inline long get_mte_ctrl(struct task_struct *task) { return 0; } static inline int mte_ptrace_copy_tags(struct task_struct *child, long request, unsigned long addr, unsigned long data) { return -EIO; } #endif /* CONFIG_ARM64_MTE */ #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_ARM64_MTE) static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) { VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); /* * Ensure that the tags written prior to this function are visible * before the folio flags update. */ smp_wmb(); set_bit(PG_mte_tagged, &folio->flags); } static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) { bool ret = test_bit(PG_mte_tagged, &folio->flags); VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); /* * If the folio is tagged, ensure ordering with a likely subsequent * read of the tags. */ if (ret) smp_rmb(); return ret; } static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) { VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); if (!test_and_set_bit(PG_mte_lock, &folio->flags)) return true; /* * The tags are either being initialised or may have been initialised * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. */ smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged)); return false; } #else static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) { } static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) { return false; } static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) { return false; } #endif static inline void mte_disable_tco_entry(struct task_struct *task) { if (!system_supports_mte()) return; /* * Re-enable tag checking (TCO set on exception entry). This is only * necessary if MTE is enabled in either the kernel or the userspace * task in synchronous or asymmetric mode (SCTLR_EL1.TCF0 bit 0 is set * for both). With MTE disabled in the kernel and disabled or * asynchronous in userspace, tag check faults (including in uaccesses) * are not reported, therefore there is no need to re-enable checking. * This is beneficial on microarchitectures where re-enabling TCO is * expensive. */ if (kasan_hw_tags_enabled() || (task->thread.sctlr_user & (1UL << SCTLR_EL1_TCF0_SHIFT))) asm volatile(SET_PSTATE_TCO(0)); } #ifdef CONFIG_KASAN_HW_TAGS void mte_check_tfsr_el1(void); static inline void mte_check_tfsr_entry(void) { if (!kasan_hw_tags_enabled()) return; mte_check_tfsr_el1(); } static inline void mte_check_tfsr_exit(void) { if (!kasan_hw_tags_enabled()) return; /* * The asynchronous faults are sync'ed automatically with * TFSR_EL1 on kernel entry but for exit an explicit dsb() * is required. */ dsb(nsh); isb(); mte_check_tfsr_el1(); } #else static inline void mte_check_tfsr_el1(void) { } static inline void mte_check_tfsr_entry(void) { } static inline void mte_check_tfsr_exit(void) { } #endif /* CONFIG_KASAN_HW_TAGS */ #endif /* __ASSEMBLY__ */ #endif /* __ASM_MTE_H */
628 10 10 8 1 8 631 630 81 80 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 // SPDX-License-Identifier: GPL-2.0-only /* * Contains CPU feature definitions * * Copyright (C) 2015 ARM Ltd. * * A note for the weary kernel hacker: the code here is confusing and hard to * follow! That's partly because it's solving a nasty problem, but also because * there's a little bit of over-abstraction that tends to obscure what's going * on behind a maze of helper functions and macros. * * The basic problem is that hardware folks have started gluing together CPUs * with distinct architectural features; in some cases even creating SoCs where * user-visible instructions are available only on a subset of the available * cores. We try to address this by snapshotting the feature registers of the * boot CPU and comparing these with the feature registers of each secondary * CPU when bringing them up. If there is a mismatch, then we update the * snapshot state to indicate the lowest-common denominator of the feature, * known as the "safe" value. This snapshot state can be queried to view the * "sanitised" value of a feature register. * * The sanitised register values are used to decide which capabilities we * have in the system. These may be in the form of traditional "hwcaps" * advertised to userspace or internal "cpucaps" which are used to configure * things like alternative patching and static keys. While a feature mismatch * may result in a TAINT_CPU_OUT_OF_SPEC kernel taint, a capability mismatch * may prevent a CPU from being onlined at all. * * Some implementation details worth remembering: * * - Mismatched features are *always* sanitised to a "safe" value, which * usually indicates that the feature is not supported. * * - A mismatched feature marked with FTR_STRICT will cause a "SANITY CHECK" * warning when onlining an offending CPU and the kernel will be tainted * with TAINT_CPU_OUT_OF_SPEC. * * - Features marked as FTR_VISIBLE have their sanitised value visible to * userspace. FTR_VISIBLE features in registers that are only visible * to EL0 by trapping *must* have a corresponding HWCAP so that late * onlining of CPUs cannot lead to features disappearing at runtime. * * - A "feature" is typically a 4-bit register field. A "capability" is the * high-level description derived from the sanitised field value. * * - Read the Arm ARM (DDI 0487F.a) section D13.1.3 ("Principles of the ID * scheme for fields in ID registers") to understand when feature fields * may be signed or unsigned (FTR_SIGNED and FTR_UNSIGNED accordingly). * * - KVM exposes its own view of the feature registers to guest operating * systems regardless of FTR_VISIBLE. This is typically driven from the * sanitised register values to allow virtual CPUs to be migrated between * arbitrary physical CPUs, but some features not present on the host are * also advertised and emulated. Look at sys_reg_descs[] for the gory * details. * * - If the arm64_ftr_bits[] for a register has a missing field, then this * field is treated as STRICT RES0, including for read_sanitised_ftr_reg(). * This is stronger than FTR_HIDDEN and can be used to hide features from * KVM guests. */ #define pr_fmt(fmt) "CPU features: " fmt #include <linux/bsearch.h> #include <linux/cpumask.h> #include <linux/crash_dump.h> #include <linux/kstrtox.h> #include <linux/sort.h> #include <linux/stop_machine.h> #include <linux/sysfs.h> #include <linux/types.h> #include <linux/minmax.h> #include <linux/mm.h> #include <linux/cpu.h> #include <linux/kasan.h> #include <linux/percpu.h> #include <linux/sched/isolation.h> #include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/cpu_ops.h> #include <asm/fpsimd.h> #include <asm/hwcap.h> #include <asm/insn.h> #include <asm/kvm_host.h> #include <asm/mmu_context.h> #include <asm/mte.h> #include <asm/hypervisor.h> #include <asm/processor.h> #include <asm/smp.h> #include <asm/sysreg.h> #include <asm/traps.h> #include <asm/vectors.h> #include <asm/virt.h> /* Kernel representation of AT_HWCAP and AT_HWCAP2 */ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; #ifdef CONFIG_COMPAT #define COMPAT_ELF_HWCAP_DEFAULT \ (COMPAT_HWCAP_HALF|COMPAT_HWCAP_THUMB|\ COMPAT_HWCAP_FAST_MULT|COMPAT_HWCAP_EDSP|\ COMPAT_HWCAP_TLS|COMPAT_HWCAP_IDIV|\ COMPAT_HWCAP_LPAE) unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT; unsigned int compat_elf_hwcap2 __read_mostly; unsigned int compat_elf_hwcap3 __read_mostly; #endif DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS); EXPORT_SYMBOL(system_cpucaps); static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NCAPS]; DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS); /* * arm64_use_ng_mappings must be placed in the .data section, otherwise it * ends up in the .bss section where it is initialized in early_map_kernel() * after the MMU (with the idmap) was enabled. create_init_idmap() - which * runs before early_map_kernel() and reads the variable via PTE_MAYBE_NG - * may end up generating an incorrect idmap page table attributes. */ bool arm64_use_ng_mappings __read_mostly = false; EXPORT_SYMBOL(arm64_use_ng_mappings); DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; /* * Permit PER_LINUX32 and execve() of 32-bit binaries even if not all CPUs * support it? */ static bool __read_mostly allow_mismatched_32bit_el0; /* * Static branch enabled only if allow_mismatched_32bit_el0 is set and we have * seen at least one CPU capable of 32-bit EL0. */ DEFINE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); /* * Mask of CPUs supporting 32-bit EL0. * Only valid if arm64_mismatched_32bit_el0 is enabled. */ static cpumask_var_t cpu_32bit_el0_mask __cpumask_var_read_mostly; void dump_cpu_features(void) { /* file-wide pr_fmt adds "CPU features: " prefix */ pr_emerg("0x%*pb\n", ARM64_NCAPS, &system_cpucaps); } #define __ARM64_MAX_POSITIVE(reg, field) \ ((reg##_##field##_SIGNED ? \ BIT(reg##_##field##_WIDTH - 1) : \ BIT(reg##_##field##_WIDTH)) - 1) #define __ARM64_MIN_NEGATIVE(reg, field) BIT(reg##_##field##_WIDTH - 1) #define __ARM64_CPUID_FIELDS(reg, field, min_value, max_value) \ .sys_reg = SYS_##reg, \ .field_pos = reg##_##field##_SHIFT, \ .field_width = reg##_##field##_WIDTH, \ .sign = reg##_##field##_SIGNED, \ .min_field_value = min_value, \ .max_field_value = max_value, /* * ARM64_CPUID_FIELDS() encodes a field with a range from min_value to * an implicit maximum that depends on the sign-ess of the field. * * An unsigned field will be capped at all ones, while a signed field * will be limited to the positive half only. */ #define ARM64_CPUID_FIELDS(reg, field, min_value) \ __ARM64_CPUID_FIELDS(reg, field, \ SYS_FIELD_VALUE(reg, field, min_value), \ __ARM64_MAX_POSITIVE(reg, field)) /* * ARM64_CPUID_FIELDS_NEG() encodes a field with a range from an * implicit minimal value to max_value. This should be used when * matching a non-implemented property. */ #define ARM64_CPUID_FIELDS_NEG(reg, field, max_value) \ __ARM64_CPUID_FIELDS(reg, field, \ __ARM64_MIN_NEGATIVE(reg, field), \ SYS_FIELD_VALUE(reg, field, max_value)) #define __ARM64_FTR_BITS(SIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ { \ .sign = SIGNED, \ .visible = VISIBLE, \ .strict = STRICT, \ .type = TYPE, \ .shift = SHIFT, \ .width = WIDTH, \ .safe_val = SAFE_VAL, \ } /* Define a feature with unsigned values */ #define ARM64_FTR_BITS(VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ __ARM64_FTR_BITS(FTR_UNSIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) /* Define a feature with a signed value */ #define S_ARM64_FTR_BITS(VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \ __ARM64_FTR_BITS(FTR_SIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) #define ARM64_FTR_END \ { \ .width = 0, \ } static void cpu_enable_cnp(struct arm64_cpu_capabilities const *cap); static bool __system_matches_cap(unsigned int n); /* * NOTE: Any changes to the visibility of features should be kept in * sync with the documentation of the CPU feature register ABI. */ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RNDR_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TLB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_FHM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_DP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM4_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RDM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_ATOMIC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_CRC32_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA1_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_AES_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SPECRES_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FRINTTS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPI_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FCMA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_JSCVT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_API_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_APA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_LUT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CSSC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRFM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CLRBHB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_MOPS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), FTR_STRICT, FTR_EXACT, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_GPA3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRES_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_WFxT_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_DIT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AMU_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_MPAM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SEL2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SVE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_RAS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_GIC_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AdvSIMD_SHIFT, 4, ID_AA64PFR0_EL1_AdvSIMD_NI), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_FP_SHIFT, 4, ID_AA64PFR0_EL1_FP_NI), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL1_SHIFT, 4, ID_AA64PFR0_EL1_EL1_IMP), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL0_SHIFT, 4, ID_AA64PFR0_EL1_EL0_IMP), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_DF2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_frac_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_RAS_frac_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_SHIFT, 4, ID_AA64PFR1_EL1_MTE_NI), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SSBS_SHIFT, 4, ID_AA64PFR1_EL1_SSBS_NI), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_BT_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr2[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_FPMR_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F16MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_B16B16_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_EltPerm_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_AES_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_LUTv2_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMEver_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I64_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F64F64_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I32_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_B16B16_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F16_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F8F16_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F8F32_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F32_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_B16F32_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_BI32I32_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8FMA_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP4_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP2_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SBitPerm_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_AES_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SFEXPA_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_STMOP_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMOP4_SHIFT, 1, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64fpfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8CVT_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8FMA_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP4_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP2_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM8_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM4_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E4M3_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E5M2_SHIFT, 1, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ECV_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_FGT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_EXS_SHIFT, 4, 0), /* * Page size not being supported at Stage-2 is not fatal. You * just give up KVM if PAGE_SIZE isn't supported there. Go fix * your favourite nesting hypervisor. * * There is a small corner case where the hypervisor explicitly * advertises a given granule size at Stage-2 (value 2) on some * vCPUs, and uses the fallback to Stage-1 (value 0) for other * vCPUs. Although this is not forbidden by the architecture, it * indicates that the hypervisor is being silly (or buggy). * * We make no effort to cope with this and pretend that if these * fields are inconsistent across vCPUs, then it isn't worth * trying to bring KVM up. */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT, 4, 1), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN64_2_SHIFT, 4, 1), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN16_2_SHIFT, 4, 1), /* * We already refuse to boot CPUs that don't support our configured * page size, so we can only detect mismatches for a page size other * than the one we're currently using. Unfortunately, SoCs like this * exist in the wild so, even though we don't like it, we'll have to go * along with it and treat them as non-strict. */ S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN4_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN4_NI), S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN64_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN64_NI), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN16_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN16_NI), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_BIGENDEL0_SHIFT, 4, 0), /* Linux shouldn't care about secure memory */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_SNSMEM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_BIGEND_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ASIDBITS_SHIFT, 4, 0), /* * Differing PARange is fine as long as all peripherals and memory are mapped * within the minimum PARange of all CPUs */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_PARANGE_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ECBHB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TIDCP1_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HCX_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ETS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TWED_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_XNX_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64MMFR1_EL1_SpecSEI_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_PAN_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_LO_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HPDS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_VH_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_VMIDBits_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_E0PD_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_EVT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_BBM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_TTL_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_FWB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_IDS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_AT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_ST_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_NV_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_CCIDX_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_VARange_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_IESB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_LSM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_UAO_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_CnP_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE), FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_SCTLRX_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = { S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_NV_frac_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_ctr[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RES1 */ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DIC_SHIFT, 1, 1), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IDC_SHIFT, 1, 1), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_CWG_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_ERG_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DminLine_SHIFT, 4, 1), /* * Linux can handle differing I-cache policies. Userspace JITs will * make use of *minLine. * If we have differing I-cache policies, report it as the weakest - VIPT. */ ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, CTR_EL0_L1Ip_SHIFT, 2, CTR_EL0_L1Ip_VIPT), /* L1Ip */ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IminLine_SHIFT, 4, 0), ARM64_FTR_END, }; static struct arm64_ftr_override __ro_after_init no_override = { }; struct arm64_ftr_reg arm64_ftr_reg_ctrel0 = { .name = "SYS_CTR_EL0", .ftr_bits = ftr_ctr, .override = &no_override, }; static const struct arm64_ftr_bits ftr_id_mmfr0[] = { S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_InnerShr_SHIFT, 4, 0xf), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_FCSE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_AuxReg_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_TCM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_ShareLvl_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_OuterShr_SHIFT, 4, 0xf), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_PMSA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_VMSA_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_DoubleLock_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_PMSVer_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_CTX_CMPs_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_WRPs_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_BRPs_SHIFT, 4, 0), /* * We can instantiate multiple PMU instances with different levels * of support. */ S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_EL1_DebugVer_SHIFT, 4, 0x6), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_mvfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPRound_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPShVec_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPSqrt_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPDivide_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPTrap_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPDP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPSP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_SIMDReg_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_mvfr1[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDFMAC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_FPHP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDHP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDSP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDInt_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDLS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_FPDNaN_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_FPFtZ_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_mvfr2[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_EL1_FPMisc_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_EL1_SIMDMisc_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_dczid[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, DCZID_EL0_DZP_SHIFT, 1, 1), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, DCZID_EL0_BS_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_gmid[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, GMID_EL1_BS_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_isar0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Divide_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Debug_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Coproc_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_CmpBranch_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_BitField_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_BitCount_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Swap_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_isar5[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_RDM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_CRC32_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_SHA2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_SHA1_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_AES_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_SEVL_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_mmfr4[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_EVT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_CCIDX_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_LSM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_HPDS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_CnP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_XNX_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_AC2_SHIFT, 4, 0), /* * SpecSEI = 1 indicates that the PE might generate an SError on an * external abort on speculative read. It is safe to assume that an * SError might be generated than it will not be. Hence it has been * classified as FTR_HIGHER_SAFE. */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_MMFR4_EL1_SpecSEI_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_isar4[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_SWP_frac_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_PSR_M_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_SynchPrim_frac_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_Barrier_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_SMC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_Writeback_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_WithShifts_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_Unpriv_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_mmfr5[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR5_EL1_ETS_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_isar6[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_BF16_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_SPECRES_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_SB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_FHM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_DP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_JSCVT_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_DIT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_CSV2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State1_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State0_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_pfr1[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_GIC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Virt_frac_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Sec_frac_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_GenTimer_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Virtualization_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_MProgMod_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Security_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_ProgMod_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_pfr2[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_EL1_SSBS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_EL1_CSV3_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_dfr0[] = { /* [31:28] TraceFilt */ S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_EL1_PerfMon_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MProfDbg_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MMapTrc_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopTrc_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MMapDbg_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopSDbg_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopDbg_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_dfr1[] = { S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR1_EL1_MTPMU_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_mpamidr[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PMG_MAX_SHIFT, MPAMIDR_EL1_PMG_MAX_WIDTH, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_VPMR_MAX_SHIFT, MPAMIDR_EL1_VPMR_MAX_WIDTH, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_HAS_HCR_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PARTID_MAX_SHIFT, MPAMIDR_EL1_PARTID_MAX_WIDTH, 0), ARM64_FTR_END, }; /* * Common ftr bits for a 32bit register with all hidden, strict * attributes, with 4bit feature fields and a default safe value of * 0. Covers the following 32bit registers: * id_isar[1-3], id_mmfr[1-3] */ static const struct arm64_ftr_bits ftr_generic_32bits[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), ARM64_FTR_END, }; /* Table for a single 32bit feature value */ static const struct arm64_ftr_bits ftr_single32[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 0, 32, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_raz[] = { ARM64_FTR_END, }; #define __ARM64_FTR_REG_OVERRIDE(id_str, id, table, ovr) { \ .sys_id = id, \ .reg = &(struct arm64_ftr_reg){ \ .name = id_str, \ .override = (ovr), \ .ftr_bits = &((table)[0]), \ }} #define ARM64_FTR_REG_OVERRIDE(id, table, ovr) \ __ARM64_FTR_REG_OVERRIDE(#id, id, table, ovr) #define ARM64_FTR_REG(id, table) \ __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) struct arm64_ftr_override __read_mostly id_aa64mmfr0_override; struct arm64_ftr_override __read_mostly id_aa64mmfr1_override; struct arm64_ftr_override __read_mostly id_aa64mmfr2_override; struct arm64_ftr_override __read_mostly id_aa64pfr0_override; struct arm64_ftr_override __read_mostly id_aa64pfr1_override; struct arm64_ftr_override __read_mostly id_aa64zfr0_override; struct arm64_ftr_override __read_mostly id_aa64smfr0_override; struct arm64_ftr_override __read_mostly id_aa64isar1_override; struct arm64_ftr_override __read_mostly id_aa64isar2_override; struct arm64_ftr_override __read_mostly arm64_sw_feature_override; static const struct __ftr_reg_entry { u32 sys_id; struct arm64_ftr_reg *reg; } arm64_ftr_regs[] = { /* Op1 = 0, CRn = 0, CRm = 1 */ ARM64_FTR_REG(SYS_ID_PFR0_EL1, ftr_id_pfr0), ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_id_pfr1), ARM64_FTR_REG(SYS_ID_DFR0_EL1, ftr_id_dfr0), ARM64_FTR_REG(SYS_ID_MMFR0_EL1, ftr_id_mmfr0), ARM64_FTR_REG(SYS_ID_MMFR1_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_MMFR2_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_MMFR3_EL1, ftr_generic_32bits), /* Op1 = 0, CRn = 0, CRm = 2 */ ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_id_isar0), ARM64_FTR_REG(SYS_ID_ISAR1_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR2_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR3_EL1, ftr_generic_32bits), ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_id_isar4), ARM64_FTR_REG(SYS_ID_ISAR5_EL1, ftr_id_isar5), ARM64_FTR_REG(SYS_ID_MMFR4_EL1, ftr_id_mmfr4), ARM64_FTR_REG(SYS_ID_ISAR6_EL1, ftr_id_isar6), /* Op1 = 0, CRn = 0, CRm = 3 */ ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_mvfr0), ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_mvfr1), ARM64_FTR_REG(SYS_MVFR2_EL1, ftr_mvfr2), ARM64_FTR_REG(SYS_ID_PFR2_EL1, ftr_id_pfr2), ARM64_FTR_REG(SYS_ID_DFR1_EL1, ftr_id_dfr1), ARM64_FTR_REG(SYS_ID_MMFR5_EL1, ftr_id_mmfr5), /* Op1 = 0, CRn = 0, CRm = 4 */ ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0, &id_aa64pfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1, &id_aa64pfr1_override), ARM64_FTR_REG(SYS_ID_AA64PFR2_EL1, ftr_id_aa64pfr2), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0, &id_aa64zfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0, &id_aa64smfr0_override), ARM64_FTR_REG(SYS_ID_AA64FPFR0_EL1, ftr_id_aa64fpfr0), /* Op1 = 0, CRn = 0, CRm = 5 */ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0), ARM64_FTR_REG(SYS_ID_AA64DFR1_EL1, ftr_raz), /* Op1 = 0, CRn = 0, CRm = 6 */ ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1, &id_aa64isar1_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2, &id_aa64isar2_override), ARM64_FTR_REG(SYS_ID_AA64ISAR3_EL1, ftr_id_aa64isar3), /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0, &id_aa64mmfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1, &id_aa64mmfr1_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2, &id_aa64mmfr2_override), ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3), ARM64_FTR_REG(SYS_ID_AA64MMFR4_EL1, ftr_id_aa64mmfr4), /* Op1 = 0, CRn = 10, CRm = 4 */ ARM64_FTR_REG(SYS_MPAMIDR_EL1, ftr_mpamidr), /* Op1 = 1, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), /* Op1 = 3, CRn = 0, CRm = 0 */ { SYS_CTR_EL0, &arm64_ftr_reg_ctrel0 }, ARM64_FTR_REG(SYS_DCZID_EL0, ftr_dczid), /* Op1 = 3, CRn = 14, CRm = 0 */ ARM64_FTR_REG(SYS_CNTFRQ_EL0, ftr_single32), }; static int search_cmp_ftr_reg(const void *id, const void *regp) { return (int)(unsigned long)id - (int)((const struct __ftr_reg_entry *)regp)->sys_id; } /* * get_arm64_ftr_reg_nowarn - Looks up a feature register entry using * its sys_reg() encoding. With the array arm64_ftr_regs sorted in the * ascending order of sys_id, we use binary search to find a matching * entry. * * returns - Upon success, matching ftr_reg entry for id. * - NULL on failure. It is upto the caller to decide * the impact of a failure. */ static struct arm64_ftr_reg *get_arm64_ftr_reg_nowarn(u32 sys_id) { const struct __ftr_reg_entry *ret; ret = bsearch((const void *)(unsigned long)sys_id, arm64_ftr_regs, ARRAY_SIZE(arm64_ftr_regs), sizeof(arm64_ftr_regs[0]), search_cmp_ftr_reg); if (ret) return ret->reg; return NULL; } /* * get_arm64_ftr_reg - Looks up a feature register entry using * its sys_reg() encoding. This calls get_arm64_ftr_reg_nowarn(). * * returns - Upon success, matching ftr_reg entry for id. * - NULL on failure but with an WARN_ON(). */ struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id) { struct arm64_ftr_reg *reg; reg = get_arm64_ftr_reg_nowarn(sys_id); /* * Requesting a non-existent register search is an error. Warn * and let the caller handle it. */ WARN_ON(!reg); return reg; } static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg, s64 ftr_val) { u64 mask = arm64_ftr_mask(ftrp); reg &= ~mask; reg |= (ftr_val << ftrp->shift) & mask; return reg; } s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { s64 ret = 0; switch (ftrp->type) { case FTR_EXACT: ret = ftrp->safe_val; break; case FTR_LOWER_SAFE: ret = min(new, cur); break; case FTR_HIGHER_OR_ZERO_SAFE: if (!cur || !new) break; fallthrough; case FTR_HIGHER_SAFE: ret = max(new, cur); break; default: BUG(); } return ret; } static void __init sort_ftr_regs(void) { unsigned int i; for (i = 0; i < ARRAY_SIZE(arm64_ftr_regs); i++) { const struct arm64_ftr_reg *ftr_reg = arm64_ftr_regs[i].reg; const struct arm64_ftr_bits *ftr_bits = ftr_reg->ftr_bits; unsigned int j = 0; /* * Features here must be sorted in descending order with respect * to their shift values and should not overlap with each other. */ for (; ftr_bits->width != 0; ftr_bits++, j++) { unsigned int width = ftr_reg->ftr_bits[j].width; unsigned int shift = ftr_reg->ftr_bits[j].shift; unsigned int prev_shift; WARN((shift + width) > 64, "%s has invalid feature at shift %d\n", ftr_reg->name, shift); /* * Skip the first feature. There is nothing to * compare against for now. */ if (j == 0) continue; prev_shift = ftr_reg->ftr_bits[j - 1].shift; WARN((shift + width) > prev_shift, "%s has feature overlap at shift %d\n", ftr_reg->name, shift); } /* * Skip the first register. There is nothing to * compare against for now. */ if (i == 0) continue; /* * Registers here must be sorted in ascending order with respect * to sys_id for subsequent binary search in get_arm64_ftr_reg() * to work correctly. */ BUG_ON(arm64_ftr_regs[i].sys_id <= arm64_ftr_regs[i - 1].sys_id); } } /* * Initialise the CPU feature register from Boot CPU values. * Also initiliases the strict_mask for the register. * Any bits that are not covered by an arm64_ftr_bits entry are considered * RES0 for the system-wide value, and must strictly match. */ static void init_cpu_ftr_reg(u32 sys_reg, u64 new) { u64 val = 0; u64 strict_mask = ~0x0ULL; u64 user_mask = 0; u64 valid_mask = 0; const struct arm64_ftr_bits *ftrp; struct arm64_ftr_reg *reg = get_arm64_ftr_reg(sys_reg); if (!reg) return; for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) { u64 ftr_mask = arm64_ftr_mask(ftrp); s64 ftr_new = arm64_ftr_value(ftrp, new); s64 ftr_ovr = arm64_ftr_value(ftrp, reg->override->val); if ((ftr_mask & reg->override->mask) == ftr_mask) { s64 tmp = arm64_ftr_safe_value(ftrp, ftr_ovr, ftr_new); char *str = NULL; if (ftr_ovr != tmp) { /* Unsafe, remove the override */ reg->override->mask &= ~ftr_mask; reg->override->val &= ~ftr_mask; tmp = ftr_ovr; str = "ignoring override"; } else if (ftr_new != tmp) { /* Override was valid */ ftr_new = tmp; str = "forced"; } else { /* Override was the safe value */ str = "already set"; } pr_warn("%s[%d:%d]: %s to %llx\n", reg->name, ftrp->shift + ftrp->width - 1, ftrp->shift, str, tmp & (BIT(ftrp->width) - 1)); } else if ((ftr_mask & reg->override->val) == ftr_mask) { reg->override->val &= ~ftr_mask; pr_warn("%s[%d:%d]: impossible override, ignored\n", reg->name, ftrp->shift + ftrp->width - 1, ftrp->shift); } val = arm64_ftr_set_value(ftrp, val, ftr_new); valid_mask |= ftr_mask; if (!ftrp->strict) strict_mask &= ~ftr_mask; if (ftrp->visible) user_mask |= ftr_mask; else reg->user_val = arm64_ftr_set_value(ftrp, reg->user_val, ftrp->safe_val); } val &= valid_mask; reg->sys_val = val; reg->strict_mask = strict_mask; reg->user_mask = user_mask; } extern const struct arm64_cpu_capabilities arm64_errata[]; static const struct arm64_cpu_capabilities arm64_features[]; static void __init init_cpucap_indirect_list_from_array(const struct arm64_cpu_capabilities *caps) { for (; caps->matches; caps++) { if (WARN(caps->capability >= ARM64_NCAPS, "Invalid capability %d\n", caps->capability)) continue; if (WARN(cpucap_ptrs[caps->capability], "Duplicate entry for capability %d\n", caps->capability)) continue; cpucap_ptrs[caps->capability] = caps; } } static void __init init_cpucap_indirect_list(void) { init_cpucap_indirect_list_from_array(arm64_features); init_cpucap_indirect_list_from_array(arm64_errata); } static void __init setup_boot_cpu_capabilities(void); static void init_32bit_cpu_features(struct cpuinfo_32bit *info) { init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1); init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0); init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1); init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2); init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3); init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4); init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5); init_cpu_ftr_reg(SYS_ID_ISAR6_EL1, info->reg_id_isar6); init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0); init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1); init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2); init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3); init_cpu_ftr_reg(SYS_ID_MMFR4_EL1, info->reg_id_mmfr4); init_cpu_ftr_reg(SYS_ID_MMFR5_EL1, info->reg_id_mmfr5); init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0); init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1); init_cpu_ftr_reg(SYS_ID_PFR2_EL1, info->reg_id_pfr2); init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0); init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1); init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2); } #ifdef CONFIG_ARM64_PSEUDO_NMI static bool enable_pseudo_nmi; static int __init early_enable_pseudo_nmi(char *p) { return kstrtobool(p, &enable_pseudo_nmi); } early_param("irqchip.gicv3_pseudo_nmi", early_enable_pseudo_nmi); static __init void detect_system_supports_pseudo_nmi(void) { struct device_node *np; if (!enable_pseudo_nmi) return; /* * Detect broken MediaTek firmware that doesn't properly save and * restore GIC priorities. */ np = of_find_compatible_node(NULL, NULL, "arm,gic-v3"); if (np && of_property_read_bool(np, "mediatek,broken-save-restore-fw")) { pr_info("Pseudo-NMI disabled due to MediaTek Chromebook GICR save problem\n"); enable_pseudo_nmi = false; } of_node_put(np); } #else /* CONFIG_ARM64_PSEUDO_NMI */ static inline void detect_system_supports_pseudo_nmi(void) { } #endif void __init init_cpu_features(struct cpuinfo_arm64 *info) { /* Before we start using the tables, make sure it is sorted */ sort_ftr_regs(); init_cpu_ftr_reg(SYS_CTR_EL0, info->reg_ctr); init_cpu_ftr_reg(SYS_DCZID_EL0, info->reg_dczid); init_cpu_ftr_reg(SYS_CNTFRQ_EL0, info->reg_cntfrq); init_cpu_ftr_reg(SYS_ID_AA64DFR0_EL1, info->reg_id_aa64dfr0); init_cpu_ftr_reg(SYS_ID_AA64DFR1_EL1, info->reg_id_aa64dfr1); init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0); init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); init_cpu_ftr_reg(SYS_ID_AA64ISAR2_EL1, info->reg_id_aa64isar2); init_cpu_ftr_reg(SYS_ID_AA64ISAR3_EL1, info->reg_id_aa64isar3); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64MMFR3_EL1, info->reg_id_aa64mmfr3); init_cpu_ftr_reg(SYS_ID_AA64MMFR4_EL1, info->reg_id_aa64mmfr4); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); init_cpu_ftr_reg(SYS_ID_AA64PFR2_EL1, info->reg_id_aa64pfr2); init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0); init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0); init_cpu_ftr_reg(SYS_ID_AA64FPFR0_EL1, info->reg_id_aa64fpfr0); if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) init_32bit_cpu_features(&info->aarch32); if (IS_ENABLED(CONFIG_ARM64_SVE) && id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { unsigned long cpacr = cpacr_save_enable_kernel_sve(); vec_init_vq_map(ARM64_VEC_SVE); cpacr_restore(cpacr); } if (IS_ENABLED(CONFIG_ARM64_SME) && id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { unsigned long cpacr = cpacr_save_enable_kernel_sme(); vec_init_vq_map(ARM64_VEC_SME); cpacr_restore(cpacr); } if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); } if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); } static void update_cpu_ftr_reg(struct arm64_ftr_reg *reg, u64 new) { const struct arm64_ftr_bits *ftrp; for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) { s64 ftr_cur = arm64_ftr_value(ftrp, reg->sys_val); s64 ftr_new = arm64_ftr_value(ftrp, new); if (ftr_cur == ftr_new) continue; /* Find a safe value */ ftr_new = arm64_ftr_safe_value(ftrp, ftr_new, ftr_cur); reg->sys_val = arm64_ftr_set_value(ftrp, reg->sys_val, ftr_new); } } static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot) { struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id); if (!regp) return 0; update_cpu_ftr_reg(regp, val); if ((boot & regp->strict_mask) == (val & regp->strict_mask)) return 0; pr_warn("SANITY CHECK: Unexpected variation in %s. Boot CPU: %#016llx, CPU%d: %#016llx\n", regp->name, boot, cpu, val); return 1; } static void relax_cpu_ftr_reg(u32 sys_id, int field) { const struct arm64_ftr_bits *ftrp; struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id); if (!regp) return; for (ftrp = regp->ftr_bits; ftrp->width; ftrp++) { if (ftrp->shift == field) { regp->strict_mask &= ~arm64_ftr_mask(ftrp); break; } } /* Bogus field? */ WARN_ON(!ftrp->width); } static void lazy_init_32bit_cpu_features(struct cpuinfo_arm64 *info, struct cpuinfo_arm64 *boot) { static bool boot_cpu_32bit_regs_overridden = false; if (!allow_mismatched_32bit_el0 || boot_cpu_32bit_regs_overridden) return; if (id_aa64pfr0_32bit_el0(boot->reg_id_aa64pfr0)) return; boot->aarch32 = info->aarch32; init_32bit_cpu_features(&boot->aarch32); boot_cpu_32bit_regs_overridden = true; } static int update_32bit_cpu_features(int cpu, struct cpuinfo_32bit *info, struct cpuinfo_32bit *boot) { int taint = 0; u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); /* * If we don't have AArch32 at EL1, then relax the strictness of * EL1-dependent register fields to avoid spurious sanity check fails. */ if (!id_aa64pfr0_32bit_el1(pfr0)) { relax_cpu_ftr_reg(SYS_ID_ISAR4_EL1, ID_ISAR4_EL1_SMC_SHIFT); relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Virt_frac_SHIFT); relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Sec_frac_SHIFT); relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Virtualization_SHIFT); relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Security_SHIFT); relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_ProgMod_SHIFT); } taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu, info->reg_id_dfr0, boot->reg_id_dfr0); taint |= check_update_ftr_reg(SYS_ID_DFR1_EL1, cpu, info->reg_id_dfr1, boot->reg_id_dfr1); taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu, info->reg_id_isar0, boot->reg_id_isar0); taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu, info->reg_id_isar1, boot->reg_id_isar1); taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu, info->reg_id_isar2, boot->reg_id_isar2); taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu, info->reg_id_isar3, boot->reg_id_isar3); taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu, info->reg_id_isar4, boot->reg_id_isar4); taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu, info->reg_id_isar5, boot->reg_id_isar5); taint |= check_update_ftr_reg(SYS_ID_ISAR6_EL1, cpu, info->reg_id_isar6, boot->reg_id_isar6); /* * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and * ACTLR formats could differ across CPUs and therefore would have to * be trapped for virtualization anyway. */ taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu, info->reg_id_mmfr0, boot->reg_id_mmfr0); taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu, info->reg_id_mmfr1, boot->reg_id_mmfr1); taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu, info->reg_id_mmfr2, boot->reg_id_mmfr2); taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu, info->reg_id_mmfr3, boot->reg_id_mmfr3); taint |= check_update_ftr_reg(SYS_ID_MMFR4_EL1, cpu, info->reg_id_mmfr4, boot->reg_id_mmfr4); taint |= check_update_ftr_reg(SYS_ID_MMFR5_EL1, cpu, info->reg_id_mmfr5, boot->reg_id_mmfr5); taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu, info->reg_id_pfr0, boot->reg_id_pfr0); taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu, info->reg_id_pfr1, boot->reg_id_pfr1); taint |= check_update_ftr_reg(SYS_ID_PFR2_EL1, cpu, info->reg_id_pfr2, boot->reg_id_pfr2); taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu, info->reg_mvfr0, boot->reg_mvfr0); taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu, info->reg_mvfr1, boot->reg_mvfr1); taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu, info->reg_mvfr2, boot->reg_mvfr2); return taint; } /* * Update system wide CPU feature registers with the values from a * non-boot CPU. Also performs SANITY checks to make sure that there * aren't any insane variations from that of the boot CPU. */ void update_cpu_features(int cpu, struct cpuinfo_arm64 *info, struct cpuinfo_arm64 *boot) { int taint = 0; /* * The kernel can handle differing I-cache policies, but otherwise * caches should look identical. Userspace JITs will make use of * *minLine. */ taint |= check_update_ftr_reg(SYS_CTR_EL0, cpu, info->reg_ctr, boot->reg_ctr); /* * Userspace may perform DC ZVA instructions. Mismatched block sizes * could result in too much or too little memory being zeroed if a * process is preempted and migrated between CPUs. */ taint |= check_update_ftr_reg(SYS_DCZID_EL0, cpu, info->reg_dczid, boot->reg_dczid); /* If different, timekeeping will be broken (especially with KVM) */ taint |= check_update_ftr_reg(SYS_CNTFRQ_EL0, cpu, info->reg_cntfrq, boot->reg_cntfrq); /* * The kernel uses self-hosted debug features and expects CPUs to * support identical debug features. We presently need CTX_CMPs, WRPs, * and BRPs to be identical. * ID_AA64DFR1 is currently RES0. */ taint |= check_update_ftr_reg(SYS_ID_AA64DFR0_EL1, cpu, info->reg_id_aa64dfr0, boot->reg_id_aa64dfr0); taint |= check_update_ftr_reg(SYS_ID_AA64DFR1_EL1, cpu, info->reg_id_aa64dfr1, boot->reg_id_aa64dfr1); /* * Even in big.LITTLE, processors should be identical instruction-set * wise. */ taint |= check_update_ftr_reg(SYS_ID_AA64ISAR0_EL1, cpu, info->reg_id_aa64isar0, boot->reg_id_aa64isar0); taint |= check_update_ftr_reg(SYS_ID_AA64ISAR1_EL1, cpu, info->reg_id_aa64isar1, boot->reg_id_aa64isar1); taint |= check_update_ftr_reg(SYS_ID_AA64ISAR2_EL1, cpu, info->reg_id_aa64isar2, boot->reg_id_aa64isar2); taint |= check_update_ftr_reg(SYS_ID_AA64ISAR3_EL1, cpu, info->reg_id_aa64isar3, boot->reg_id_aa64isar3); /* * Differing PARange support is fine as long as all peripherals and * memory are mapped within the minimum PARange of all CPUs. * Linux should not care about secure memory. */ taint |= check_update_ftr_reg(SYS_ID_AA64MMFR0_EL1, cpu, info->reg_id_aa64mmfr0, boot->reg_id_aa64mmfr0); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR1_EL1, cpu, info->reg_id_aa64mmfr1, boot->reg_id_aa64mmfr1); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu, info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR3_EL1, cpu, info->reg_id_aa64mmfr3, boot->reg_id_aa64mmfr3); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR4_EL1, cpu, info->reg_id_aa64mmfr4, boot->reg_id_aa64mmfr4); taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu, info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu, info->reg_id_aa64pfr1, boot->reg_id_aa64pfr1); taint |= check_update_ftr_reg(SYS_ID_AA64PFR2_EL1, cpu, info->reg_id_aa64pfr2, boot->reg_id_aa64pfr2); taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu, info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0); taint |= check_update_ftr_reg(SYS_ID_AA64SMFR0_EL1, cpu, info->reg_id_aa64smfr0, boot->reg_id_aa64smfr0); taint |= check_update_ftr_reg(SYS_ID_AA64FPFR0_EL1, cpu, info->reg_id_aa64fpfr0, boot->reg_id_aa64fpfr0); /* Probe vector lengths */ if (IS_ENABLED(CONFIG_ARM64_SVE) && id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { if (!system_capabilities_finalized()) { unsigned long cpacr = cpacr_save_enable_kernel_sve(); vec_update_vq_map(ARM64_VEC_SVE); cpacr_restore(cpacr); } } if (IS_ENABLED(CONFIG_ARM64_SME) && id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) { unsigned long cpacr = cpacr_save_enable_kernel_sme(); /* Probe vector lengths */ if (!system_capabilities_finalized()) vec_update_vq_map(ARM64_VEC_SME); cpacr_restore(cpacr); } if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, info->reg_mpamidr, boot->reg_mpamidr); } /* * The kernel uses the LDGM/STGM instructions and the number of tags * they read/write depends on the GMID_EL1.BS field. Check that the * value is the same on all CPUs. */ if (IS_ENABLED(CONFIG_ARM64_MTE) && id_aa64pfr1_mte(info->reg_id_aa64pfr1)) { taint |= check_update_ftr_reg(SYS_GMID_EL1, cpu, info->reg_gmid, boot->reg_gmid); } /* * If we don't have AArch32 at all then skip the checks entirely * as the register values may be UNKNOWN and we're not going to be * using them for anything. * * This relies on a sanitised view of the AArch64 ID registers * (e.g. SYS_ID_AA64PFR0_EL1), so we call it last. */ if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { lazy_init_32bit_cpu_features(info, boot); taint |= update_32bit_cpu_features(cpu, &info->aarch32, &boot->aarch32); } /* * Mismatched CPU features are a recipe for disaster. Don't even * pretend to support them. */ if (taint) { pr_warn_once("Unsupported CPU feature variation detected.\n"); add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); } } u64 read_sanitised_ftr_reg(u32 id) { struct arm64_ftr_reg *regp = get_arm64_ftr_reg(id); if (!regp) return 0; return regp->sys_val; } EXPORT_SYMBOL_GPL(read_sanitised_ftr_reg); #define read_sysreg_case(r) \ case r: val = read_sysreg_s(r); break; /* * __read_sysreg_by_encoding() - Used by a STARTING cpu before cpuinfo is populated. * Read the system register on the current CPU */ u64 __read_sysreg_by_encoding(u32 sys_id) { struct arm64_ftr_reg *regp; u64 val; switch (sys_id) { read_sysreg_case(SYS_ID_PFR0_EL1); read_sysreg_case(SYS_ID_PFR1_EL1); read_sysreg_case(SYS_ID_PFR2_EL1); read_sysreg_case(SYS_ID_DFR0_EL1); read_sysreg_case(SYS_ID_DFR1_EL1); read_sysreg_case(SYS_ID_MMFR0_EL1); read_sysreg_case(SYS_ID_MMFR1_EL1); read_sysreg_case(SYS_ID_MMFR2_EL1); read_sysreg_case(SYS_ID_MMFR3_EL1); read_sysreg_case(SYS_ID_MMFR4_EL1); read_sysreg_case(SYS_ID_MMFR5_EL1); read_sysreg_case(SYS_ID_ISAR0_EL1); read_sysreg_case(SYS_ID_ISAR1_EL1); read_sysreg_case(SYS_ID_ISAR2_EL1); read_sysreg_case(SYS_ID_ISAR3_EL1); read_sysreg_case(SYS_ID_ISAR4_EL1); read_sysreg_case(SYS_ID_ISAR5_EL1); read_sysreg_case(SYS_ID_ISAR6_EL1); read_sysreg_case(SYS_MVFR0_EL1); read_sysreg_case(SYS_MVFR1_EL1); read_sysreg_case(SYS_MVFR2_EL1); read_sysreg_case(SYS_ID_AA64PFR0_EL1); read_sysreg_case(SYS_ID_AA64PFR1_EL1); read_sysreg_case(SYS_ID_AA64PFR2_EL1); read_sysreg_case(SYS_ID_AA64ZFR0_EL1); read_sysreg_case(SYS_ID_AA64SMFR0_EL1); read_sysreg_case(SYS_ID_AA64FPFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR0_EL1); read_sysreg_case(SYS_ID_AA64MMFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR2_EL1); read_sysreg_case(SYS_ID_AA64MMFR3_EL1); read_sysreg_case(SYS_ID_AA64MMFR4_EL1); read_sysreg_case(SYS_ID_AA64ISAR0_EL1); read_sysreg_case(SYS_ID_AA64ISAR1_EL1); read_sysreg_case(SYS_ID_AA64ISAR2_EL1); read_sysreg_case(SYS_ID_AA64ISAR3_EL1); read_sysreg_case(SYS_CNTFRQ_EL0); read_sysreg_case(SYS_CTR_EL0); read_sysreg_case(SYS_DCZID_EL0); default: BUG(); return 0; } regp = get_arm64_ftr_reg(sys_id); if (regp) { val &= ~regp->override->mask; val |= (regp->override->val & regp->override->mask); } return val; } #include <linux/irqchip/arm-gic-v3.h> static bool has_always(const struct arm64_cpu_capabilities *entry, int scope) { return true; } static bool feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry) { int val, min, max; u64 tmp; val = cpuid_feature_extract_field_width(reg, entry->field_pos, entry->field_width, entry->sign); tmp = entry->min_field_value; tmp <<= entry->field_pos; min = cpuid_feature_extract_field_width(tmp, entry->field_pos, entry->field_width, entry->sign); tmp = entry->max_field_value; tmp <<= entry->field_pos; max = cpuid_feature_extract_field_width(tmp, entry->field_pos, entry->field_width, entry->sign); return val >= min && val <= max; } static u64 read_scoped_sysreg(const struct arm64_cpu_capabilities *entry, int scope) { WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible()); if (scope == SCOPE_SYSTEM) return read_sanitised_ftr_reg(entry->sys_reg); else return __read_sysreg_by_encoding(entry->sys_reg); } static bool has_user_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) { int mask; struct arm64_ftr_reg *regp; u64 val = read_scoped_sysreg(entry, scope); regp = get_arm64_ftr_reg(entry->sys_reg); if (!regp) return false; mask = cpuid_feature_extract_unsigned_field_width(regp->user_mask, entry->field_pos, entry->field_width); if (!mask) return false; return feature_matches(val, entry); } static bool has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope) { u64 val = read_scoped_sysreg(entry, scope); return feature_matches(val, entry); } const struct cpumask *system_32bit_el0_cpumask(void) { if (!system_supports_32bit_el0()) return cpu_none_mask; if (static_branch_unlikely(&arm64_mismatched_32bit_el0)) return cpu_32bit_el0_mask; return cpu_possible_mask; } const struct cpumask *task_cpu_fallback_mask(struct task_struct *p) { return __task_cpu_possible_mask(p, housekeeping_cpumask(HK_TYPE_TICK)); } static int __init parse_32bit_el0_param(char *str) { allow_mismatched_32bit_el0 = true; return 0; } early_param("allow_mismatched_32bit_el0", parse_32bit_el0_param); static ssize_t aarch32_el0_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct cpumask *mask = system_32bit_el0_cpumask(); return sysfs_emit(buf, "%*pbl\n", cpumask_pr_args(mask)); } static const DEVICE_ATTR_RO(aarch32_el0); static int __init aarch32_el0_sysfs_init(void) { struct device *dev_root; int ret = 0; if (!allow_mismatched_32bit_el0) return 0; dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { ret = device_create_file(dev_root, &dev_attr_aarch32_el0); put_device(dev_root); } return ret; } device_initcall(aarch32_el0_sysfs_init); static bool has_32bit_el0(const struct arm64_cpu_capabilities *entry, int scope) { if (!has_cpuid_feature(entry, scope)) return allow_mismatched_32bit_el0; if (scope == SCOPE_SYSTEM) pr_info("detected: 32-bit EL0 Support\n"); return true; } static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, int scope) { bool has_sre; if (!has_cpuid_feature(entry, scope)) return false; has_sre = gic_enable_sre(); if (!has_sre) pr_warn_once("%s present but disabled by higher exception level\n", entry->desc); return has_sre; } static bool has_cache_idc(const struct arm64_cpu_capabilities *entry, int scope) { u64 ctr; if (scope == SCOPE_SYSTEM) ctr = arm64_ftr_reg_ctrel0.sys_val; else ctr = read_cpuid_effective_cachetype(); return ctr & BIT(CTR_EL0_IDC_SHIFT); } static void cpu_emulate_effective_ctr(const struct arm64_cpu_capabilities *__unused) { /* * If the CPU exposes raw CTR_EL0.IDC = 0, while effectively * CTR_EL0.IDC = 1 (from CLIDR values), we need to trap accesses * to the CTR_EL0 on this CPU and emulate it with the real/safe * value. */ if (!(read_cpuid_cachetype() & BIT(CTR_EL0_IDC_SHIFT))) sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); } static bool has_cache_dic(const struct arm64_cpu_capabilities *entry, int scope) { u64 ctr; if (scope == SCOPE_SYSTEM) ctr = arm64_ftr_reg_ctrel0.sys_val; else ctr = read_cpuid_cachetype(); return ctr & BIT(CTR_EL0_DIC_SHIFT); } static bool __maybe_unused has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) { /* * Kdump isn't guaranteed to power-off all secondary CPUs, CNP * may share TLB entries with a CPU stuck in the crashed * kernel. */ if (is_kdump_kernel()) return false; if (cpus_have_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP)) return false; return has_cpuid_feature(entry, scope); } static bool __meltdown_safe = true; static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, int scope) { /* List of CPUs that are not vulnerable and don't need KPTI */ static const struct midr_range kpti_safe_list[] = { MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_GOLD), MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), { /* sentinel */ } }; char const *str = "kpti command line option"; bool meltdown_safe; meltdown_safe = is_midr_in_range_list(kpti_safe_list); /* Defer to CPU feature registers */ if (has_cpuid_feature(entry, scope)) meltdown_safe = true; if (!meltdown_safe) __meltdown_safe = false; /* * For reasons that aren't entirely clear, enabling KPTI on Cavium * ThunderX leads to apparent I-cache corruption of kernel text, which * ends as well as you might imagine. Don't even try. We cannot rely * on the cpus_have_*cap() helpers here to detect the CPU erratum * because cpucap detection order may change. However, since we know * affected CPUs are always in a homogeneous configuration, it is * safe to rely on this_cpu_has_cap() here. */ if (this_cpu_has_cap(ARM64_WORKAROUND_CAVIUM_27456)) { str = "ARM64_WORKAROUND_CAVIUM_27456"; __kpti_forced = -1; } /* Useful for KASLR robustness */ if (kaslr_enabled() && kaslr_requires_kpti()) { if (!__kpti_forced) { str = "KASLR"; __kpti_forced = 1; } } if (cpu_mitigations_off() && !__kpti_forced) { str = "mitigations=off"; __kpti_forced = -1; } if (!IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) { pr_info_once("kernel page table isolation disabled by kernel configuration\n"); return false; } /* Forced? */ if (__kpti_forced) { pr_info_once("kernel page table isolation forced %s by %s\n", __kpti_forced > 0 ? "ON" : "OFF", str); return __kpti_forced > 0; } return !meltdown_safe; } static bool has_nv1(const struct arm64_cpu_capabilities *entry, int scope) { /* * Although the Apple M2 family appears to support NV1, the * PTW barfs on the nVHE EL2 S1 page table format. Pretend * that it doesn't support NV1 at all. */ static const struct midr_range nv1_ni_list[] = { MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), {} }; return (__system_matches_cap(ARM64_HAS_NESTED_VIRT) && !(has_cpuid_feature(entry, scope) || is_midr_in_range_list(nv1_ni_list))); } #if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2) static bool has_lpa2_at_stage1(u64 mmfr0) { unsigned int tgran; tgran = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN_SHIFT); return tgran == ID_AA64MMFR0_EL1_TGRAN_LPA2; } static bool has_lpa2_at_stage2(u64 mmfr0) { unsigned int tgran; tgran = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN_2_SHIFT); return tgran == ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2; } static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) { u64 mmfr0; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); return has_lpa2_at_stage1(mmfr0) && has_lpa2_at_stage2(mmfr0); } #else static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) { return false; } #endif #ifdef CONFIG_HW_PERF_EVENTS static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope) { u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); unsigned int pmuver; /* * PMUVer follows the standard ID scheme for an unsigned field with the * exception of 0xF (IMP_DEF) which is treated specially and implies * FEAT_PMUv3 is not implemented. * * See DDI0487L.a D24.1.3.2 for more details. */ pmuver = cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT); if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) return false; return pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP; } #endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 #define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT)) extern void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags); static phys_addr_t __initdata kpti_ng_temp_alloc; static phys_addr_t __init kpti_ng_pgd_alloc(int shift) { kpti_ng_temp_alloc -= PAGE_SIZE; return kpti_ng_temp_alloc; } static int __init __kpti_install_ng_mappings(void *__unused) { typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long); extern kpti_remap_fn idmap_kpti_install_ng_mappings; kpti_remap_fn *remap_fn; int cpu = smp_processor_id(); int levels = CONFIG_PGTABLE_LEVELS; int order = order_base_2(levels); u64 kpti_ng_temp_pgd_pa = 0; pgd_t *kpti_ng_temp_pgd; u64 alloc = 0; if (levels == 5 && !pgtable_l5_enabled()) levels = 4; else if (levels == 4 && !pgtable_l4_enabled()) levels = 3; remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); if (!cpu) { alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE); kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd); // // Create a minimal page table hierarchy that permits us to map // the swapper page tables temporarily as we traverse them. // // The physical pages are laid out as follows: // // +--------+-/-------+-/------ +-/------ +-\\\--------+ // : PTE[] : | PMD[] : | PUD[] : | P4D[] : ||| PGD[] : // +--------+-\-------+-\------ +-\------ +-///--------+ // ^ // The first page is mapped into this hierarchy at a PMD_SHIFT // aligned virtual address, so that we can manipulate the PTE // level entries while the mapping is active. The first entry // covers the PTE[] page itself, the remaining entries are free // to be used as a ad-hoc fixmap. // create_kpti_ng_temp_pgd(kpti_ng_temp_pgd, __pa(alloc), KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL, kpti_ng_pgd_alloc, 0); } cpu_install_idmap(); remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA); cpu_uninstall_idmap(); if (!cpu) { free_pages(alloc, order); arm64_use_ng_mappings = true; } return 0; } static void __init kpti_install_ng_mappings(void) { /* Check whether KPTI is going to be used */ if (!arm64_kernel_unmapped_at_el0()) return; /* * We don't need to rewrite the page-tables if either we've done * it already or we have KASLR enabled and therefore have not * created any global mappings at all. */ if (arm64_use_ng_mappings) return; stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask); } #else static inline void kpti_install_ng_mappings(void) { } #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ static void cpu_enable_kpti(struct arm64_cpu_capabilities const *cap) { if (__this_cpu_read(this_cpu_vector) == vectors) { const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI); __this_cpu_write(this_cpu_vector, v); } } static int __init parse_kpti(char *str) { bool enabled; int ret = kstrtobool(str, &enabled); if (ret) return ret; __kpti_forced = enabled ? 1 : -1; return 0; } early_param("kpti", parse_kpti); #ifdef CONFIG_ARM64_HW_AFDBM static struct cpumask dbm_cpus __read_mostly; static inline void __cpu_enable_hw_dbm(void) { u64 tcr = read_sysreg(tcr_el1) | TCR_HD; write_sysreg(tcr, tcr_el1); isb(); local_flush_tlb_all(); } static bool cpu_has_broken_dbm(void) { /* List of CPUs which have broken DBM support. */ static const struct midr_range cpus[] = { #ifdef CONFIG_ARM64_ERRATUM_1024718 MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), /* Kryo4xx Silver (rdpe => r1p0) */ MIDR_REV(MIDR_QCOM_KRYO_4XX_SILVER, 0xd, 0xe), #endif #ifdef CONFIG_ARM64_ERRATUM_2051678 MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2), #endif {}, }; return is_midr_in_range_list(cpus); } static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap) { return has_cpuid_feature(cap, SCOPE_LOCAL_CPU) && !cpu_has_broken_dbm(); } static void cpu_enable_hw_dbm(struct arm64_cpu_capabilities const *cap) { if (cpu_can_use_dbm(cap)) { __cpu_enable_hw_dbm(); cpumask_set_cpu(smp_processor_id(), &dbm_cpus); } } static bool has_hw_dbm(const struct arm64_cpu_capabilities *cap, int __unused) { /* * DBM is a non-conflicting feature. i.e, the kernel can safely * run a mix of CPUs with and without the feature. So, we * unconditionally enable the capability to allow any late CPU * to use the feature. We only enable the control bits on the * CPU, if it is supported. */ return true; } #endif #ifdef CONFIG_ARM64_AMU_EXTN /* * The "amu_cpus" cpumask only signals that the CPU implementation for the * flagged CPUs supports the Activity Monitors Unit (AMU) but does not provide * information regarding all the events that it supports. When a CPU bit is * set in the cpumask, the user of this feature can only rely on the presence * of the 4 fixed counters for that CPU. But this does not guarantee that the * counters are enabled or access to these counters is enabled by code * executed at higher exception levels (firmware). */ static struct cpumask amu_cpus __read_mostly; bool cpu_has_amu_feat(int cpu) { return cpumask_test_cpu(cpu, &amu_cpus); } int get_cpu_with_amu_feat(void) { return cpumask_any(&amu_cpus); } static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap) { if (has_cpuid_feature(cap, SCOPE_LOCAL_CPU)) { cpumask_set_cpu(smp_processor_id(), &amu_cpus); /* 0 reference values signal broken/disabled counters */ if (!this_cpu_has_cap(ARM64_WORKAROUND_2457168)) update_freq_counters_refs(); } } static bool has_amu(const struct arm64_cpu_capabilities *cap, int __unused) { /* * The AMU extension is a non-conflicting feature: the kernel can * safely run a mix of CPUs with and without support for the * activity monitors extension. Therefore, unconditionally enable * the capability to allow any late CPU to use the feature. * * With this feature unconditionally enabled, the cpu_enable * function will be called for all CPUs that match the criteria, * including secondary and hotplugged, marking this feature as * present on that respective CPU. The enable function will also * print a detection message. */ return true; } #else int get_cpu_with_amu_feat(void) { return nr_cpu_ids; } #endif static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused) { return is_kernel_in_hyp_mode(); } static void cpu_copy_el2regs(const struct arm64_cpu_capabilities *__unused) { /* * Copy register values that aren't redirected by hardware. * * Before code patching, we only set tpidr_el1, all CPUs need to copy * this value to tpidr_el2 before we patch the code. Once we've done * that, freshly-onlined CPUs will set tpidr_el2, so we don't need to * do anything here. */ if (!alternative_is_applied(ARM64_HAS_VIRT_HOST_EXTN)) write_sysreg(read_sysreg(tpidr_el1), tpidr_el2); } static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap, int scope) { if (kvm_get_mode() != KVM_MODE_NV) return false; if (!cpucap_multi_entry_cap_matches(cap, scope)) { pr_warn("unavailable: %s\n", cap->desc); return false; } return true; } static bool hvhe_possible(const struct arm64_cpu_capabilities *entry, int __unused) { return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE); } #ifdef CONFIG_ARM64_PAN static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) { /* * We modify PSTATE. This won't work from irq context as the PSTATE * is discarded once we return from the exception. */ WARN_ON_ONCE(in_interrupt()); sysreg_clear_set(sctlr_el1, SCTLR_EL1_SPAN, 0); set_pstate_pan(1); } #endif /* CONFIG_ARM64_PAN */ #ifdef CONFIG_ARM64_RAS_EXTN static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused) { /* Firmware may have left a deferred SError in this register. */ write_sysreg_s(0, SYS_DISR_EL1); } #endif /* CONFIG_ARM64_RAS_EXTN */ #ifdef CONFIG_ARM64_PTR_AUTH static bool has_address_auth_cpucap(const struct arm64_cpu_capabilities *entry, int scope) { int boot_val, sec_val; /* We don't expect to be called with SCOPE_SYSTEM */ WARN_ON(scope == SCOPE_SYSTEM); /* * The ptr-auth feature levels are not intercompatible with lower * levels. Hence we must match ptr-auth feature level of the secondary * CPUs with that of the boot CPU. The level of boot cpu is fetched * from the sanitised register whereas direct register read is done for * the secondary CPUs. * The sanitised feature state is guaranteed to match that of the * boot CPU as a mismatched secondary CPU is parked before it gets * a chance to update the state, with the capability. */ boot_val = cpuid_feature_extract_field(read_sanitised_ftr_reg(entry->sys_reg), entry->field_pos, entry->sign); if (scope & SCOPE_BOOT_CPU) return boot_val >= entry->min_field_value; /* Now check for the secondary CPUs with SCOPE_LOCAL_CPU scope */ sec_val = cpuid_feature_extract_field(__read_sysreg_by_encoding(entry->sys_reg), entry->field_pos, entry->sign); return (sec_val >= entry->min_field_value) && (sec_val == boot_val); } static bool has_address_auth_metacap(const struct arm64_cpu_capabilities *entry, int scope) { bool api = has_address_auth_cpucap(cpucap_ptrs[ARM64_HAS_ADDRESS_AUTH_IMP_DEF], scope); bool apa = has_address_auth_cpucap(cpucap_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA5], scope); bool apa3 = has_address_auth_cpucap(cpucap_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA3], scope); return apa || apa3 || api; } static bool has_generic_auth(const struct arm64_cpu_capabilities *entry, int __unused) { bool gpi = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_IMP_DEF); bool gpa = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH_QARMA5); bool gpa3 = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH_QARMA3); return gpa || gpa3 || gpi; } #endif /* CONFIG_ARM64_PTR_AUTH */ #ifdef CONFIG_ARM64_E0PD static void cpu_enable_e0pd(struct arm64_cpu_capabilities const *cap) { if (this_cpu_has_cap(ARM64_HAS_E0PD)) sysreg_clear_set(tcr_el1, 0, TCR_E0PD1); } #endif /* CONFIG_ARM64_E0PD */ #ifdef CONFIG_ARM64_PSEUDO_NMI static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry, int scope) { /* * ARM64_HAS_GICV3_CPUIF has a lower index, and is a boot CPU * feature, so will be detected earlier. */ BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_MASKING <= ARM64_HAS_GICV3_CPUIF); if (!cpus_have_cap(ARM64_HAS_GICV3_CPUIF)) return false; return enable_pseudo_nmi; } static bool has_gic_prio_relaxed_sync(const struct arm64_cpu_capabilities *entry, int scope) { /* * If we're not using priority masking then we won't be poking PMR_EL1, * and there's no need to relax synchronization of writes to it, and * ICC_CTLR_EL1 might not be accessible and we must avoid reads from * that. * * ARM64_HAS_GIC_PRIO_MASKING has a lower index, and is a boot CPU * feature, so will be detected earlier. */ BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_RELAXED_SYNC <= ARM64_HAS_GIC_PRIO_MASKING); if (!cpus_have_cap(ARM64_HAS_GIC_PRIO_MASKING)) return false; /* * When Priority Mask Hint Enable (PMHE) == 0b0, PMR is not used as a * hint for interrupt distribution, a DSB is not necessary when * unmasking IRQs via PMR, and we can relax the barrier to a NOP. * * Linux itself doesn't use 1:N distribution, so has no need to * set PMHE. The only reason to have it set is if EL3 requires it * (and we can't change it). */ return (gic_read_ctlr() & ICC_CTLR_EL1_PMHE_MASK) == 0; } #endif #ifdef CONFIG_ARM64_BTI static void bti_enable(const struct arm64_cpu_capabilities *__unused) { /* * Use of X16/X17 for tail-calls and trampolines that jump to * function entry points using BR is a requirement for * marking binaries with GNU_PROPERTY_AARCH64_FEATURE_1_BTI. * So, be strict and forbid other BRs using other registers to * jump onto a PACIxSP instruction: */ sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_BT0 | SCTLR_EL1_BT1); isb(); } #endif /* CONFIG_ARM64_BTI */ #ifdef CONFIG_ARM64_MTE static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) { sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ATA | SCTLR_EL1_ATA0); mte_cpu_setup(); /* * Clear the tags in the zero page. This needs to be done via the * linear map which has the Tagged attribute. */ if (try_page_mte_tagging(ZERO_PAGE(0))) { mte_clear_page_tags(lm_alias(empty_zero_page)); set_page_mte_tagged(ZERO_PAGE(0)); } kasan_init_hw_tags_cpu(); } #endif /* CONFIG_ARM64_MTE */ static void user_feature_fixup(void) { if (cpus_have_cap(ARM64_WORKAROUND_2658417)) { struct arm64_ftr_reg *regp; regp = get_arm64_ftr_reg(SYS_ID_AA64ISAR1_EL1); if (regp) regp->user_mask &= ~ID_AA64ISAR1_EL1_BF16_MASK; } if (cpus_have_cap(ARM64_WORKAROUND_SPECULATIVE_SSBS)) { struct arm64_ftr_reg *regp; regp = get_arm64_ftr_reg(SYS_ID_AA64PFR1_EL1); if (regp) regp->user_mask &= ~ID_AA64PFR1_EL1_SSBS_MASK; } } static void elf_hwcap_fixup(void) { #ifdef CONFIG_COMPAT if (cpus_have_cap(ARM64_WORKAROUND_1742098)) compat_elf_hwcap2 &= ~COMPAT_HWCAP2_AES; #endif /* CONFIG_COMPAT */ } #ifdef CONFIG_KVM static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused) { return kvm_get_mode() == KVM_MODE_PROTECTED; } #endif /* CONFIG_KVM */ static void cpu_trap_el0_impdef(const struct arm64_cpu_capabilities *__unused) { sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_TIDCP); } static void cpu_enable_dit(const struct arm64_cpu_capabilities *__unused) { set_pstate_dit(1); } static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused) { sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn); } #ifdef CONFIG_ARM64_POE static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) { sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1_E0POE); sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_E0POE); } #endif #ifdef CONFIG_ARM64_GCS static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused) { /* GCSPR_EL0 is always readable */ write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); } #endif /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) { return !!(cap->type & ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU); } static bool cpucap_late_cpu_permitted(const struct arm64_cpu_capabilities *cap) { return !!(cap->type & ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU); } static bool cpucap_panic_on_conflict(const struct arm64_cpu_capabilities *cap) { return !!(cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT); } static bool test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) { if (!has_cpuid_feature(entry, scope)) return false; /* Check firmware actually enabled MPAM on this cpu. */ return (read_sysreg_s(SYS_MPAM1_EL1) & MPAM1_EL1_MPAMEN); } static void cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) { /* * Access by the kernel (at EL1) should use the reserved PARTID * which is configured unrestricted. This avoids priority-inversion * where latency sensitive tasks have to wait for a task that has * been throttled to release the lock. */ write_sysreg_s(0, SYS_MPAM1_EL1); } static bool test_has_mpam_hcr(const struct arm64_cpu_capabilities *entry, int scope) { u64 idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); return idr & MPAMIDR_EL1_HAS_HCR; } static const struct arm64_cpu_capabilities arm64_features[] = { { .capability = ARM64_ALWAYS_BOOT, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_always, }, { .capability = ARM64_ALWAYS_SYSTEM, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_always, }, { .desc = "GICv3 CPU interface", .capability = ARM64_HAS_GICV3_CPUIF, .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_useable_gicv3_cpuif, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, GIC, IMP) }, { .desc = "Enhanced Counter Virtualization", .capability = ARM64_HAS_ECV, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, ECV, IMP) }, { .desc = "Enhanced Counter Virtualization (CNTPOFF)", .capability = ARM64_HAS_ECV_CNTPOFF, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, ECV, CNTPOFF) }, #ifdef CONFIG_ARM64_PAN { .desc = "Privileged Access Never", .capability = ARM64_HAS_PAN, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .cpu_enable = cpu_enable_pan, ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, PAN, IMP) }, #endif /* CONFIG_ARM64_PAN */ #ifdef CONFIG_ARM64_EPAN { .desc = "Enhanced Privileged Access Never", .capability = ARM64_HAS_EPAN, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, PAN, PAN3) }, #endif /* CONFIG_ARM64_EPAN */ #ifdef CONFIG_ARM64_LSE_ATOMICS { .desc = "LSE atomic instructions", .capability = ARM64_HAS_LSE_ATOMICS, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, ATOMIC, IMP) }, #endif /* CONFIG_ARM64_LSE_ATOMICS */ { .desc = "Virtualization Host Extensions", .capability = ARM64_HAS_VIRT_HOST_EXTN, .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = runs_at_el2, .cpu_enable = cpu_copy_el2regs, }, { .desc = "Nested Virtualization Support", .capability = ARM64_HAS_NESTED_VIRT, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_nested_virt_support, .match_list = (const struct arm64_cpu_capabilities []){ { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2) }, { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY) }, { /* Sentinel */ } }, }, { .capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_32bit_el0, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, EL0, AARCH32) }, #ifdef CONFIG_KVM { .desc = "32-bit EL1 Support", .capability = ARM64_HAS_32BIT_EL1, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, EL1, AARCH32) }, { .desc = "Protected KVM", .capability = ARM64_KVM_PROTECTED_MODE, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = is_kvm_protected_mode, }, { .desc = "HCRX_EL2 register", .capability = ARM64_HAS_HCX, .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HCX, IMP) }, #endif { .desc = "Kernel page table isolation (KPTI)", .capability = ARM64_UNMAP_KERNEL_AT_EL0, .type = ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE, .cpu_enable = cpu_enable_kpti, .matches = unmap_kernel_at_el0, /* * The ID feature fields below are used to indicate that * the CPU doesn't need KPTI. See unmap_kernel_at_el0 for * more details. */ ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, CSV3, IMP) }, { .capability = ARM64_HAS_FPSIMD, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .cpu_enable = cpu_enable_fpsimd, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, FP, IMP) }, #ifdef CONFIG_ARM64_PMEM { .desc = "Data cache clean to Point of Persistence", .capability = ARM64_HAS_DCPOP, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, DPB, IMP) }, { .desc = "Data cache clean to Point of Deep Persistence", .capability = ARM64_HAS_DCPODP, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, DPB, DPB2) }, #endif #ifdef CONFIG_ARM64_SVE { .desc = "Scalable Vector Extension", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_SVE, .cpu_enable = cpu_enable_sve, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, SVE, IMP) }, #endif /* CONFIG_ARM64_SVE */ #ifdef CONFIG_ARM64_RAS_EXTN { .desc = "RAS Extension Support", .capability = ARM64_HAS_RAS_EXTN, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .cpu_enable = cpu_clear_disr, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP) }, #endif /* CONFIG_ARM64_RAS_EXTN */ #ifdef CONFIG_ARM64_AMU_EXTN { .desc = "Activity Monitors Unit (AMU)", .capability = ARM64_HAS_AMU_EXTN, .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, .matches = has_amu, .cpu_enable = cpu_amu_enable, .cpus = &amu_cpus, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, AMU, IMP) }, #endif /* CONFIG_ARM64_AMU_EXTN */ { .desc = "Data cache clean to the PoU not required for I/D coherence", .capability = ARM64_HAS_CACHE_IDC, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cache_idc, .cpu_enable = cpu_emulate_effective_ctr, }, { .desc = "Instruction cache invalidation not required for I/D coherence", .capability = ARM64_HAS_CACHE_DIC, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cache_dic, }, { .desc = "Stage-2 Force Write-Back", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_HAS_STAGE2_FWB, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, FWB, IMP) }, { .desc = "ARMv8.4 Translation Table Level", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_HAS_ARMv8_4_TTL, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, TTL, IMP) }, { .desc = "TLB range maintenance instructions", .capability = ARM64_HAS_TLB_RANGE, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, TLB, RANGE) }, #ifdef CONFIG_ARM64_HW_AFDBM { .desc = "Hardware dirty bit management", .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, .capability = ARM64_HW_DBM, .matches = has_hw_dbm, .cpu_enable = cpu_enable_hw_dbm, .cpus = &dbm_cpus, ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM) }, #endif #ifdef CONFIG_ARM64_HAFT { .desc = "Hardware managed Access Flag for Table Descriptors", /* * Contrary to the page/block access flag, the table access flag * cannot be emulated in software (no access fault will occur). * Therefore this should be used only if it's supported system * wide. */ .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_HAFT, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, HAFT) }, #endif { .desc = "CRC32 instructions", .capability = ARM64_HAS_CRC32, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, CRC32, IMP) }, { .desc = "Speculative Store Bypassing Safe (SSBS)", .capability = ARM64_SSBS, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SSBS, IMP) }, #ifdef CONFIG_ARM64_CNP { .desc = "Common not Private translations", .capability = ARM64_HAS_CNP, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_useable_cnp, .cpu_enable = cpu_enable_cnp, ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, CnP, IMP) }, #endif { .desc = "Speculation barrier (SB)", .capability = ARM64_HAS_SB, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, SB, IMP) }, #ifdef CONFIG_ARM64_PTR_AUTH { .desc = "Address authentication (architected QARMA5 algorithm)", .capability = ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA5, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_address_auth_cpucap, ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, APA, PAuth) }, { .desc = "Address authentication (architected QARMA3 algorithm)", .capability = ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA3, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_address_auth_cpucap, ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, APA3, PAuth) }, { .desc = "Address authentication (IMP DEF algorithm)", .capability = ARM64_HAS_ADDRESS_AUTH_IMP_DEF, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_address_auth_cpucap, ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, API, PAuth) }, { .capability = ARM64_HAS_ADDRESS_AUTH, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_address_auth_metacap, }, { .desc = "Generic authentication (architected QARMA5 algorithm)", .capability = ARM64_HAS_GENERIC_AUTH_ARCH_QARMA5, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, GPA, IMP) }, { .desc = "Generic authentication (architected QARMA3 algorithm)", .capability = ARM64_HAS_GENERIC_AUTH_ARCH_QARMA3, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, GPA3, IMP) }, { .desc = "Generic authentication (IMP DEF algorithm)", .capability = ARM64_HAS_GENERIC_AUTH_IMP_DEF, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, GPI, IMP) }, { .capability = ARM64_HAS_GENERIC_AUTH, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_generic_auth, }, #endif /* CONFIG_ARM64_PTR_AUTH */ #ifdef CONFIG_ARM64_PSEUDO_NMI { /* * Depends on having GICv3 */ .desc = "IRQ priority masking", .capability = ARM64_HAS_GIC_PRIO_MASKING, .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = can_use_gic_priorities, }, { /* * Depends on ARM64_HAS_GIC_PRIO_MASKING */ .capability = ARM64_HAS_GIC_PRIO_RELAXED_SYNC, .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_gic_prio_relaxed_sync, }, #endif #ifdef CONFIG_ARM64_E0PD { .desc = "E0PD", .capability = ARM64_HAS_E0PD, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .cpu_enable = cpu_enable_e0pd, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, E0PD, IMP) }, #endif { .desc = "Random Number Generator", .capability = ARM64_HAS_RNG, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, RNDR, IMP) }, #ifdef CONFIG_ARM64_BTI { .desc = "Branch Target Identification", .capability = ARM64_BTI, #ifdef CONFIG_ARM64_BTI_KERNEL .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, #else .type = ARM64_CPUCAP_SYSTEM_FEATURE, #endif .matches = has_cpuid_feature, .cpu_enable = bti_enable, ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, BT, IMP) }, #endif #ifdef CONFIG_ARM64_MTE { .desc = "Memory Tagging Extension", .capability = ARM64_MTE, .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, .cpu_enable = cpu_enable_mte, ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, MTE, MTE2) }, { .desc = "Asymmetric MTE Tag Check Fault", .capability = ARM64_MTE_ASYMM, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, MTE, MTE3) }, #endif /* CONFIG_ARM64_MTE */ { .desc = "RCpc load-acquire (LDAPR)", .capability = ARM64_HAS_LDAPR, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, LRCPC, IMP) }, { .desc = "Fine Grained Traps", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_HAS_FGT, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, IMP) }, { .desc = "Fine Grained Traps 2", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_HAS_FGT2, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, FGT2) }, #ifdef CONFIG_ARM64_SME { .desc = "Scalable Matrix Extension", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_SME, .matches = has_cpuid_feature, .cpu_enable = cpu_enable_sme, ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SME, IMP) }, /* FA64 should be sorted after the base SME capability */ { .desc = "FA64", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_SME_FA64, .matches = has_cpuid_feature, .cpu_enable = cpu_enable_fa64, ARM64_CPUID_FIELDS(ID_AA64SMFR0_EL1, FA64, IMP) }, { .desc = "SME2", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_SME2, .matches = has_cpuid_feature, .cpu_enable = cpu_enable_sme2, ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SME, SME2) }, #endif /* CONFIG_ARM64_SME */ { .desc = "WFx with timeout", .capability = ARM64_HAS_WFXT, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, WFxT, IMP) }, { .desc = "Trap EL0 IMPLEMENTATION DEFINED functionality", .capability = ARM64_HAS_TIDCP1, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .cpu_enable = cpu_trap_el0_impdef, ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, TIDCP1, IMP) }, { .desc = "Data independent timing control (DIT)", .capability = ARM64_HAS_DIT, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .cpu_enable = cpu_enable_dit, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, DIT, IMP) }, { .desc = "Memory Copy and Memory Set instructions", .capability = ARM64_HAS_MOPS, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .cpu_enable = cpu_enable_mops, ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, MOPS, IMP) }, { .capability = ARM64_HAS_TCR2, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, TCRX, IMP) }, { .desc = "Stage-1 Permission Indirection Extension (S1PIE)", .capability = ARM64_HAS_S1PIE, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1PIE, IMP) }, { .desc = "VHE for hypervisor only", .capability = ARM64_KVM_HVHE, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = hvhe_possible, }, { .desc = "Enhanced Virtualization Traps", .capability = ARM64_HAS_EVT, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP) }, { .desc = "52-bit Virtual Addressing for KVM (LPA2)", .capability = ARM64_HAS_LPA2, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_lpa2, }, { .desc = "FPMR", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_HAS_FPMR, .matches = has_cpuid_feature, .cpu_enable = cpu_enable_fpmr, ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, FPMR, IMP) }, #ifdef CONFIG_ARM64_VA_BITS_52 { .capability = ARM64_HAS_VA52, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, #ifdef CONFIG_ARM64_64K_PAGES .desc = "52-bit Virtual Addressing (LVA)", ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, VARange, 52) #else .desc = "52-bit Virtual Addressing (LPA2)", #ifdef CONFIG_ARM64_4K_PAGES ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN4, 52_BIT) #else ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN16, 52_BIT) #endif #endif }, #endif { .desc = "Memory Partitioning And Monitoring", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_MPAM, .matches = test_has_mpam, .cpu_enable = cpu_enable_mpam, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, MPAM, 1) }, { .desc = "Memory Partitioning And Monitoring Virtualisation", .type = ARM64_CPUCAP_SYSTEM_FEATURE, .capability = ARM64_MPAM_HCR, .matches = test_has_mpam_hcr, }, { .desc = "NV1", .capability = ARM64_HAS_HCR_NV1, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_nv1, ARM64_CPUID_FIELDS_NEG(ID_AA64MMFR4_EL1, E2H0, NI_NV1) }, #ifdef CONFIG_ARM64_POE { .desc = "Stage-1 Permission Overlay Extension (S1POE)", .capability = ARM64_HAS_S1POE, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, .cpu_enable = cpu_enable_poe, ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) }, #endif #ifdef CONFIG_ARM64_GCS { .desc = "Guarded Control Stack (GCS)", .capability = ARM64_HAS_GCS, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .cpu_enable = cpu_enable_gcs, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP) }, #endif #ifdef CONFIG_HW_PERF_EVENTS { .desc = "PMUv3", .capability = ARM64_HAS_PMUV3, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_pmuv3, }, #endif { .desc = "SCTLR2", .capability = ARM64_HAS_SCTLR2, .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, SCTLRX, IMP) }, { .desc = "GICv5 CPU interface", .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, .capability = ARM64_HAS_GICV5_CPUIF, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, GCIE, IMP) }, {}, }; #define HWCAP_CPUID_MATCH(reg, field, min_value) \ .matches = has_user_cpuid_feature, \ ARM64_CPUID_FIELDS(reg, field, min_value) #define __HWCAP_CAP(name, cap_type, cap) \ .desc = name, \ .type = ARM64_CPUCAP_SYSTEM_FEATURE, \ .hwcap_type = cap_type, \ .hwcap = cap, \ #define HWCAP_CAP(reg, field, min_value, cap_type, cap) \ { \ __HWCAP_CAP(#cap, cap_type, cap) \ HWCAP_CPUID_MATCH(reg, field, min_value) \ } #define HWCAP_MULTI_CAP(list, cap_type, cap) \ { \ __HWCAP_CAP(#cap, cap_type, cap) \ .matches = cpucap_multi_entry_cap_matches, \ .match_list = list, \ } #define HWCAP_CAP_MATCH(match, cap_type, cap) \ { \ __HWCAP_CAP(#cap, cap_type, cap) \ .matches = match, \ } #define HWCAP_CAP_MATCH_ID(match, reg, field, min_value, cap_type, cap) \ { \ __HWCAP_CAP(#cap, cap_type, cap) \ HWCAP_CPUID_MATCH(reg, field, min_value) \ .matches = match, \ } #ifdef CONFIG_ARM64_PTR_AUTH static const struct arm64_cpu_capabilities ptr_auth_hwcap_addr_matches[] = { { HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, APA, PAuth) }, { HWCAP_CPUID_MATCH(ID_AA64ISAR2_EL1, APA3, PAuth) }, { HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, API, PAuth) }, {}, }; static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = { { HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, GPA, IMP) }, { HWCAP_CPUID_MATCH(ID_AA64ISAR2_EL1, GPA3, IMP) }, { HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, GPI, IMP) }, {}, }; #endif #ifdef CONFIG_ARM64_SVE static bool has_sve_feature(const struct arm64_cpu_capabilities *cap, int scope) { return system_supports_sve() && has_user_cpuid_feature(cap, scope); } #endif static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, AES, PMULL, CAP_HWCAP, KERNEL_HWCAP_PMULL), HWCAP_CAP(ID_AA64ISAR0_EL1, AES, AES, CAP_HWCAP, KERNEL_HWCAP_AES), HWCAP_CAP(ID_AA64ISAR0_EL1, SHA1, IMP, CAP_HWCAP, KERNEL_HWCAP_SHA1), HWCAP_CAP(ID_AA64ISAR0_EL1, SHA2, SHA256, CAP_HWCAP, KERNEL_HWCAP_SHA2), HWCAP_CAP(ID_AA64ISAR0_EL1, SHA2, SHA512, CAP_HWCAP, KERNEL_HWCAP_SHA512), HWCAP_CAP(ID_AA64ISAR0_EL1, CRC32, IMP, CAP_HWCAP, KERNEL_HWCAP_CRC32), HWCAP_CAP(ID_AA64ISAR0_EL1, ATOMIC, IMP, CAP_HWCAP, KERNEL_HWCAP_ATOMICS), HWCAP_CAP(ID_AA64ISAR0_EL1, ATOMIC, FEAT_LSE128, CAP_HWCAP, KERNEL_HWCAP_LSE128), HWCAP_CAP(ID_AA64ISAR0_EL1, RDM, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM), HWCAP_CAP(ID_AA64ISAR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SHA3), HWCAP_CAP(ID_AA64ISAR0_EL1, SM3, IMP, CAP_HWCAP, KERNEL_HWCAP_SM3), HWCAP_CAP(ID_AA64ISAR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SM4), HWCAP_CAP(ID_AA64ISAR0_EL1, DP, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), HWCAP_CAP(ID_AA64ISAR0_EL1, FHM, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM, CAP_HWCAP, KERNEL_HWCAP_FLAGM), HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), HWCAP_CAP(ID_AA64ISAR0_EL1, RNDR, IMP, CAP_HWCAP, KERNEL_HWCAP_RNG), HWCAP_CAP(ID_AA64ISAR3_EL1, FPRCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_FPRCVT), HWCAP_CAP(ID_AA64PFR0_EL1, FP, IMP, CAP_HWCAP, KERNEL_HWCAP_FP), HWCAP_CAP(ID_AA64PFR0_EL1, FP, FP16, CAP_HWCAP, KERNEL_HWCAP_FPHP), HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMD), HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, FP16, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP), HWCAP_CAP(ID_AA64PFR0_EL1, DIT, IMP, CAP_HWCAP, KERNEL_HWCAP_DIT), HWCAP_CAP(ID_AA64PFR2_EL1, FPMR, IMP, CAP_HWCAP, KERNEL_HWCAP_FPMR), HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, IMP, CAP_HWCAP, KERNEL_HWCAP_DCPOP), HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, DPB2, CAP_HWCAP, KERNEL_HWCAP_DCPODP), HWCAP_CAP(ID_AA64ISAR1_EL1, JSCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_JSCVT), HWCAP_CAP(ID_AA64ISAR1_EL1, FCMA, IMP, CAP_HWCAP, KERNEL_HWCAP_FCMA), HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, IMP, CAP_HWCAP, KERNEL_HWCAP_LRCPC), HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, LRCPC2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, LRCPC3, CAP_HWCAP, KERNEL_HWCAP_LRCPC3), HWCAP_CAP(ID_AA64ISAR1_EL1, FRINTTS, IMP, CAP_HWCAP, KERNEL_HWCAP_FRINT), HWCAP_CAP(ID_AA64ISAR1_EL1, SB, IMP, CAP_HWCAP, KERNEL_HWCAP_SB), HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_BF16), HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_EBF16), HWCAP_CAP(ID_AA64ISAR1_EL1, DGH, IMP, CAP_HWCAP, KERNEL_HWCAP_DGH), HWCAP_CAP(ID_AA64ISAR1_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_I8MM), HWCAP_CAP(ID_AA64ISAR2_EL1, LUT, IMP, CAP_HWCAP, KERNEL_HWCAP_LUT), HWCAP_CAP(ID_AA64ISAR3_EL1, FAMINMAX, IMP, CAP_HWCAP, KERNEL_HWCAP_FAMINMAX), HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p2, CAP_HWCAP, KERNEL_HWCAP_SVE2P2), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p1, CAP_HWCAP, KERNEL_HWCAP_SVE2P1), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, AES2, CAP_HWCAP, KERNEL_HWCAP_SVE_AES2), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_B16B16), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, BFSCALE, CAP_HWCAP, KERNEL_HWCAP_SVE_BFSCALE), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F16MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_F16MM), HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, EltPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_ELTPERM), #endif #ifdef CONFIG_ARM64_GCS HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS), #endif HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS), #ifdef CONFIG_ARM64_BTI HWCAP_CAP(ID_AA64PFR1_EL1, BT, IMP, CAP_HWCAP, KERNEL_HWCAP_BTI), #endif #ifdef CONFIG_ARM64_PTR_AUTH HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA), HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG), #endif #ifdef CONFIG_ARM64_MTE HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE2, CAP_HWCAP, KERNEL_HWCAP_MTE), HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE3, CAP_HWCAP, KERNEL_HWCAP_MTE3), #endif /* CONFIG_ARM64_MTE */ HWCAP_CAP(ID_AA64MMFR0_EL1, ECV, IMP, CAP_HWCAP, KERNEL_HWCAP_ECV), HWCAP_CAP(ID_AA64MMFR1_EL1, AFP, IMP, CAP_HWCAP, KERNEL_HWCAP_AFP), HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, IMP, CAP_HWCAP, KERNEL_HWCAP_CSSC), HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, CMPBR, CAP_HWCAP, KERNEL_HWCAP_CMPBR), HWCAP_CAP(ID_AA64ISAR2_EL1, RPRFM, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRFM), HWCAP_CAP(ID_AA64ISAR2_EL1, RPRES, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRES), HWCAP_CAP(ID_AA64ISAR2_EL1, WFxT, IMP, CAP_HWCAP, KERNEL_HWCAP_WFXT), HWCAP_CAP(ID_AA64ISAR2_EL1, MOPS, IMP, CAP_HWCAP, KERNEL_HWCAP_MOPS), HWCAP_CAP(ID_AA64ISAR2_EL1, BC, IMP, CAP_HWCAP, KERNEL_HWCAP_HBC), #ifdef CONFIG_ARM64_SME HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME), HWCAP_CAP(ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), HWCAP_CAP(ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2), HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2), HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1), HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2), HWCAP_CAP(ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), HWCAP_CAP(ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64), HWCAP_CAP(ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32), HWCAP_CAP(ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16), HWCAP_CAP(ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16), HWCAP_CAP(ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16), HWCAP_CAP(ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32), HWCAP_CAP(ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), HWCAP_CAP(ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), HWCAP_CAP(ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), HWCAP_CAP(ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32), HWCAP_CAP(ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), HWCAP_CAP(ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), HWCAP_CAP(ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES), HWCAP_CAP(ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), HWCAP_CAP(ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP), HWCAP_CAP(ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4), #endif /* CONFIG_ARM64_SME */ HWCAP_CAP(ID_AA64FPFR0_EL1, F8CVT, IMP, CAP_HWCAP, KERNEL_HWCAP_F8CVT), HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA), HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP4), HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM8), HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM4), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), #ifdef CONFIG_ARM64_POE HWCAP_CAP(ID_AA64MMFR3_EL1, S1POE, IMP, CAP_HWCAP, KERNEL_HWCAP_POE), #endif {}, }; #ifdef CONFIG_COMPAT static bool compat_has_neon(const struct arm64_cpu_capabilities *cap, int scope) { /* * Check that all of MVFR1_EL1.{SIMDSP, SIMDInt, SIMDLS} are available, * in line with that of arm32 as in vfp_init(). We make sure that the * check is future proof, by making sure value is non-zero. */ u32 mvfr1; WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible()); if (scope == SCOPE_SYSTEM) mvfr1 = read_sanitised_ftr_reg(SYS_MVFR1_EL1); else mvfr1 = read_sysreg_s(SYS_MVFR1_EL1); return cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_EL1_SIMDSP_SHIFT) && cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_EL1_SIMDInt_SHIFT) && cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_EL1_SIMDLS_SHIFT); } #endif static const struct arm64_cpu_capabilities compat_elf_hwcaps[] = { #ifdef CONFIG_COMPAT HWCAP_CAP_MATCH(compat_has_neon, CAP_COMPAT_HWCAP, COMPAT_HWCAP_NEON), HWCAP_CAP(MVFR1_EL1, SIMDFMAC, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv4), /* Arm v8 mandates MVFR0.FPDP == {0, 2}. So, piggy back on this for the presence of VFP support */ HWCAP_CAP(MVFR0_EL1, FPDP, VFPv3, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFP), HWCAP_CAP(MVFR0_EL1, FPDP, VFPv3, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv3), HWCAP_CAP(MVFR1_EL1, FPHP, FP16, CAP_COMPAT_HWCAP, COMPAT_HWCAP_FPHP), HWCAP_CAP(MVFR1_EL1, SIMDHP, SIMDHP_FLOAT, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDHP), HWCAP_CAP(ID_ISAR5_EL1, AES, VMULL, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL), HWCAP_CAP(ID_ISAR5_EL1, AES, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES), HWCAP_CAP(ID_ISAR5_EL1, SHA1, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA1), HWCAP_CAP(ID_ISAR5_EL1, SHA2, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA2), HWCAP_CAP(ID_ISAR5_EL1, CRC32, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_CRC32), HWCAP_CAP(ID_ISAR6_EL1, DP, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDDP), HWCAP_CAP(ID_ISAR6_EL1, FHM, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDFHM), HWCAP_CAP(ID_ISAR6_EL1, SB, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SB), HWCAP_CAP(ID_ISAR6_EL1, BF16, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDBF16), HWCAP_CAP(ID_ISAR6_EL1, I8MM, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_I8MM), HWCAP_CAP(ID_PFR2_EL1, SSBS, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SSBS), #endif {}, }; static void cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case CAP_HWCAP: cpu_set_feature(cap->hwcap); break; #ifdef CONFIG_COMPAT case CAP_COMPAT_HWCAP: compat_elf_hwcap |= (u32)cap->hwcap; break; case CAP_COMPAT_HWCAP2: compat_elf_hwcap2 |= (u32)cap->hwcap; break; #endif default: WARN_ON(1); break; } } /* Check if we have a particular HWCAP enabled */ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) { bool rc; switch (cap->hwcap_type) { case CAP_HWCAP: rc = cpu_have_feature(cap->hwcap); break; #ifdef CONFIG_COMPAT case CAP_COMPAT_HWCAP: rc = (compat_elf_hwcap & (u32)cap->hwcap) != 0; break; case CAP_COMPAT_HWCAP2: rc = (compat_elf_hwcap2 & (u32)cap->hwcap) != 0; break; #endif default: WARN_ON(1); rc = false; } return rc; } static void setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) { /* We support emulation of accesses to CPU ID feature registers */ cpu_set_named_feature(CPUID); for (; hwcaps->matches; hwcaps++) if (hwcaps->matches(hwcaps, cpucap_default_scope(hwcaps))) cap_set_elf_hwcap(hwcaps); } static void update_cpu_capabilities(u16 scope_mask) { int i; const struct arm64_cpu_capabilities *caps; scope_mask &= ARM64_CPUCAP_SCOPE_MASK; for (i = 0; i < ARM64_NCAPS; i++) { caps = cpucap_ptrs[i]; if (!caps || !(caps->type & scope_mask) || cpus_have_cap(caps->capability) || !caps->matches(caps, cpucap_default_scope(caps))) continue; if (caps->desc && !caps->cpus) pr_info("detected: %s\n", caps->desc); __set_bit(caps->capability, system_cpucaps); if ((scope_mask & SCOPE_BOOT_CPU) && (caps->type & SCOPE_BOOT_CPU)) set_bit(caps->capability, boot_cpucaps); } } /* * Enable all the available capabilities on this CPU. The capabilities * with BOOT_CPU scope are handled separately and hence skipped here. */ static int cpu_enable_non_boot_scope_capabilities(void *__unused) { int i; u16 non_boot_scope = SCOPE_ALL & ~SCOPE_BOOT_CPU; for_each_available_cap(i) { const struct arm64_cpu_capabilities *cap = cpucap_ptrs[i]; if (WARN_ON(!cap)) continue; if (!(cap->type & non_boot_scope)) continue; if (cap->cpu_enable) cap->cpu_enable(cap); } return 0; } /* * Run through the enabled capabilities and enable() it on all active * CPUs */ static void __init enable_cpu_capabilities(u16 scope_mask) { int i; const struct arm64_cpu_capabilities *caps; bool boot_scope; scope_mask &= ARM64_CPUCAP_SCOPE_MASK; boot_scope = !!(scope_mask & SCOPE_BOOT_CPU); for (i = 0; i < ARM64_NCAPS; i++) { caps = cpucap_ptrs[i]; if (!caps || !(caps->type & scope_mask) || !cpus_have_cap(caps->capability)) continue; if (boot_scope && caps->cpu_enable) /* * Capabilities with SCOPE_BOOT_CPU scope are finalised * before any secondary CPU boots. Thus, each secondary * will enable the capability as appropriate via * check_local_cpu_capabilities(). The only exception is * the boot CPU, for which the capability must be * enabled here. This approach avoids costly * stop_machine() calls for this case. */ caps->cpu_enable(caps); } /* * For all non-boot scope capabilities, use stop_machine() * as it schedules the work allowing us to modify PSTATE, * instead of on_each_cpu() which uses an IPI, giving us a * PSTATE that disappears when we return. */ if (!boot_scope) stop_machine(cpu_enable_non_boot_scope_capabilities, NULL, cpu_online_mask); } /* * Run through the list of capabilities to check for conflicts. * If the system has already detected a capability, take necessary * action on this CPU. */ static void verify_local_cpu_caps(u16 scope_mask) { int i; bool cpu_has_cap, system_has_cap; const struct arm64_cpu_capabilities *caps; scope_mask &= ARM64_CPUCAP_SCOPE_MASK; for (i = 0; i < ARM64_NCAPS; i++) { caps = cpucap_ptrs[i]; if (!caps || !(caps->type & scope_mask)) continue; cpu_has_cap = caps->matches(caps, SCOPE_LOCAL_CPU); system_has_cap = cpus_have_cap(caps->capability); if (system_has_cap) { /* * Check if the new CPU misses an advertised feature, * which is not safe to miss. */ if (!cpu_has_cap && !cpucap_late_cpu_optional(caps)) break; /* * We have to issue cpu_enable() irrespective of * whether the CPU has it or not, as it is enabeld * system wide. It is upto the call back to take * appropriate action on this CPU. */ if (caps->cpu_enable) caps->cpu_enable(caps); } else { /* * Check if the CPU has this capability if it isn't * safe to have when the system doesn't. */ if (cpu_has_cap && !cpucap_late_cpu_permitted(caps)) break; } } if (i < ARM64_NCAPS) { pr_crit("CPU%d: Detected conflict for capability %d (%s), System: %d, CPU: %d\n", smp_processor_id(), caps->capability, caps->desc, system_has_cap, cpu_has_cap); if (cpucap_panic_on_conflict(caps)) cpu_panic_kernel(); else cpu_die_early(); } } /* * Check for CPU features that are used in early boot * based on the Boot CPU value. */ static void check_early_cpu_features(void) { verify_cpu_asid_bits(); verify_local_cpu_caps(SCOPE_BOOT_CPU); } static void __verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) { for (; caps->matches; caps++) if (cpus_have_elf_hwcap(caps) && !caps->matches(caps, SCOPE_LOCAL_CPU)) { pr_crit("CPU%d: missing HWCAP: %s\n", smp_processor_id(), caps->desc); cpu_die_early(); } } static void verify_local_elf_hwcaps(void) { __verify_local_elf_hwcaps(arm64_elf_hwcaps); if (id_aa64pfr0_32bit_el0(read_cpuid(ID_AA64PFR0_EL1))) __verify_local_elf_hwcaps(compat_elf_hwcaps); } static void verify_sve_features(void) { unsigned long cpacr = cpacr_save_enable_kernel_sve(); if (vec_verify_vq_map(ARM64_VEC_SVE)) { pr_crit("CPU%d: SVE: vector length support mismatch\n", smp_processor_id()); cpu_die_early(); } cpacr_restore(cpacr); } static void verify_sme_features(void) { unsigned long cpacr = cpacr_save_enable_kernel_sme(); if (vec_verify_vq_map(ARM64_VEC_SME)) { pr_crit("CPU%d: SME: vector length support mismatch\n", smp_processor_id()); cpu_die_early(); } cpacr_restore(cpacr); } static void verify_hyp_capabilities(void) { u64 safe_mmfr1, mmfr0, mmfr1; int parange, ipa_max; unsigned int safe_vmid_bits, vmid_bits; if (!IS_ENABLED(CONFIG_KVM)) return; safe_mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); /* Verify VMID bits */ safe_vmid_bits = get_vmid_bits(safe_mmfr1); vmid_bits = get_vmid_bits(mmfr1); if (vmid_bits < safe_vmid_bits) { pr_crit("CPU%d: VMID width mismatch\n", smp_processor_id()); cpu_die_early(); } /* Verify IPA range */ parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); ipa_max = id_aa64mmfr0_parange_to_phys_shift(parange); if (ipa_max < get_kvm_ipa_limit()) { pr_crit("CPU%d: IPA range mismatch\n", smp_processor_id()); cpu_die_early(); } } static void verify_mpam_capabilities(void) { u64 cpu_idr = read_cpuid(ID_AA64PFR0_EL1); u64 sys_idr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); u16 cpu_partid_max, cpu_pmg_max, sys_partid_max, sys_pmg_max; if (FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, cpu_idr) != FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, sys_idr)) { pr_crit("CPU%d: MPAM version mismatch\n", smp_processor_id()); cpu_die_early(); } cpu_idr = read_cpuid(MPAMIDR_EL1); sys_idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); if (FIELD_GET(MPAMIDR_EL1_HAS_HCR, cpu_idr) != FIELD_GET(MPAMIDR_EL1_HAS_HCR, sys_idr)) { pr_crit("CPU%d: Missing MPAM HCR\n", smp_processor_id()); cpu_die_early(); } cpu_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, cpu_idr); cpu_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, cpu_idr); sys_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, sys_idr); sys_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, sys_idr); if (cpu_partid_max < sys_partid_max || cpu_pmg_max < sys_pmg_max) { pr_crit("CPU%d: MPAM PARTID/PMG max values are mismatched\n", smp_processor_id()); cpu_die_early(); } } /* * Run through the enabled system capabilities and enable() it on this CPU. * The capabilities were decided based on the available CPUs at the boot time. * Any new CPU should match the system wide status of the capability. If the * new CPU doesn't have a capability which the system now has enabled, we * cannot do anything to fix it up and could cause unexpected failures. So * we park the CPU. */ static void verify_local_cpu_capabilities(void) { /* * The capabilities with SCOPE_BOOT_CPU are checked from * check_early_cpu_features(), as they need to be verified * on all secondary CPUs. */ verify_local_cpu_caps(SCOPE_ALL & ~SCOPE_BOOT_CPU); verify_local_elf_hwcaps(); if (system_supports_sve()) verify_sve_features(); if (system_supports_sme()) verify_sme_features(); if (is_hyp_mode_available()) verify_hyp_capabilities(); if (system_supports_mpam()) verify_mpam_capabilities(); } void check_local_cpu_capabilities(void) { /* * All secondary CPUs should conform to the early CPU features * in use by the kernel based on boot CPU. */ check_early_cpu_features(); /* * If we haven't finalised the system capabilities, this CPU gets * a chance to update the errata work arounds and local features. * Otherwise, this CPU should verify that it has all the system * advertised capabilities. */ if (!system_capabilities_finalized()) update_cpu_capabilities(SCOPE_LOCAL_CPU); else verify_local_cpu_capabilities(); } bool this_cpu_has_cap(unsigned int n) { if (!WARN_ON(preemptible()) && n < ARM64_NCAPS) { const struct arm64_cpu_capabilities *cap = cpucap_ptrs[n]; if (cap) return cap->matches(cap, SCOPE_LOCAL_CPU); } return false; } EXPORT_SYMBOL_GPL(this_cpu_has_cap); /* * This helper function is used in a narrow window when, * - The system wide safe registers are set with all the SMP CPUs and, * - The SYSTEM_FEATURE system_cpucaps may not have been set. */ static bool __maybe_unused __system_matches_cap(unsigned int n) { if (n < ARM64_NCAPS) { const struct arm64_cpu_capabilities *cap = cpucap_ptrs[n]; if (cap) return cap->matches(cap, SCOPE_SYSTEM); } return false; } void cpu_set_feature(unsigned int num) { set_bit(num, elf_hwcap); } bool cpu_have_feature(unsigned int num) { return test_bit(num, elf_hwcap); } EXPORT_SYMBOL_GPL(cpu_have_feature); unsigned long cpu_get_elf_hwcap(void) { /* * We currently only populate the first 32 bits of AT_HWCAP. Please * note that for userspace compatibility we guarantee that bits 62 * and 63 will always be returned as 0. */ return elf_hwcap[0]; } unsigned long cpu_get_elf_hwcap2(void) { return elf_hwcap[1]; } unsigned long cpu_get_elf_hwcap3(void) { return elf_hwcap[2]; } static void __init setup_boot_cpu_capabilities(void) { kvm_arm_target_impl_cpu_init(); /* * The boot CPU's feature register values have been recorded. Detect * boot cpucaps and local cpucaps for the boot CPU, then enable and * patch alternatives for the available boot cpucaps. */ update_cpu_capabilities(SCOPE_BOOT_CPU | SCOPE_LOCAL_CPU); enable_cpu_capabilities(SCOPE_BOOT_CPU); apply_boot_alternatives(); } void __init setup_boot_cpu_features(void) { /* * Initialize the indirect array of CPU capabilities pointers before we * handle the boot CPU. */ init_cpucap_indirect_list(); /* * Detect broken pseudo-NMI. Must be called _before_ the call to * setup_boot_cpu_capabilities() since it interacts with * can_use_gic_priorities(). */ detect_system_supports_pseudo_nmi(); setup_boot_cpu_capabilities(); } static void __init setup_system_capabilities(void) { /* * The system-wide safe feature register values have been finalized. * Detect, enable, and patch alternatives for the available system * cpucaps. */ update_cpu_capabilities(SCOPE_SYSTEM); enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU); apply_alternatives_all(); /* * Log any cpucaps with a cpumask as these aren't logged by * update_cpu_capabilities(). */ for (int i = 0; i < ARM64_NCAPS; i++) { const struct arm64_cpu_capabilities *caps = cpucap_ptrs[i]; if (caps && caps->cpus && caps->desc && cpumask_any(caps->cpus) < nr_cpu_ids) pr_info("detected: %s on CPU%*pbl\n", caps->desc, cpumask_pr_args(caps->cpus)); } /* * TTBR0 PAN doesn't have its own cpucap, so log it manually. */ if (system_uses_ttbr0_pan()) pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); } void __init setup_system_features(void) { setup_system_capabilities(); kpti_install_ng_mappings(); sve_setup(); sme_setup(); /* * Check for sane CTR_EL0.CWG value. */ if (!cache_type_cwg()) pr_warn("No Cache Writeback Granule information, assuming %d\n", ARCH_DMA_MINALIGN); } void __init setup_user_features(void) { user_feature_fixup(); setup_elf_hwcaps(arm64_elf_hwcaps); if (system_supports_32bit_el0()) { setup_elf_hwcaps(compat_elf_hwcaps); elf_hwcap_fixup(); } minsigstksz_setup(); } static int enable_mismatched_32bit_el0(unsigned int cpu) { /* * The first 32-bit-capable CPU we detected and so can no longer * be offlined by userspace. -1 indicates we haven't yet onlined * a 32-bit-capable CPU. */ static int lucky_winner = -1; struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); bool cpu_32bit = false; if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) { if (!housekeeping_cpu(cpu, HK_TYPE_TICK)) pr_info("Treating adaptive-ticks CPU %u as 64-bit only\n", cpu); else cpu_32bit = true; } if (cpu_32bit) { cpumask_set_cpu(cpu, cpu_32bit_el0_mask); static_branch_enable_cpuslocked(&arm64_mismatched_32bit_el0); } if (cpumask_test_cpu(0, cpu_32bit_el0_mask) == cpu_32bit) return 0; if (lucky_winner >= 0) return 0; /* * We've detected a mismatch. We need to keep one of our CPUs with * 32-bit EL0 online so that is_cpu_allowed() doesn't end up rejecting * every CPU in the system for a 32-bit task. */ lucky_winner = cpu_32bit ? cpu : cpumask_any_and(cpu_32bit_el0_mask, cpu_active_mask); get_cpu_device(lucky_winner)->offline_disabled = true; setup_elf_hwcaps(compat_elf_hwcaps); elf_hwcap_fixup(); pr_info("Asymmetric 32-bit EL0 support detected on CPU %u; CPU hot-unplug disabled on CPU %u\n", cpu, lucky_winner); return 0; } static int __init init_32bit_el0_mask(void) { if (!allow_mismatched_32bit_el0) return 0; if (!zalloc_cpumask_var(&cpu_32bit_el0_mask, GFP_KERNEL)) return -ENOMEM; return cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm64/mismatched_32bit_el0:online", enable_mismatched_32bit_el0, NULL); } subsys_initcall_sync(init_32bit_el0_mask); static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap) { cpu_enable_swapper_cnp(); } /* * We emulate only the following system register space. * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 2 - 7] * See Table C5-6 System instruction encodings for System register accesses, * ARMv8 ARM(ARM DDI 0487A.f) for more details. */ static inline bool __attribute_const__ is_emulated(u32 id) { return (sys_reg_Op0(id) == 0x3 && sys_reg_CRn(id) == 0x0 && sys_reg_Op1(id) == 0x0 && (sys_reg_CRm(id) == 0 || ((sys_reg_CRm(id) >= 2) && (sys_reg_CRm(id) <= 7)))); } /* * With CRm == 0, reg should be one of : * MIDR_EL1, MPIDR_EL1 or REVIDR_EL1. */ static inline int emulate_id_reg(u32 id, u64 *valp) { switch (id) { case SYS_MIDR_EL1: *valp = read_cpuid_id(); break; case SYS_MPIDR_EL1: *valp = SYS_MPIDR_SAFE_VAL; break; case SYS_REVIDR_EL1: /* IMPLEMENTATION DEFINED values are emulated with 0 */ *valp = 0; break; default: return -EINVAL; } return 0; } static int emulate_sys_reg(u32 id, u64 *valp) { struct arm64_ftr_reg *regp; if (!is_emulated(id)) return -EINVAL; if (sys_reg_CRm(id) == 0) return emulate_id_reg(id, valp); regp = get_arm64_ftr_reg_nowarn(id); if (regp) *valp = arm64_ftr_reg_user_value(regp); else /* * The untracked registers are either IMPLEMENTATION DEFINED * (e.g, ID_AFR0_EL1) or reserved RAZ. */ *valp = 0; return 0; } int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt) { int rc; u64 val; rc = emulate_sys_reg(sys_reg, &val); if (!rc) { pt_regs_write_reg(regs, rt, val); arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } return rc; } bool try_emulate_mrs(struct pt_regs *regs, u32 insn) { u32 sys_reg, rt; if (compat_user_mode(regs) || !aarch64_insn_is_mrs(insn)) return false; /* * sys_reg values are defined as used in mrs/msr instruction. * shift the imm value to get the encoding. */ sys_reg = (u32)aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn) << 5; rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); return do_emulate_mrs(regs, sys_reg, rt) == 0; } enum mitigation_state arm64_get_meltdown_state(void) { if (__meltdown_safe) return SPECTRE_UNAFFECTED; if (arm64_kernel_unmapped_at_el0()) return SPECTRE_MITIGATED; return SPECTRE_VULNERABLE; } ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) { switch (arm64_get_meltdown_state()) { case SPECTRE_UNAFFECTED: return sprintf(buf, "Not affected\n"); case SPECTRE_MITIGATED: return sprintf(buf, "Mitigation: PTI\n"); default: return sprintf(buf, "Vulnerable\n"); } }
256 256 86 20 366 366 257 257 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #ifndef __ARM64_KVM_MMU_H__ #define __ARM64_KVM_MMU_H__ #include <asm/page.h> #include <asm/memory.h> #include <asm/mmu.h> #include <asm/cpufeature.h> /* * As ARMv8.0 only has the TTBR0_EL2 register, we cannot express * "negative" addresses. This makes it impossible to directly share * mappings with the kernel. * * Instead, give the HYP mode its own VA region at a fixed offset from * the kernel by just masking the top bits (which are all ones for a * kernel address). We need to find out how many bits to mask. * * We want to build a set of page tables that cover both parts of the * idmap (the trampoline page used to initialize EL2), and our normal * runtime VA space, at the same time. * * Given that the kernel uses VA_BITS for its entire address space, * and that half of that space (VA_BITS - 1) is used for the linear * mapping, we can also limit the EL2 space to (VA_BITS - 1). * * The main question is "Within the VA_BITS space, does EL2 use the * top or the bottom half of that space to shadow the kernel's linear * mapping?". As we need to idmap the trampoline page, this is * determined by the range in which this page lives. * * If the page is in the bottom half, we have to use the top half. If * the page is in the top half, we have to use the bottom half: * * T = __pa_symbol(__hyp_idmap_text_start) * if (T & BIT(VA_BITS - 1)) * HYP_VA_MIN = 0 //idmap in upper half * else * HYP_VA_MIN = 1 << (VA_BITS - 1) * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1 * * When using VHE, there are no separate hyp mappings and all KVM * functionality is already mapped as part of the main kernel * mappings, and none of this applies in that case. */ #ifdef __ASSEMBLY__ #include <asm/alternative.h> /* * Convert a hypervisor VA to a PA * reg: hypervisor address to be converted in place * tmp: temporary register */ .macro hyp_pa reg, tmp ldr_l \tmp, hyp_physvirt_offset add \reg, \reg, \tmp .endm /* * Convert a hypervisor VA to a kernel image address * reg: hypervisor address to be converted in place * tmp: temporary register * * The actual code generation takes place in kvm_get_kimage_voffset, and * the instructions below are only there to reserve the space and * perform the register allocation (kvm_get_kimage_voffset uses the * specific registers encoded in the instructions). */ .macro hyp_kimg_va reg, tmp /* Convert hyp VA -> PA. */ hyp_pa \reg, \tmp /* Load kimage_voffset. */ alternative_cb ARM64_ALWAYS_SYSTEM, kvm_get_kimage_voffset movz \tmp, #0 movk \tmp, #0, lsl #16 movk \tmp, #0, lsl #32 movk \tmp, #0, lsl #48 alternative_cb_end /* Convert PA -> kimg VA. */ add \reg, \reg, \tmp .endm #else #include <linux/pgtable.h> #include <asm/pgalloc.h> #include <asm/cache.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <asm/kvm_emulate.h> #include <asm/kvm_host.h> #include <asm/kvm_nested.h> void kvm_update_va_mask(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); void kvm_compute_layout(void); void kvm_apply_hyp_relocations(void); #define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset) /* * Convert a kernel VA into a HYP VA. * * Can be called from hyp or non-hyp context. * * The actual code generation takes place in kvm_update_va_mask(), and * the instructions below are only there to reserve the space and * perform the register allocation (kvm_update_va_mask() uses the * specific registers encoded in the instructions). */ static __always_inline unsigned long __kern_hyp_va(unsigned long v) { /* * This #ifndef is an optimisation for when this is called from VHE hyp * context. When called from a VHE non-hyp context, kvm_update_va_mask() will * replace the instructions with `nop`s. */ #ifndef __KVM_VHE_HYPERVISOR__ asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n" /* mask with va_mask */ "ror %0, %0, #1\n" /* rotate to the first tag bit */ "add %0, %0, #0\n" /* insert the low 12 bits of the tag */ "add %0, %0, #0, lsl 12\n" /* insert the top 12 bits of the tag */ "ror %0, %0, #63\n", /* rotate back */ ARM64_ALWAYS_SYSTEM, kvm_update_va_mask) : "+r" (v)); #endif return v; } #define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v)))) extern u32 __hyp_va_bits; /* * We currently support using a VM-specified IPA size. For backward * compatibility, the default IPA size is fixed to 40bits. */ #define KVM_PHYS_SHIFT (40) #define kvm_phys_shift(mmu) VTCR_EL2_IPA((mmu)->vtcr) #define kvm_phys_size(mmu) (_AC(1, ULL) << kvm_phys_shift(mmu)) #define kvm_phys_mask(mmu) (kvm_phys_size(mmu) - _AC(1, ULL)) #include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> int kvm_share_hyp(void *from, void *to); void kvm_unshare_hyp(void *from, void *to); int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot); int __create_hyp_mappings(unsigned long start, unsigned long size, unsigned long phys, enum kvm_pgtable_prot prot); int hyp_alloc_private_va_range(size_t size, unsigned long *haddr); int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, void __iomem **haddr); int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, void **haddr); int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr); void __init free_hyp_pgds(void); void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, bool may_block); void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end); void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end); void stage2_unmap_vm(struct kvm *kvm); int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); void kvm_uninit_stage2_mmu(struct kvm *kvm); void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu); int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable); int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); int __init kvm_mmu_init(u32 *hyp_va_bits); static inline void *__kvm_vector_slot2addr(void *base, enum arm64_hyp_spectre_vector slot) { int idx = slot - (slot != HYP_VECTOR_DIRECT); return base + (idx * SZ_2K); } struct kvm; #define kvm_flush_dcache_to_poc(a,l) \ dcache_clean_inval_poc((unsigned long)(a), (unsigned long)(a)+(l)) static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) { u64 cache_bits = SCTLR_ELx_M | SCTLR_ELx_C; int reg; if (vcpu_is_el2(vcpu)) reg = SCTLR_EL2; else reg = SCTLR_EL1; return (vcpu_read_sys_reg(vcpu, reg) & cache_bits) == cache_bits; } static inline void __clean_dcache_guest_page(void *va, size_t size) { /* * With FWB, we ensure that the guest always accesses memory using * cacheable attributes, and we don't have to clean to PoC when * faulting in pages. Furthermore, FWB implies IDC, so cleaning to * PoU is not required either in this case. */ if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) return; kvm_flush_dcache_to_poc(va, size); } static inline size_t __invalidate_icache_max_range(void) { u8 iminline; u64 ctr; asm volatile(ALTERNATIVE_CB("movz %0, #0\n" "movk %0, #0, lsl #16\n" "movk %0, #0, lsl #32\n" "movk %0, #0, lsl #48\n", ARM64_ALWAYS_SYSTEM, kvm_compute_final_ctr_el0) : "=r" (ctr)); iminline = SYS_FIELD_GET(CTR_EL0, IminLine, ctr) + 2; return MAX_DVM_OPS << iminline; } static inline void __invalidate_icache_guest_page(void *va, size_t size) { /* * Blow the whole I-cache if it is aliasing (i.e. VIPT) or the * invalidation range exceeds our arbitrary limit on invadations by * cache line. */ if (icache_is_aliasing() || size > __invalidate_icache_max_range()) icache_inval_all_pou(); else icache_inval_pou((unsigned long)va, (unsigned long)va + size); } void kvm_set_way_flush(struct kvm_vcpu *vcpu); void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled); static inline unsigned int kvm_get_vmid_bits(void) { int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); return get_vmid_bits(reg); } /* * We are not in the kvm->srcu critical section most of the time, so we take * the SRCU read lock here. Since we copy the data from the user page, we * can immediately drop the lock again. */ static inline int kvm_read_guest_lock(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { int srcu_idx = srcu_read_lock(&kvm->srcu); int ret = kvm_read_guest(kvm, gpa, data, len); srcu_read_unlock(&kvm->srcu, srcu_idx); return ret; } static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) { int srcu_idx = srcu_read_lock(&kvm->srcu); int ret = kvm_write_guest(kvm, gpa, data, len); srcu_read_unlock(&kvm->srcu, srcu_idx); return ret; } #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) /* * When this is (directly or indirectly) used on the TLB invalidation * path, we rely on a previously issued DSB so that page table updates * and VMID reads are correctly ordered. */ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) { struct kvm_vmid *vmid = &mmu->vmid; u64 vmid_field, baddr; u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0; baddr = mmu->pgd_phys; vmid_field = atomic64_read(&vmid->id) << VTTBR_VMID_SHIFT; vmid_field &= VTTBR_VMID_MASK(kvm_arm_vmid_bits); return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; } /* * Must be called from hyp code running at EL2 with an updated VTTBR * and interrupts disabled. */ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, struct kvm_arch *arch) { write_sysreg(mmu->vtcr, vtcr_el2); write_sysreg(kvm_get_vttbr(mmu), vttbr_el2); /* * ARM errata 1165522 and 1530923 require the actual execution of the * above before we can switch to the EL1/EL0 translation regime used by * the guest. */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); } static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu) { return container_of(mmu->arch, struct kvm, arch); } static inline u64 get_vmid(u64 vttbr) { return (vttbr & VTTBR_VMID_MASK(kvm_get_vmid_bits())) >> VTTBR_VMID_SHIFT; } static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu) { return !(mmu->tlb_vttbr & VTTBR_CNP_BIT); } static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) { /* * Be careful, mmu may not be fully initialised so do look at * *any* of its fields. */ return &kvm->arch.mmu != mmu; } static inline void kvm_fault_lock(struct kvm *kvm) { if (is_protected_kvm_enabled()) write_lock(&kvm->mmu_lock); else read_lock(&kvm->mmu_lock); } static inline void kvm_fault_unlock(struct kvm *kvm) { if (is_protected_kvm_enabled()) write_unlock(&kvm->mmu_lock); else read_unlock(&kvm->mmu_lock); } /* * ARM64 KVM relies on a simple conversion from physaddr to a kernel * virtual address (KVA) when it does cache maintenance as the CMO * instructions work on virtual addresses. This is incompatible with * VM_PFNMAP VMAs which may not have a kernel direct mapping to a * virtual address. * * With S2FWB and CACHE DIC features, KVM need not do cache flushing * and CMOs are NOP'd. This has the effect of no longer requiring a * KVA for addresses mapped into the S2. The presence of these features * are thus necessary to support cacheable S2 mapping of VM_PFNMAP. */ static inline bool kvm_supports_cacheable_pfnmap(void) { return cpus_have_final_cap(ARM64_HAS_STAGE2_FWB) && cpus_have_final_cap(ARM64_HAS_CACHE_DIC); } #ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS void kvm_s2_ptdump_create_debugfs(struct kvm *kvm); #else static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {} #endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */ #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */
15 15 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Skb ref helpers. * */ #ifndef _LINUX_SKBUFF_REF_H #define _LINUX_SKBUFF_REF_H #include <linux/skbuff.h> /** * __skb_frag_ref - take an addition reference on a paged fragment. * @frag: the paged fragment * * Takes an additional reference on the paged fragment @frag. */ static inline void __skb_frag_ref(skb_frag_t *frag) { get_netmem(skb_frag_netmem(frag)); } /** * skb_frag_ref - take an addition reference on a paged fragment of an skb. * @skb: the buffer * @f: the fragment offset. * * Takes an additional reference on the @f'th paged fragment of @skb. */ static inline void skb_frag_ref(struct sk_buff *skb, int f) { __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } bool napi_pp_put_page(netmem_ref netmem); static inline void skb_page_unref(netmem_ref netmem, bool recycle) { #ifdef CONFIG_PAGE_POOL if (recycle && napi_pp_put_page(netmem)) return; #endif put_netmem(netmem); } /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment * @recycle: recycle the page if allocated via page_pool * * Releases a reference on the paged fragment @frag * or recycles the page via the page_pool API. */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { skb_page_unref(skb_frag_netmem(frag), recycle); } /** * skb_frag_unref - release a reference on a paged fragment of an skb. * @skb: the buffer * @f: the fragment offset * * Releases a reference on the @f'th paged fragment of @skb. */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { struct skb_shared_info *shinfo = skb_shinfo(skb); if (!skb_zcopy_managed(skb)) __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle); } #endif /* _LINUX_SKBUFF_REF_H */
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> */ #include <linux/module.h> #include <linux/init.h> #include <linux/file.h> #include <linux/binfmts.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/magic.h> #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsverity.h> #include <keys/system_keyring.h> #include <uapi/linux/fsverity.h> #include "ima.h" #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM static char *ima_appraise_cmdline_default __initdata; core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); void __init ima_appraise_parse_cmdline(void) { const char *str = ima_appraise_cmdline_default; bool sb_state = arch_ima_get_secureboot(); int appraisal_state = ima_appraise; if (!str) return; if (strncmp(str, "off", 3) == 0) appraisal_state = 0; else if (strncmp(str, "log", 3) == 0) appraisal_state = IMA_APPRAISE_LOG; else if (strncmp(str, "fix", 3) == 0) appraisal_state = IMA_APPRAISE_FIX; else if (strncmp(str, "enforce", 7) == 0) appraisal_state = IMA_APPRAISE_ENFORCE; else pr_err("invalid \"%s\" appraise option", str); /* If appraisal state was changed, but secure boot is enabled, * keep its default */ if (sb_state) { if (!(appraisal_state & IMA_APPRAISE_ENFORCE)) pr_info("Secure boot enabled: ignoring ima_appraise=%s option", str); } else { ima_appraise = appraisal_state; } } #endif /* * is_ima_appraise_enabled - return appraise status * * Only return enabled, if not in ima_appraise="fix" or "log" modes. */ bool is_ima_appraise_enabled(void) { return ima_appraise & IMA_APPRAISE_ENFORCE; } /* * ima_must_appraise - set appraise flag * * Return 1 to appraise or hash */ int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { struct lsm_prop prop; if (!ima_appraise) return 0; security_current_getlsmprop_subj(&prop); return ima_match_policy(idmap, inode, current_cred(), &prop, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } static int ima_fix_xattr(struct dentry *dentry, struct ima_iint_cache *iint) { int rc, offset; u8 algo = iint->ima_hash->algo; if (algo <= HASH_ALGO_SHA1) { offset = 1; iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST; } else { offset = 0; iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; iint->ima_hash->xattr.ng.algo = algo; } rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, &iint->ima_hash->xattr.data[offset], (sizeof(iint->ima_hash->xattr) - offset) + iint->ima_hash->length, 0); return rc; } /* Return specific func appraised cached result */ enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: return iint->ima_mmap_status; case BPRM_CHECK: return iint->ima_bprm_status; case CREDS_CHECK: return iint->ima_creds_status; case FILE_CHECK: case POST_SETATTR: return iint->ima_file_status; case MODULE_CHECK ... MAX_CHECK - 1: default: return iint->ima_read_status; } } static void ima_set_cache_status(struct ima_iint_cache *iint, enum ima_hooks func, enum integrity_status status) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->ima_mmap_status = status; break; case BPRM_CHECK: iint->ima_bprm_status = status; break; case CREDS_CHECK: iint->ima_creds_status = status; break; case FILE_CHECK: case POST_SETATTR: iint->ima_file_status = status; break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->ima_read_status = status; break; } } static void ima_cache_flags(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED); break; case BPRM_CHECK: iint->flags |= (IMA_BPRM_APPRAISED | IMA_APPRAISED); break; case CREDS_CHECK: iint->flags |= (IMA_CREDS_APPRAISED | IMA_APPRAISED); break; case FILE_CHECK: case POST_SETATTR: iint->flags |= (IMA_FILE_APPRAISED | IMA_APPRAISED); break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->flags |= (IMA_READ_APPRAISED | IMA_APPRAISED); break; } } enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len) { struct signature_v2_hdr *sig; enum hash_algo ret; if (!xattr_value || xattr_len < 2) /* return default hash algo */ return ima_hash_algo; switch (xattr_value->type) { case IMA_VERITY_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 3 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case EVM_IMA_XATTR_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 2 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ ret = xattr_value->data[0]; if (ret < HASH_ALGO__LAST) return ret; break; case IMA_XATTR_DIGEST: /* this is for backward compatibility */ if (xattr_len == 21) { unsigned int zero = 0; if (!memcmp(&xattr_value->data[16], &zero, 4)) return HASH_ALGO_MD5; else return HASH_ALGO_SHA1; } else if (xattr_len == 17) return HASH_ALGO_MD5; break; } /* return default hash algo */ return ima_hash_algo; } int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { int ret; ret = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, (char **)xattr_value, xattr_len, GFP_NOFS); if (ret == -EOPNOTSUPP) ret = 0; return ret; } /* * calc_file_id_hash - calculate the hash of the ima_file_id struct data * @type: xattr type [enum evm_ima_xattr_type] * @algo: hash algorithm [enum hash_algo] * @digest: pointer to the digest to be hashed * @hash: (out) pointer to the hash * * IMA signature version 3 disambiguates the data that is signed by * indirectly signing the hash of the ima_file_id structure data. * * Signing the ima_file_id struct is currently only supported for * IMA_VERITY_DIGSIG type xattrs. * * Return 0 on success, error code otherwise. */ static int calc_file_id_hash(enum evm_ima_xattr_type type, enum hash_algo algo, const u8 *digest, struct ima_digest_data *hash) { struct ima_file_id file_id = { .hash_type = IMA_VERITY_DIGSIG, .hash_algorithm = algo}; unsigned int unused = HASH_MAX_DIGESTSIZE - hash_digest_size[algo]; if (type != IMA_VERITY_DIGSIG) return -EINVAL; memcpy(file_id.hash, digest, hash_digest_size[algo]); hash->algo = algo; hash->length = hash_digest_size[algo]; return ima_calc_buffer_hash(&file_id, sizeof(file_id) - unused, hash); } /* * xattr_verify - verify xattr digest or signature * * Verify whether the hash or signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int xattr_verify(enum ima_hooks func, struct ima_iint_cache *iint, struct evm_ima_xattr_data *xattr_value, int xattr_len, enum integrity_status *status, const char **cause) { struct ima_max_digest_data hash; struct signature_v2_hdr *sig; int rc = -EINVAL, hash_start = 0; int mask; switch (xattr_value->type) { case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ hash_start = 1; fallthrough; case IMA_XATTR_DIGEST: if (*status != INTEGRITY_PASS_IMMUTABLE) { if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) *cause = "verity-signature-required"; else *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } clear_bit(IMA_DIGSIG, &iint->atomic_flags); } else { set_bit(IMA_DIGSIG, &iint->atomic_flags); } if (xattr_len - sizeof(xattr_value->type) - hash_start >= iint->ima_hash->length) /* * xattr length may be longer. md5 hash in previous * version occupied 20 bytes in xattr, instead of 16 */ rc = memcmp(&xattr_value->data[hash_start], iint->ima_hash->digest, iint->ima_hash->length); else rc = -EINVAL; if (rc) { *cause = "invalid-hash"; *status = INTEGRITY_FAIL; break; } *status = INTEGRITY_PASS; break; case EVM_IMA_XATTR_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); mask = IMA_DIGSIG_REQUIRED | IMA_VERITY_REQUIRED; if ((iint->flags & mask) == mask) { *cause = "verity-signature-required"; *status = INTEGRITY_FAIL; break; } sig = (typeof(sig))xattr_value; if (sig->version >= 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc == -EOPNOTSUPP) { *status = INTEGRITY_UNKNOWN; break; } if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; case IMA_VERITY_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); if (iint->flags & IMA_DIGSIG_REQUIRED) { if (!(iint->flags & IMA_VERITY_REQUIRED)) { *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } } sig = (typeof(sig))xattr_value; if (sig->version != 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = calc_file_id_hash(IMA_VERITY_DIGSIG, iint->ima_hash->algo, iint->ima_hash->digest, container_of(&hash.hdr, struct ima_digest_data, hdr)); if (rc) { *cause = "sigv3-hashing-error"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, hash.digest, hash.hdr.length); if (rc) { *cause = "invalid-verity-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; default: *status = INTEGRITY_UNKNOWN; *cause = "unknown-ima-data"; break; } return rc; } /* * modsig_verify - verify modsig signature * * Verify whether the signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, enum integrity_status *status, const char **cause) { int rc; rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig); if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM, modsig); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } return rc; } /* * ima_check_blacklist - determine if the binary is blacklisted. * * Add the hash of the blacklisted binary to the measurement list, based * on policy. * * Returns -EPERM if the hash is blacklisted. */ int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { enum hash_algo hash_algo; const u8 *digest = NULL; u32 digestsize = 0; int rc = 0; if (!(iint->flags & IMA_CHECK_BLACKLIST)) return 0; if (iint->flags & IMA_MODSIG_ALLOWED && modsig) { ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize); rc = is_binary_blacklisted(digest, digestsize); } else if (iint->flags & IMA_DIGSIG_REQUIRED && iint->ima_hash) rc = is_binary_blacklisted(iint->ima_hash->digest, iint->ima_hash->length); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL, false, NULL, 0); return rc; } static bool is_bprm_creds_for_exec(enum ima_hooks func, struct file *file) { struct linux_binprm *bprm; if (func == BPRM_CHECK) { bprm = container_of(&file, struct linux_binprm, file); return bprm->is_check; } return false; } /* * ima_appraise_measurement - appraise file measurement * * Call evm_verifyxattr() to verify the integrity of 'security.ima'. * Assuming success, compare the xattr hash with the collected measurement. * * Return 0 on success, error code otherwise */ int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig) { static const char op[] = "appraise_data"; int audit_msgno = AUDIT_INTEGRITY_DATA; const char *cause = "unknown"; struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len; bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig; /* If not appraising a modsig, we need an xattr. */ if (!(inode->i_opflags & IOP_XATTR) && !try_modsig) return INTEGRITY_UNKNOWN; /* * Unlike any of the other LSM hooks where the kernel enforces file * integrity, enforcing file integrity for the bprm_creds_for_exec() * LSM hook with the AT_EXECVE_CHECK flag is left up to the discretion * of the script interpreter(userspace). Differentiate kernel and * userspace enforced integrity audit messages. */ if (is_bprm_creds_for_exec(func, file)) audit_msgno = AUDIT_INTEGRITY_USERSPACE; /* If reading the xattr failed and there's no modsig, error out. */ if (rc <= 0 && !try_modsig) { if (rc && rc != -ENODATA) goto out; if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) cause = "verity-signature-required"; else cause = "IMA-signature-required"; } else { cause = "missing-hash"; } status = INTEGRITY_NOLABEL; if (file->f_mode & FMODE_CREATED) iint->flags |= IMA_NEW_FILE; if ((iint->flags & IMA_NEW_FILE) && (!(iint->flags & IMA_DIGSIG_REQUIRED) || (inode->i_size == 0))) status = INTEGRITY_PASS; goto out; } status = evm_verifyxattr(dentry, XATTR_NAME_IMA, xattr_value, rc < 0 ? 0 : rc); switch (status) { case INTEGRITY_PASS: case INTEGRITY_PASS_IMMUTABLE: case INTEGRITY_UNKNOWN: break; case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. */ /* It's fine not to have xattrs when using a modsig. */ if (try_modsig) break; fallthrough; case INTEGRITY_NOLABEL: /* No security.evm xattr. */ cause = "missing-HMAC"; goto out; case INTEGRITY_FAIL_IMMUTABLE: set_bit(IMA_DIGSIG, &iint->atomic_flags); cause = "invalid-fail-immutable"; goto out; case INTEGRITY_FAIL: /* Invalid HMAC/signature. */ cause = "invalid-HMAC"; goto out; default: WARN_ONCE(true, "Unexpected integrity status %d\n", status); } if (xattr_value) rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, &cause); /* * If we have a modsig and either no imasig or the imasig's key isn't * known, then try verifying the modsig. */ if (try_modsig && (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG || rc == -ENOKEY)) rc = modsig_verify(func, modsig, &status, &cause); out: /* * File signatures on some filesystems can not be properly verified. * When such filesystems are mounted by an untrusted mounter or on a * system not willing to accept such a risk, fail the file signature * verification. */ if ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) && ((inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) || (iint->flags & IMA_FAIL_UNVERIFIABLE_SIGS))) { status = INTEGRITY_FAIL; cause = "unverifiable-signature"; integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else if (status != INTEGRITY_PASS) { /* Fix mode, but don't replace file signatures. */ if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig && (!xattr_value || xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; } /* * Permit new files with file/EVM portable signatures, but * without data. */ if (inode->i_size == 0 && iint->flags & IMA_NEW_FILE && test_bit(IMA_DIGSIG, &iint->atomic_flags)) { status = INTEGRITY_PASS; } integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else { ima_cache_flags(iint, func); } ima_set_cache_status(iint, func, status); return status; } /* * ima_update_xattr - update 'security.ima' hash value */ void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { struct dentry *dentry = file_dentry(file); int rc = 0; /* do not collect and update hash for digital signatures */ if (test_bit(IMA_DIGSIG, &iint->atomic_flags)) return; if ((iint->ima_file_status != INTEGRITY_PASS) && !(iint->flags & IMA_HASH)) return; rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) return; inode_lock(file_inode(file)); ima_fix_xattr(dentry, iint); inode_unlock(file_inode(file)); } /** * ima_inode_post_setattr - reflect file metadata changes * @idmap: idmap of the mount the inode was found from * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * Changes to a dentry's metadata might result in needing to appraise. * * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ static void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { struct inode *inode = d_backing_inode(dentry); struct ima_iint_cache *iint; int action; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode) || !(inode->i_opflags & IOP_XATTR)) return; action = ima_must_appraise(idmap, inode, MAY_ACCESS, POST_SETATTR); iint = ima_iint_find(inode); if (iint) { set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags); if (!action) clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); } } /* * ima_protect_xattr - protect 'security.ima' * * Ensure that not just anyone can modify or remove 'security.ima'. */ static int ima_protect_xattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { if (strcmp(xattr_name, XATTR_NAME_IMA) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 1; } return 0; } static void ima_reset_appraise_flags(struct inode *inode, int digsig) { struct ima_iint_cache *iint; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode)) return; iint = ima_iint_find(inode); if (!iint) return; iint->measured_pcrs = 0; set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags); if (digsig) set_bit(IMA_DIGSIG, &iint->atomic_flags); else clear_bit(IMA_DIGSIG, &iint->atomic_flags); } /** * validate_hash_algo() - Block setxattr with unsupported hash algorithms * @dentry: object of the setxattr() * @xattr_value: userland supplied xattr value * @xattr_value_len: length of xattr_value * * The xattr value is mapped to its hash algorithm, and this algorithm * must be built in the kernel for the setxattr to be allowed. * * Emit an audit message when the algorithm is invalid. * * Return: 0 on success, else an error. */ static int validate_hash_algo(struct dentry *dentry, const struct evm_ima_xattr_data *xattr_value, size_t xattr_value_len) { char *path = NULL, *pathbuf = NULL; enum hash_algo xattr_hash_algo; const char *errmsg = "unavailable-hash-algorithm"; unsigned int allowed_hashes; xattr_hash_algo = ima_get_hash_algo(xattr_value, xattr_value_len); allowed_hashes = atomic_read(&ima_setxattr_allowed_hash_algorithms); if (allowed_hashes) { /* success if the algorithm is allowed in the ima policy */ if (allowed_hashes & (1U << xattr_hash_algo)) return 0; /* * We use a different audit message when the hash algorithm * is denied by a policy rule, instead of not being built * in the kernel image */ errmsg = "denied-hash-algorithm"; } else { if (likely(xattr_hash_algo == ima_hash_algo)) return 0; /* allow any xattr using an algorithm built in the kernel */ if (crypto_has_alg(hash_algo_name[xattr_hash_algo], 0, 0)) return 0; } pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) return -EACCES; path = dentry_path(dentry, pathbuf, PATH_MAX); integrity_audit_msg(AUDIT_INTEGRITY_DATA, d_inode(dentry), path, "set_data", errmsg, -EACCES, 0); kfree(pathbuf); return -EACCES; } static int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xvalue = xattr_value; int digsig = 0; int result; int err; result = ima_protect_xattr(dentry, xattr_name, xattr_value, xattr_value_len); if (result == 1) { if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST)) return -EINVAL; err = validate_hash_algo(dentry, xvalue, xattr_value_len); if (err) return err; digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG); } else if (!strcmp(xattr_name, XATTR_NAME_EVM) && xattr_value_len > 0) { digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG); } if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result = 0; } return result; } static int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (evm_revalidate_status(acl_name)) ima_reset_appraise_flags(d_backing_inode(dentry), 0); return 0; } static int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { int result; result = ima_protect_xattr(dentry, xattr_name, NULL, 0); if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), 0); if (result == 1) result = 0; } return result; } static int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ima_inode_set_acl(idmap, dentry, acl_name, NULL); } static struct security_hook_list ima_appraise_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_post_setattr, ima_inode_post_setattr), LSM_HOOK_INIT(inode_setxattr, ima_inode_setxattr), LSM_HOOK_INIT(inode_set_acl, ima_inode_set_acl), LSM_HOOK_INIT(inode_removexattr, ima_inode_removexattr), LSM_HOOK_INIT(inode_remove_acl, ima_inode_remove_acl), }; void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { security_add_hooks(ima_appraise_hooks, ARRAY_SIZE(ima_appraise_hooks), lsmid); }
295 349 1 323 26 348 22 22 26 26 26 26 26 26 26 26 26 26 26 26 26 346 346 27 27 87 86 87 87 535 5 5 5 4 2 5 4 4 4 4 4 4 4 5 2 4 4 4 5 5 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 // SPDX-License-Identifier: GPL-2.0-only /* * fs/libfs.c * Library for filesystems writers. */ #include <linux/blkdev.h> #include <linux/export.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/quotaops.h> #include <linux/mutex.h> #include <linux/namei.h> #include <linux/exportfs.h> #include <linux/iversion.h> #include <linux/writeback.h> #include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <linux/fs_context.h> #include <linux/pseudo_fs.h> #include <linux/fsnotify.h> #include <linux/unicode.h> #include <linux/fscrypt.h> #include <linux/pidfs.h> #include <linux/uaccess.h> #include "internal.h" int simple_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } EXPORT_SYMBOL(simple_getattr); int simple_statfs(struct dentry *dentry, struct kstatfs *buf) { u64 id = huge_encode_dev(dentry->d_sb->s_dev); buf->f_fsid = u64_to_fsid(id); buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; return 0; } EXPORT_SYMBOL(simple_statfs); /* * Retaining negative dentries for an in-memory filesystem just wastes * memory and lookup time: arrange for them to be deleted immediately. */ int always_delete_dentry(const struct dentry *dentry) { return 1; } EXPORT_SYMBOL(always_delete_dentry); const struct dentry_operations simple_dentry_operations = { .d_delete = always_delete_dentry, }; EXPORT_SYMBOL(simple_dentry_operations); /* * Lookup the data. This is trivial - if the dentry didn't already * exist, we know it is negative. Set d_op to delete negative dentries. */ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_sb->s_d_op) d_set_d_op(dentry, &simple_dentry_operations); if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) return NULL; d_add(dentry, NULL); return NULL; } EXPORT_SYMBOL(simple_lookup); int dcache_dir_open(struct inode *inode, struct file *file) { file->private_data = d_alloc_cursor(file->f_path.dentry); return file->private_data ? 0 : -ENOMEM; } EXPORT_SYMBOL(dcache_dir_open); int dcache_dir_close(struct inode *inode, struct file *file) { dput(file->private_data); return 0; } EXPORT_SYMBOL(dcache_dir_close); /* parent is locked at least shared */ /* * Returns an element of siblings' list. * We are looking for <count>th positive after <p>; if * found, dentry is grabbed and returned to caller. * If no such element exists, NULL is returned. */ static struct dentry *scan_positives(struct dentry *cursor, struct hlist_node **p, loff_t count, struct dentry *last) { struct dentry *dentry = cursor->d_parent, *found = NULL; spin_lock(&dentry->d_lock); while (*p) { struct dentry *d = hlist_entry(*p, struct dentry, d_sib); p = &d->d_sib.next; // we must at least skip cursors, to avoid livelocks if (d->d_flags & DCACHE_DENTRY_CURSOR) continue; if (simple_positive(d) && !--count) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) found = dget_dlock(d); spin_unlock(&d->d_lock); if (likely(found)) break; count = 1; } if (need_resched()) { if (!hlist_unhashed(&cursor->d_sib)) __hlist_del(&cursor->d_sib); hlist_add_behind(&cursor->d_sib, &d->d_sib); p = &cursor->d_sib.next; spin_unlock(&dentry->d_lock); cond_resched(); spin_lock(&dentry->d_lock); } } spin_unlock(&dentry->d_lock); dput(last); return found; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; switch (whence) { case 1: offset += file->f_pos; fallthrough; case 0: if (offset >= 0) break; fallthrough; default: return -EINVAL; } if (offset != file->f_pos) { struct dentry *cursor = file->private_data; struct dentry *to = NULL; inode_lock_shared(dentry->d_inode); if (offset > 2) to = scan_positives(cursor, &dentry->d_children.first, offset - 2, NULL); spin_lock(&dentry->d_lock); hlist_del_init(&cursor->d_sib); if (to) hlist_add_behind(&cursor->d_sib, &to->d_sib); spin_unlock(&dentry->d_lock); dput(to); file->f_pos = offset; inode_unlock_shared(dentry->d_inode); } return offset; } EXPORT_SYMBOL(dcache_dir_lseek); /* * Directory is locked and all positive dentries in it are safe, since * for ramfs-type trees they can't go away without unlink() or rmdir(), * both impossible due to the lock on directory. */ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; struct dentry *next = NULL; struct hlist_node **p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) p = &dentry->d_children.first; else p = &cursor->d_sib.next; while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, fs_umode_to_dtype(d_inode(next)->i_mode))) break; ctx->pos++; p = &next->d_sib.next; } spin_lock(&dentry->d_lock); hlist_del_init(&cursor->d_sib); if (next) hlist_add_before(&cursor->d_sib, &next->d_sib); spin_unlock(&dentry->d_lock); dput(next); return 0; } EXPORT_SYMBOL(dcache_readdir); ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) { return -EISDIR; } EXPORT_SYMBOL(generic_read_dir); const struct file_operations simple_dir_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .llseek = dcache_dir_lseek, .read = generic_read_dir, .iterate_shared = dcache_readdir, .fsync = noop_fsync, }; EXPORT_SYMBOL(simple_dir_operations); const struct inode_operations simple_dir_inode_operations = { .lookup = simple_lookup, }; EXPORT_SYMBOL(simple_dir_inode_operations); /* simple_offset_add() never assigns these to a dentry */ enum { DIR_OFFSET_FIRST = 2, /* Find first real entry */ DIR_OFFSET_EOD = S32_MAX, }; /* simple_offset_add() allocation range */ enum { DIR_OFFSET_MIN = DIR_OFFSET_FIRST + 1, DIR_OFFSET_MAX = DIR_OFFSET_EOD - 1, }; static void offset_set(struct dentry *dentry, long offset) { dentry->d_fsdata = (void *)offset; } static long dentry2offset(struct dentry *dentry) { return (long)dentry->d_fsdata; } static struct lock_class_key simple_offset_lock_class; /** * simple_offset_init - initialize an offset_ctx * @octx: directory offset map to be initialized * */ void simple_offset_init(struct offset_ctx *octx) { mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE); lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class); octx->next_offset = DIR_OFFSET_MIN; } /** * simple_offset_add - Add an entry to a directory's offset map * @octx: directory offset ctx to be updated * @dentry: new dentry being added * * Returns zero on success. @octx and the dentry's offset are updated. * Otherwise, a negative errno value is returned. */ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { unsigned long offset; int ret; if (dentry2offset(dentry) != 0) return -EBUSY; ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, DIR_OFFSET_MAX, &octx->next_offset, GFP_KERNEL); if (unlikely(ret < 0)) return ret == -EBUSY ? -ENOSPC : ret; offset_set(dentry, offset); return 0; } static int simple_offset_replace(struct offset_ctx *octx, struct dentry *dentry, long offset) { int ret; ret = mtree_store(&octx->mt, offset, dentry, GFP_KERNEL); if (ret) return ret; offset_set(dentry, offset); return 0; } /** * simple_offset_remove - Remove an entry to a directory's offset map * @octx: directory offset ctx to be updated * @dentry: dentry being removed * */ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) { long offset; offset = dentry2offset(dentry); if (offset == 0) return; mtree_erase(&octx->mt, offset); offset_set(dentry, 0); } /** * simple_offset_rename - handle directory offsets for rename * @old_dir: parent directory of source entry * @old_dentry: dentry of source entry * @new_dir: parent_directory of destination entry * @new_dentry: dentry of destination * * Caller provides appropriate serialization. * * User space expects the directory offset value of the replaced * (new) directory entry to be unchanged after a rename. * * Returns zero on success, a negative errno value on failure. */ int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); long new_offset = dentry2offset(new_dentry); simple_offset_remove(old_ctx, old_dentry); if (new_offset) { offset_set(new_dentry, 0); return simple_offset_replace(new_ctx, old_dentry, new_offset); } return simple_offset_add(new_ctx, old_dentry); } /** * simple_offset_rename_exchange - exchange rename with directory offsets * @old_dir: parent of dentry being moved * @old_dentry: dentry being moved * @new_dir: destination parent * @new_dentry: destination dentry * * This API preserves the directory offset values. Caller provides * appropriate serialization. * * Returns zero on success. Otherwise a negative errno is returned and the * rename is rolled back. */ int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); long old_index = dentry2offset(old_dentry); long new_index = dentry2offset(new_dentry); int ret; simple_offset_remove(old_ctx, old_dentry); simple_offset_remove(new_ctx, new_dentry); ret = simple_offset_replace(new_ctx, old_dentry, new_index); if (ret) goto out_restore; ret = simple_offset_replace(old_ctx, new_dentry, old_index); if (ret) { simple_offset_remove(new_ctx, old_dentry); goto out_restore; } ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (ret) { simple_offset_remove(new_ctx, old_dentry); simple_offset_remove(old_ctx, new_dentry); goto out_restore; } return 0; out_restore: (void)simple_offset_replace(old_ctx, old_dentry, old_index); (void)simple_offset_replace(new_ctx, new_dentry, new_index); return ret; } /** * simple_offset_destroy - Release offset map * @octx: directory offset ctx that is about to be destroyed * * During fs teardown (eg. umount), a directory's offset map might still * contain entries. xa_destroy() cleans out anything that remains. */ void simple_offset_destroy(struct offset_ctx *octx) { mtree_destroy(&octx->mt); } /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated * @offset: a byte offset * @whence: enumerator describing the starting position for this update * * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories. * * Returns the updated read position if successful; otherwise a * negative errno is returned and the read position remains unchanged. */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_CUR: offset += file->f_pos; fallthrough; case SEEK_SET: if (offset >= 0) break; fallthrough; default: return -EINVAL; } return vfs_setpos(file, offset, LONG_MAX); } static struct dentry *find_positive_dentry(struct dentry *parent, struct dentry *dentry, bool next) { struct dentry *found = NULL; spin_lock(&parent->d_lock); if (next) dentry = d_next_sibling(dentry); else if (!dentry) dentry = d_first_child(parent); hlist_for_each_entry_from(dentry, d_sib) { if (!simple_positive(dentry)) continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(dentry)) found = dget_dlock(dentry); spin_unlock(&dentry->d_lock); if (likely(found)) break; } spin_unlock(&parent->d_lock); return found; } static noinline_for_stack struct dentry * offset_dir_lookup(struct dentry *parent, loff_t offset) { struct inode *inode = d_inode(parent); struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *child, *found = NULL; MA_STATE(mas, &octx->mt, offset, offset); if (offset == DIR_OFFSET_FIRST) found = find_positive_dentry(parent, NULL, false); else { rcu_read_lock(); child = mas_find_rev(&mas, DIR_OFFSET_MIN); found = find_positive_dentry(parent, child, false); rcu_read_unlock(); } return found; } static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { struct inode *inode = d_inode(dentry); return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } static void offset_iterate_dir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; struct dentry *dentry; dentry = offset_dir_lookup(dir, ctx->pos); if (!dentry) goto out_eod; while (true) { struct dentry *next; ctx->pos = dentry2offset(dentry); if (!offset_dir_emit(ctx, dentry)) break; next = find_positive_dentry(dir, dentry, true); dput(dentry); if (!next) goto out_eod; dentry = next; } dput(dentry); return; out_eod: ctx->pos = DIR_OFFSET_EOD; } /** * offset_readdir - Emit entries starting at offset @ctx->pos * @file: an open directory to iterate over * @ctx: directory iteration context * * Caller must hold @file's i_rwsem to prevent insertion or removal of * entries during this call. * * On entry, @ctx->pos contains an offset that represents the first entry * to be read from the directory. * * The operation continues until there are no more entries to read, or * until the ctx->actor indicates there is no more space in the caller's * output buffer. * * On return, @ctx->pos contains an offset that will read the next entry * in this directory when offset_readdir() is called again with @ctx. * Caller places this value in the d_off field of the last entry in the * user's buffer. * * Return values: * %0 - Complete */ static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; lockdep_assert_held(&d_inode(dir)->i_rwsem); if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos != DIR_OFFSET_EOD) offset_iterate_dir(file, ctx); return 0; } const struct file_operations simple_offset_dir_operations = { .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir, .fsync = noop_fsync, }; struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { struct dentry *child = NULL, *d; spin_lock(&parent->d_lock); d = prev ? d_next_sibling(prev) : d_first_child(parent); hlist_for_each_entry_from(d, d_sib) { if (simple_positive(d)) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) child = dget_dlock(d); spin_unlock(&d->d_lock); if (likely(child)) break; } } spin_unlock(&parent->d_lock); dput(prev); return child; } EXPORT_SYMBOL(find_next_child); void simple_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *)) { struct dentry *this = dget(dentry); while (true) { struct dentry *victim = NULL, *child; struct inode *inode = this->d_inode; inode_lock(inode); if (d_is_dir(this)) inode->i_flags |= S_DEAD; while ((child = find_next_child(this, victim)) == NULL) { // kill and ascend // update metadata while it's still locked inode_set_ctime_current(inode); clear_nlink(inode); inode_unlock(inode); victim = this; this = this->d_parent; inode = this->d_inode; inode_lock(inode); if (simple_positive(victim)) { d_invalidate(victim); // avoid lost mounts if (d_is_dir(victim)) fsnotify_rmdir(inode, victim); else fsnotify_unlink(inode, victim); if (callback) callback(victim); dput(victim); // unpin it } if (victim == dentry) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (d_is_dir(dentry)) drop_nlink(inode); inode_unlock(inode); dput(dentry); return; } } inode_unlock(inode); this = child; } } EXPORT_SYMBOL(simple_recursive_removal); static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) { struct pseudo_fs_context *ctx = fc->fs_private; struct inode *root; s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = ctx->magic; s->s_op = ctx->ops ?: &simple_super_operations; s->s_export_op = ctx->eops; s->s_xattr = ctx->xattr; s->s_time_gran = 1; root = new_inode(s); if (!root) return -ENOMEM; /* * since this is the first inode, make it number 1. New inodes created * after this must take care not to collide with it (by passing * max_reserved of 1 to iunique). */ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; simple_inode_init_ts(root); s->s_root = d_make_root(root); if (!s->s_root) return -ENOMEM; s->s_d_op = ctx->dops; return 0; } static int pseudo_fs_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, pseudo_fs_fill_super); } static void pseudo_fs_free(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations pseudo_fs_context_ops = { .free = pseudo_fs_free, .get_tree = pseudo_fs_get_tree, }; /* * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ struct pseudo_fs_context *init_pseudo(struct fs_context *fc, unsigned long magic) { struct pseudo_fs_context *ctx; ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL); if (likely(ctx)) { ctx->magic = magic; fc->fs_private = ctx; fc->ops = &pseudo_fs_context_ops; fc->sb_flags |= SB_NOUSER; fc->global = true; } return ctx; } EXPORT_SYMBOL(init_pseudo); int simple_open(struct inode *inode, struct file *file) { if (inode->i_private) file->private_data = inode->i_private; return 0; } EXPORT_SYMBOL(simple_open); int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inc_nlink(inode); ihold(inode); dget(dentry); d_instantiate(dentry, inode); return 0; } EXPORT_SYMBOL(simple_link); int simple_empty(struct dentry *dentry) { struct dentry *child; int ret = 0; spin_lock(&dentry->d_lock); hlist_for_each_entry(child, &dentry->d_children, d_sib) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { spin_unlock(&child->d_lock); goto out; } spin_unlock(&child->d_lock); } ret = 1; out: spin_unlock(&dentry->d_lock); return ret; } EXPORT_SYMBOL(simple_empty); int simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); drop_nlink(inode); dput(dentry); return 0; } EXPORT_SYMBOL(simple_unlink); int simple_rmdir(struct inode *dir, struct dentry *dentry) { if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); simple_unlink(dir, dentry); drop_nlink(dir); return 0; } EXPORT_SYMBOL(simple_rmdir); /** * simple_rename_timestamp - update the various inode timestamps for rename * @old_dir: old parent directory * @old_dentry: dentry that is being renamed * @new_dir: new parent directory * @new_dentry: target for rename * * POSIX mandates that the old and new parent directories have their ctime and * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have * their ctime updated. */ void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode *newino = d_inode(new_dentry); inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); if (new_dir != old_dir) inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir)); inode_set_ctime_current(d_inode(old_dentry)); if (newino) inode_set_ctime_current(newino); } EXPORT_SYMBOL_GPL(simple_rename_timestamp); int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { bool old_is_dir = d_is_dir(old_dentry); bool new_is_dir = d_is_dir(new_dentry); if (old_dir != new_dir && old_is_dir != new_is_dir) { if (old_is_dir) { drop_nlink(old_dir); inc_nlink(new_dir); } else { drop_nlink(new_dir); inc_nlink(old_dir); } } simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL_GPL(simple_rename_exchange); int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int they_are_dirs = d_is_dir(old_dentry); if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; if (flags & RENAME_EXCHANGE) return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (d_really_is_positive(new_dentry)) { simple_unlink(new_dir, new_dentry); if (they_are_dirs) { drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL(simple_rename); /** * simple_setattr - setattr for simple filesystem * @idmap: idmap of the target mount * @dentry: dentry * @iattr: iattr structure * * Returns 0 on success, -error on failure. * * simple_setattr is a simple ->setattr implementation without a proper * implementation of size changes. * * It can either be used for in-memory filesystems or special files * on simple regular filesystems. Anything that needs to change on-disk * or wire state on size changes needs its own setattr method. */ int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; error = setattr_prepare(idmap, dentry, iattr); if (error) return error; if (iattr->ia_valid & ATTR_SIZE) truncate_setsize(inode, iattr->ia_size); setattr_copy(idmap, inode, iattr); mark_inode_dirty(inode); return 0; } EXPORT_SYMBOL(simple_setattr); static int simple_read_folio(struct file *file, struct folio *folio) { folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); folio_mark_uptodate(folio); folio_unlock(folio); return 0; } int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct folio *folio; folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); *foliop = folio; if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { size_t from = offset_in_folio(folio, pos); folio_zero_segments(folio, 0, from, from + len, folio_size(folio)); } return 0; } EXPORT_SYMBOL(simple_write_begin); /** * simple_write_end - .write_end helper for non-block-device FSes * @file: See .write_end of address_space_operations * @mapping: " * @pos: " * @len: " * @copied: " * @folio: " * @fsdata: " * * simple_write_end does the minimum needed for updating a folio after * writing is done. It has the same API signature as the .write_end of * address_space_operations vector. So it can just be set onto .write_end for * FSes that don't need any other processing. i_mutex is assumed to be held. * Block based filesystems should use generic_write_end(). * NOTE: Even though i_size might get updated by this function, mark_inode_dirty * is not called, so a filesystem that actually does store data in .write_inode * should extend on what's done here with a call to mark_inode_dirty() in the * case that i_size has changed. * * Use *ONLY* with simple_read_folio() */ static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; /* zero the stale part of the folio if we did a short copy */ if (!folio_test_uptodate(folio)) { if (copied < len) { size_t from = offset_in_folio(folio, pos); folio_zero_range(folio, from + copied, len - copied); } folio_mark_uptodate(folio); } /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_mutex. */ if (last_pos > inode->i_size) i_size_write(inode, last_pos); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } /* * Provides ramfs-style behavior: data in the pagecache, but no writeback. */ const struct address_space_operations ram_aops = { .read_folio = simple_read_folio, .write_begin = simple_write_begin, .write_end = simple_write_end, .dirty_folio = noop_dirty_folio, }; EXPORT_SYMBOL(ram_aops); /* * the inodes created here are not hashed. If you use iunique to generate * unique inode values later for this filesystem, then you must take care * to pass it an appropriate max_reserved value to avoid collisions. */ int simple_fill_super(struct super_block *s, unsigned long magic, const struct tree_descr *files) { struct inode *inode; struct dentry *dentry; int i; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; s->s_op = &simple_super_operations; s->s_time_gran = 1; inode = new_inode(s); if (!inode) return -ENOMEM; /* * because the root inode is 1, the files array must not contain an * entry at index 1 */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; simple_inode_init_ts(inode); inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); s->s_root = d_make_root(inode); if (!s->s_root) return -ENOMEM; for (i = 0; !files->name || files->name[0]; i++, files++) { if (!files->name) continue; /* warn if it tries to conflict with the root inode */ if (unlikely(i == 1)) printk(KERN_WARNING "%s: %s passed in a files array" "with an index of 1!\n", __func__, s->s_type->name); dentry = d_alloc_name(s->s_root, files->name); if (!dentry) return -ENOMEM; inode = new_inode(s); if (!inode) { dput(dentry); return -ENOMEM; } inode->i_mode = S_IFREG | files->mode; simple_inode_init_ts(inode); inode->i_fop = files->ops; inode->i_ino = i; d_add(dentry, inode); } return 0; } EXPORT_SYMBOL(simple_fill_super); static DEFINE_SPINLOCK(pin_fs_lock); int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) { struct vfsmount *mnt = NULL; spin_lock(&pin_fs_lock); if (unlikely(!*mount)) { spin_unlock(&pin_fs_lock); mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (IS_ERR(mnt)) return PTR_ERR(mnt); spin_lock(&pin_fs_lock); if (!*mount) *mount = mnt; } mntget(*mount); ++*count; spin_unlock(&pin_fs_lock); mntput(mnt); return 0; } EXPORT_SYMBOL(simple_pin_fs); void simple_release_fs(struct vfsmount **mount, int *count) { struct vfsmount *mnt; spin_lock(&pin_fs_lock); mnt = *mount; if (!--*count) *mount = NULL; spin_unlock(&pin_fs_lock); mntput(mnt); } EXPORT_SYMBOL(simple_release_fs); /** * simple_read_from_buffer - copy data from the buffer to user space * @to: the user space buffer to read to * @count: the maximum number of bytes to read * @ppos: the current position in the buffer * @from: the buffer to read from * @available: the size of the buffer * * The simple_read_from_buffer() function reads up to @count bytes from the * buffer @from at offset @ppos into the user space address starting at @to. * * On success, the number of bytes read is returned and the offset @ppos is * advanced by this number, or negative value is returned on error. **/ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; size_t ret; if (pos < 0) return -EINVAL; if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; ret = copy_to_user(to, from + pos, count); if (ret == count) return -EFAULT; count -= ret; *ppos = pos + count; return count; } EXPORT_SYMBOL(simple_read_from_buffer); /** * simple_write_to_buffer - copy data from user space to the buffer * @to: the buffer to write to * @available: the size of the buffer * @ppos: the current position in the buffer * @from: the user space buffer to read from * @count: the maximum number of bytes to read * * The simple_write_to_buffer() function reads up to @count bytes from the user * space address starting at @from into the buffer @to at offset @ppos. * * On success, the number of bytes written is returned and the offset @ppos is * advanced by this number, or negative value is returned on error. **/ ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count) { loff_t pos = *ppos; size_t res; if (pos < 0) return -EINVAL; if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; res = copy_from_user(to + pos, from, count); if (res == count) return -EFAULT; count -= res; *ppos = pos + count; return count; } EXPORT_SYMBOL(simple_write_to_buffer); /** * memory_read_from_buffer - copy data from the buffer * @to: the kernel space buffer to read to * @count: the maximum number of bytes to read * @ppos: the current position in the buffer * @from: the buffer to read from * @available: the size of the buffer * * The memory_read_from_buffer() function reads up to @count bytes from the * buffer @from at offset @ppos into the kernel space address starting at @to. * * On success, the number of bytes read is returned and the offset @ppos is * advanced by this number, or negative value is returned on error. **/ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; if (pos < 0) return -EINVAL; if (pos >= available) return 0; if (count > available - pos) count = available - pos; memcpy(to, from + pos, count); *ppos = pos + count; return count; } EXPORT_SYMBOL(memory_read_from_buffer); /* * Transaction based IO. * The file expects a single write which triggers the transaction, and then * possibly a read which collects the result - which is stored in a * file-local buffer. */ void simple_transaction_set(struct file *file, size_t n) { struct simple_transaction_argresp *ar = file->private_data; BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); /* * The barrier ensures that ar->size will really remain zero until * ar->data is ready for reading. */ smp_mb(); ar->size = n; } EXPORT_SYMBOL(simple_transaction_set); char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) { struct simple_transaction_argresp *ar; static DEFINE_SPINLOCK(simple_transaction_lock); if (size > SIMPLE_TRANSACTION_LIMIT - 1) return ERR_PTR(-EFBIG); ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL); if (!ar) return ERR_PTR(-ENOMEM); spin_lock(&simple_transaction_lock); /* only one write allowed per open */ if (file->private_data) { spin_unlock(&simple_transaction_lock); free_page((unsigned long)ar); return ERR_PTR(-EBUSY); } file->private_data = ar; spin_unlock(&simple_transaction_lock); if (copy_from_user(ar->data, buf, size)) return ERR_PTR(-EFAULT); return ar->data; } EXPORT_SYMBOL(simple_transaction_get); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { struct simple_transaction_argresp *ar = file->private_data; if (!ar) return 0; return simple_read_from_buffer(buf, size, pos, ar->data, ar->size); } EXPORT_SYMBOL(simple_transaction_read); int simple_transaction_release(struct inode *inode, struct file *file) { free_page((unsigned long)file->private_data); return 0; } EXPORT_SYMBOL(simple_transaction_release); /* Simple attribute files */ struct simple_attr { int (*get)(void *, u64 *); int (*set)(void *, u64); char get_buf[24]; /* enough to store a u64 and "\n\0" */ char set_buf[24]; void *data; const char *fmt; /* format for read operation */ struct mutex mutex; /* protects access to these buffers */ }; /* simple_attr_open is called by an actual attribute open file operation * to set the attribute specific access operations. */ int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt) { struct simple_attr *attr; attr = kzalloc(sizeof(*attr), GFP_KERNEL); if (!attr) return -ENOMEM; attr->get = get; attr->set = set; attr->data = inode->i_private; attr->fmt = fmt; mutex_init(&attr->mutex); file->private_data = attr; return nonseekable_open(inode, file); } EXPORT_SYMBOL_GPL(simple_attr_open); int simple_attr_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } EXPORT_SYMBOL_GPL(simple_attr_release); /* GPL-only? This? Really? */ /* read from the buffer that is filled with the get function */ ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct simple_attr *attr; size_t size; ssize_t ret; attr = file->private_data; if (!attr->get) return -EACCES; ret = mutex_lock_interruptible(&attr->mutex); if (ret) return ret; if (*ppos && attr->get_buf[0]) { /* continued read */ size = strlen(attr->get_buf); } else { /* first read */ u64 val; ret = attr->get(attr->data, &val); if (ret) goto out; size = scnprintf(attr->get_buf, sizeof(attr->get_buf), attr->fmt, (unsigned long long)val); } ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); out: mutex_unlock(&attr->mutex); return ret; } EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number to call the set function with */ static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; size_t size; ssize_t ret; attr = file->private_data; if (!attr->set) return -EACCES; ret = mutex_lock_interruptible(&attr->mutex); if (ret) return ret; ret = -EFAULT; size = min(sizeof(attr->set_buf) - 1, len); if (copy_from_user(attr->set_buf, buf, size)) goto out; attr->set_buf[size] = '\0'; if (is_signed) ret = kstrtoll(attr->set_buf, 0, &val); else ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); if (ret == 0) ret = len; /* on success, claim we got the whole input */ out: mutex_unlock(&attr->mutex); return ret; } ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return simple_attr_write_xsigned(file, buf, len, ppos, false); } EXPORT_SYMBOL_GPL(simple_attr_write); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return simple_attr_write_xsigned(file, buf, len, ppos, true); } EXPORT_SYMBOL_GPL(simple_attr_write_signed); /** * generic_encode_ino32_fh - generic export_operations->encode_fh function * @inode: the object to encode * @fh: where to store the file handle fragment * @max_len: maximum length to store there (in 4 byte units) * @parent: parent directory inode, if wanted * * This generic encode_fh function assumes that the 32 inode number * is suitable for locating an inode, and that the generation number * can be used to check that it is still valid. It places them in the * filehandle fragment where export_decode_fh expects to find them. */ int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { struct fid *fid = (void *)fh; int len = *max_len; int type = FILEID_INO32_GEN; if (parent && (len < 4)) { *max_len = 4; return FILEID_INVALID; } else if (len < 2) { *max_len = 2; return FILEID_INVALID; } len = 2; fid->i32.ino = inode->i_ino; fid->i32.gen = inode->i_generation; if (parent) { fid->i32.parent_ino = parent->i_ino; fid->i32.parent_gen = parent->i_generation; len = 4; type = FILEID_INO32_GEN_PARENT; } *max_len = len; return type; } EXPORT_SYMBOL_GPL(generic_encode_ino32_fh); /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes * @fh_type: type of file handle * @get_inode: filesystem callback to retrieve inode * * This function decodes @fid as long as it has one of the well-known * Linux filehandle types and calls @get_inode on it to retrieve the * inode for the object specified in the file handle. */ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)) { struct inode *inode = NULL; if (fh_len < 2) return NULL; switch (fh_type) { case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: inode = get_inode(sb, fid->i32.ino, fid->i32.gen); break; } return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_dentry); /** * generic_fh_to_parent - generic helper for the fh_to_parent export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes * @fh_type: type of file handle * @get_inode: filesystem callback to retrieve inode * * This function decodes @fid as long as it has one of the well-known * Linux filehandle types and calls @get_inode on it to retrieve the * inode for the _parent_ object specified in the file handle if it * is specified in the file handle, or NULL otherwise. */ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)) { struct inode *inode = NULL; if (fh_len <= 2) return NULL; switch (fh_type) { case FILEID_INO32_GEN_PARENT: inode = get_inode(sb, fid->i32.parent_ino, (fh_len > 3 ? fid->i32.parent_gen : 0)); break; } return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_parent); /** * __generic_file_fsync - generic fsync implementation for simple filesystems * * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This is a generic implementation of the fsync method for simple * filesystems which track all non-inode metadata in the buffers list * hanging off the address_space structure. */ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int err; int ret; err = file_write_and_wait_range(file, start, end); if (err) return err; inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY_ALL)) goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; out: inode_unlock(inode); /* check and advance again to catch errors after syncing out buffers */ err = file_check_and_advance_wb_err(file); if (ret == 0) ret = err; return ret; } EXPORT_SYMBOL(__generic_file_fsync); /** * generic_file_fsync - generic fsync implementation for simple filesystems * with flush * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * */ int generic_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int err; err = __generic_file_fsync(file, start, end, datasync); if (err) return err; return blkdev_issue_flush(inode->i_sb->s_bdev); } EXPORT_SYMBOL(generic_file_fsync); /** * generic_check_addressable - Check addressability of file system * @blocksize_bits: log of file system block size * @num_blocks: number of blocks in file system * * Determine whether a file system with @num_blocks blocks (and a * block size of 2**@blocksize_bits) is addressable by the sector_t * and page cache of the system. Return 0 if so and -EFBIG otherwise. */ int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) { u64 last_fs_block = num_blocks - 1; u64 last_fs_page = last_fs_block >> (PAGE_SHIFT - blocksize_bits); if (unlikely(num_blocks == 0)) return 0; if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT)) return -EINVAL; if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || (last_fs_page > (pgoff_t)(~0ULL))) { return -EFBIG; } return 0; } EXPORT_SYMBOL(generic_check_addressable); /* * No-op implementation of ->fsync for in-memory filesystems. */ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) { return 0; } EXPORT_SYMBOL(noop_fsync); ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { /* * iomap based filesystems support direct I/O without need for * this callback. However, it still needs to be set in * inode->a_ops so that open/fcntl know that direct I/O is * generally supported. */ return -EINVAL; } EXPORT_SYMBOL_GPL(noop_direct_IO); /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ void kfree_link(void *p) { kfree(p); } EXPORT_SYMBOL(kfree_link); struct inode *alloc_anon_inode(struct super_block *s) { static const struct address_space_operations anon_aops = { .dirty_folio = noop_dirty_folio, }; struct inode *inode = new_inode_pseudo(s); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &anon_aops; /* * Mark the inode dirty from the very beginning, * that way it will never be moved to the dirty * list because mark_inode_dirty() will think * that it already _is_ on the dirty list. */ inode->i_state = I_DIRTY; /* * Historically anonymous inodes didn't have a type at all and * userspace has come to rely on this. Internally they're just * regular files but S_IFREG is masked off when reporting * information to userspace. */ inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE | S_ANON_INODE; simple_inode_init_ts(inode); return inode; } EXPORT_SYMBOL(alloc_anon_inode); /** * simple_nosetlease - generic helper for prohibiting leases * @filp: file pointer * @arg: type of lease to obtain * @flp: new lease supplied for insertion * @priv: private data for lm_setup operation * * Generic helper for filesystems that do not wish to allow leases to be set. * All arguments are ignored and it just returns -EINVAL. */ int simple_nosetlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { return -EINVAL; } EXPORT_SYMBOL(simple_nosetlease); /** * simple_get_link - generic helper to get the target of "fast" symlinks * @dentry: not used here * @inode: the symlink inode * @done: not used here * * Generic helper for filesystems to use for symlink inodes where a pointer to * the symlink target is stored in ->i_link. NOTE: this isn't normally called, * since as an optimization the path lookup code uses any non-NULL ->i_link * directly, without calling ->get_link(). But ->get_link() still must be set, * to mark the inode_operations as being for a symlink. * * Return: the symlink target */ const char *simple_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return inode->i_link; } EXPORT_SYMBOL(simple_get_link); const struct inode_operations simple_symlink_inode_operations = { .get_link = simple_get_link, }; EXPORT_SYMBOL(simple_symlink_inode_operations); /* * Operations for a permanently empty directory. */ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-ENOENT); } static int empty_dir_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EPERM; } static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) { return -EOPNOTSUPP; } static const struct inode_operations empty_dir_inode_operations = { .lookup = empty_dir_lookup, .setattr = empty_dir_setattr, .listxattr = empty_dir_listxattr, }; static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) { /* An empty directory has two entries . and .. at offsets 0 and 1 */ return generic_file_llseek_size(file, offset, whence, 2, 2); } static int empty_dir_readdir(struct file *file, struct dir_context *ctx) { dir_emit_dots(file, ctx); return 0; } static const struct file_operations empty_dir_operations = { .llseek = empty_dir_llseek, .read = generic_read_dir, .iterate_shared = empty_dir_readdir, .fsync = noop_fsync, }; void make_empty_dir_inode(struct inode *inode) { set_nlink(inode, 2); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_rdev = 0; inode->i_size = 0; inode->i_blkbits = PAGE_SHIFT; inode->i_blocks = 0; inode->i_op = &empty_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &empty_dir_operations; } bool is_empty_dir_inode(struct inode *inode) { return (inode->i_fop == &empty_dir_operations) && (inode->i_op == &empty_dir_inode_operations); } #if IS_ENABLED(CONFIG_UNICODE) /** * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems * @dentry: dentry whose name we are checking against * @len: len of name of dentry * @str: str pointer to name of dentry * @name: Name to compare against * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { const struct dentry *parent; const struct inode *dir; union shortname_store strbuf; struct qstr qstr; /* * Attempt a case-sensitive match first. It is cheaper and * should cover most lookups, including all the sane * applications that expect a case-sensitive filesystem. * * This comparison is safe under RCU because the caller * guarantees the consistency between str and len. See * __d_lookup_rcu_op_compare() for details. */ if (len == name->len && !memcmp(str, name->name, len)) return 0; parent = READ_ONCE(dentry->d_parent); dir = READ_ONCE(parent->d_inode); if (!dir || !IS_CASEFOLDED(dir)) return 1; qstr.len = len; qstr.name = str; /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. If this happens, the VFS will eventually retry * the lookup, so it doesn't matter what ->d_compare() returns. * However, it's unsafe to call utf8_strncasecmp() with an unstable * string. Therefore, we have to copy the name into a temporary buffer. * As above, len is guaranteed to match str, so the shortname case * is exactly when str points to ->d_shortname. */ if (qstr.name == dentry->d_shortname.string) { strbuf = dentry->d_shortname; // NUL is guaranteed to be in there qstr.name = strbuf.string; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems * @dentry: dentry of the parent directory * @str: qstr of name whose hash we should fill in * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; const struct unicode_map *um = sb->s_encoding; int ret; if (!dir || !IS_CASEFOLDED(dir)) return 0; ret = utf8_casefold_hash(um, dentry, str); if (ret < 0 && sb_has_strict_encoding(sb)) return -EINVAL; return 0; } EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, #ifdef CONFIG_FS_ENCRYPTION .d_revalidate = fscrypt_d_revalidate, #endif }; /** * generic_ci_match() - Match a name (case-insensitively) with a dirent. * This is a filesystem helper for comparison with directory entries. * generic_ci_d_compare should be used in VFS' ->d_compare instead. * * @parent: Inode of the parent of the dirent under comparison * @name: name under lookup. * @folded_name: Optional pre-folded name under lookup * @de_name: Dirent name. * @de_name_len: dirent name length. * * Test whether a case-insensitive directory entry matches the filename * being searched. If @folded_name is provided, it is used instead of * recalculating the casefold of @name. * * Return: > 0 if the directory entry matches, 0 if it doesn't match, or * < 0 on error. */ int generic_ci_match(const struct inode *parent, const struct qstr *name, const struct qstr *folded_name, const u8 *de_name, u32 de_name_len) { const struct super_block *sb = parent->i_sb; const struct unicode_map *um = sb->s_encoding; struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); struct qstr dirent = QSTR_INIT(de_name, de_name_len); int res = 0; if (IS_ENCRYPTED(parent)) { const struct fscrypt_str encrypted_name = FSTR_INIT((u8 *) de_name, de_name_len); if (WARN_ON_ONCE(!fscrypt_has_encryption_key(parent))) return -EINVAL; decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); if (!decrypted_name.name) return -ENOMEM; res = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, &decrypted_name); if (res < 0) { kfree(decrypted_name.name); return res; } dirent.name = decrypted_name.name; dirent.len = decrypted_name.len; } /* * Attempt a case-sensitive match first. It is cheaper and * should cover most lookups, including all the sane * applications that expect a case-sensitive filesystem. */ if (dirent.len == name->len && !memcmp(name->name, dirent.name, dirent.len)) goto out; if (folded_name->name) res = utf8_strncasecmp_folded(um, folded_name, &dirent); else res = utf8_strncasecmp(um, name, &dirent); out: kfree(decrypted_name.name); if (res < 0 && sb_has_strict_encoding(sb)) { pr_err_ratelimited("Directory contains filename that is invalid UTF-8"); return 0; } return !res; } EXPORT_SYMBOL(generic_ci_match); #endif #ifdef CONFIG_FS_ENCRYPTION static const struct dentry_operations generic_encrypted_dentry_ops = { .d_revalidate = fscrypt_d_revalidate, }; #endif /** * generic_set_sb_d_ops - helper for choosing the set of * filesystem-wide dentry operations for the enabled features * @sb: superblock to be configured * * Filesystems supporting casefolding and/or fscrypt can call this * helper at mount-time to configure sb->s_d_op to best set of dentry * operations required for the enabled features. The helper must be * called after these have been configured, but before the root dentry * is created. */ void generic_set_sb_d_ops(struct super_block *sb) { #if IS_ENABLED(CONFIG_UNICODE) if (sb->s_encoding) { sb->s_d_op = &generic_ci_dentry_ops; return; } #endif #ifdef CONFIG_FS_ENCRYPTION if (sb->s_cop) { sb->s_d_op = &generic_encrypted_dentry_ops; return; } #endif } EXPORT_SYMBOL(generic_set_sb_d_ops); /** * inode_maybe_inc_iversion - increments i_version * @inode: inode with the i_version that should be updated * @force: increment the counter even if it's not necessary? * * Every time the inode is modified, the i_version field must be seen to have * changed by any observer. * * If "force" is set or the QUERIED flag is set, then ensure that we increment * the value, and clear the queried flag. * * In the common case where neither is set, then we can return "false" without * updating i_version. * * If this function returns false, and no other metadata has changed, then we * can avoid logging the metadata. */ bool inode_maybe_inc_iversion(struct inode *inode, bool force) { u64 cur, new; /* * The i_version field is not strictly ordered with any other inode * information, but the legacy inode_inc_iversion code used a spinlock * to serialize increments. * * We add a full memory barrier to ensure that any de facto ordering * with other state is preserved (either implicitly coming from cmpxchg * or explicitly from smp_mb if we don't know upfront if we will execute * the former). * * These barriers pair with inode_query_iversion(). */ cur = inode_peek_iversion_raw(inode); if (!force && !(cur & I_VERSION_QUERIED)) { smp_mb(); cur = inode_peek_iversion_raw(inode); } do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) return false; /* Since lowest bit is flag, add 2 to avoid it */ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return true; } EXPORT_SYMBOL(inode_maybe_inc_iversion); /** * inode_query_iversion - read i_version for later use * @inode: inode from which i_version should be read * * Read the inode i_version counter. This should be used by callers that wish * to store the returned i_version for later comparison. This will guarantee * that a later query of the i_version will result in a different value if * anything has changed. * * In this implementation, we fetch the current value, set the QUERIED flag and * then try to swap it into place with a cmpxchg, if it wasn't already set. If * that fails, we try again with the newly fetched value from the cmpxchg. */ u64 inode_query_iversion(struct inode *inode) { u64 cur, new; bool fenced = false; /* * Memory barriers (implicit in cmpxchg, explicit in smp_mb) pair with * inode_maybe_inc_iversion(), see that routine for more details. */ cur = inode_peek_iversion_raw(inode); do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { if (!fenced) smp_mb(); break; } fenced = true; new = cur | I_VERSION_QUERIED; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; } EXPORT_SYMBOL(inode_query_iversion); ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, ssize_t direct_written, ssize_t buffered_written) { struct address_space *mapping = iocb->ki_filp->f_mapping; loff_t pos = iocb->ki_pos - buffered_written; loff_t end = iocb->ki_pos - 1; int err; /* * If the buffered write fallback returned an error, we want to return * the number of bytes which were written by direct I/O, or the error * code if that was zero. * * Note that this differs from normal direct-io semantics, which will * return -EFOO even if some bytes were written. */ if (unlikely(buffered_written < 0)) { if (direct_written) return direct_written; return buffered_written; } /* * We need to ensure that the page cache pages are written to disk and * invalidated to preserve the expected O_DIRECT semantics. */ err = filemap_write_and_wait_range(mapping, pos, end); if (err < 0) { /* * We don't know how much we wrote, so just return the number of * bytes which were direct-written */ iocb->ki_pos -= buffered_written; if (direct_written) return direct_written; return err; } invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); return direct_written + buffered_written; } EXPORT_SYMBOL_GPL(direct_write_fallback); /** * simple_inode_init_ts - initialize the timestamps for a new inode * @inode: inode to be initialized * * When a new inode is created, most filesystems set the timestamps to the * current time. Add a helper to do this. */ struct timespec64 simple_inode_init_ts(struct inode *inode) { struct timespec64 ts = inode_set_ctime_current(inode); inode_set_atime_to_ts(inode, ts); inode_set_mtime_to_ts(inode, ts); return ts; } EXPORT_SYMBOL(simple_inode_init_ts); struct dentry *stashed_dentry_get(struct dentry **stashed) { struct dentry *dentry; guard(rcu)(); dentry = rcu_dereference(*stashed); if (!dentry) return NULL; if (!lockref_get_not_dead(&dentry->d_lockref)) return NULL; return dentry; } static struct dentry *prepare_anon_dentry(struct dentry **stashed, struct super_block *sb, void *data) { struct dentry *dentry; struct inode *inode; const struct stashed_operations *sops = sb->s_fs_info; int ret; inode = new_inode_pseudo(sb); if (!inode) { sops->put_data(data); return ERR_PTR(-ENOMEM); } inode->i_flags |= S_IMMUTABLE; inode->i_mode = S_IFREG; simple_inode_init_ts(inode); ret = sops->init_inode(inode, data); if (ret < 0) { iput(inode); return ERR_PTR(ret); } /* Notice when this is changed. */ WARN_ON_ONCE(!S_ISREG(inode->i_mode)); WARN_ON_ONCE(!IS_IMMUTABLE(inode)); dentry = d_alloc_anon(sb); if (!dentry) { iput(inode); return ERR_PTR(-ENOMEM); } /* Store address of location where dentry's supposed to be stashed. */ dentry->d_fsdata = stashed; /* @data is now owned by the fs */ d_instantiate(dentry, inode); return dentry; } static struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry) { guard(rcu)(); for (;;) { struct dentry *old; /* Assume any old dentry was cleared out. */ old = cmpxchg(stashed, NULL, dentry); if (likely(!old)) return dentry; /* Check if somebody else installed a reusable dentry. */ if (lockref_get_not_dead(&old->d_lockref)) return old; /* There's an old dead dentry there, try to take it over. */ if (likely(try_cmpxchg(stashed, &old, dentry))) return dentry; } } /** * path_from_stashed - create path from stashed or new dentry * @stashed: where to retrieve or stash dentry * @mnt: mnt of the filesystems to use * @data: data to store in inode->i_private * @path: path to create * * The function tries to retrieve a stashed dentry from @stashed. If the dentry * is still valid then it will be reused. If the dentry isn't able the function * will allocate a new dentry and inode. It will then check again whether it * can reuse an existing dentry in case one has been added in the meantime or * update @stashed with the newly added dentry. * * Special-purpose helper for nsfs and pidfs. * * Return: On success zero and on failure a negative error is returned. */ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path) { struct dentry *dentry; const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. */ path->dentry = stashed_dentry_get(stashed); if (path->dentry) { sops->put_data(data); goto out_path; } /* Allocate a new dentry. */ dentry = prepare_anon_dentry(stashed, mnt->mnt_sb, data); if (IS_ERR(dentry)) return PTR_ERR(dentry); /* Added a new dentry. @data is now owned by the filesystem. */ path->dentry = stash_dentry(stashed, dentry); if (path->dentry != dentry) dput(dentry); out_path: WARN_ON_ONCE(path->dentry->d_fsdata != stashed); WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); path->mnt = mntget(mnt); return 0; } void stashed_dentry_prune(struct dentry *dentry) { struct dentry **stashed = dentry->d_fsdata; struct inode *inode = d_inode(dentry); if (WARN_ON_ONCE(!stashed)) return; if (!inode) return; /* * Only replace our own @dentry as someone else might've * already cleared out @dentry and stashed their own * dentry in there. */ cmpxchg(stashed, dentry, NULL); }
951 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2016 ARM Ltd. */ #ifndef __ASM_PGTABLE_PROT_H #define __ASM_PGTABLE_PROT_H #include <asm/memory.h> #include <asm/pgtable-hwdef.h> #include <linux/const.h> /* * Software defined PTE bits definition. */ #define PTE_WRITE (PTE_DBM) /* same as DBM (51) */ #define PTE_SWP_EXCLUSIVE (_AT(pteval_t, 1) << 2) /* only for swp ptes */ #define PTE_DIRTY (_AT(pteval_t, 1) << 55) #define PTE_SPECIAL (_AT(pteval_t, 1) << 56) #define PTE_DEVMAP (_AT(pteval_t, 1) << 57) /* * PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be * interpreted according to the HW layout by SW but any attempted HW access to * the address will result in a fault. pte_present() returns true. */ #define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */ #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define PTE_UFFD_WP (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */ #define PTE_SWP_UFFD_WP (_AT(pteval_t, 1) << 3) /* only for swp ptes */ #else #define PTE_UFFD_WP (_AT(pteval_t, 0)) #define PTE_SWP_UFFD_WP (_AT(pteval_t, 0)) #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF) #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF) #define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC)) #define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL)) #define PROT_NORMAL_TAGGED (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_TAGGED)) #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PTE_WRITE | PMD_ATTRINDX(MT_NORMAL)) #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) #define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) #define _PAGE_KERNEL (PROT_NORMAL) #define _PAGE_KERNEL_RO ((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY) #define _PAGE_KERNEL_ROX ((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY) #define _PAGE_KERNEL_EXEC (PROT_NORMAL & ~PTE_PXN) #define _PAGE_KERNEL_EXEC_CONT ((PROT_NORMAL & ~PTE_PXN) | PTE_CONT) #define _PAGE_SHARED (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) #define _PAGE_SHARED_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_WRITE) #define _PAGE_READONLY (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) #define _PAGE_READONLY_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) #define _PAGE_EXECONLY (_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) #ifndef __ASSEMBLY__ #include <asm/cpufeature.h> #include <asm/pgtable-types.h> #include <asm/rsi.h> extern bool arm64_use_ng_mappings; extern unsigned long prot_ns_shared; #define PROT_NS_SHARED (is_realm_world() ? prot_ns_shared : 0) #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) #ifndef CONFIG_ARM64_LPA2 #define lpa2_is_enabled() false #define PTE_MAYBE_SHARED PTE_SHARED #define PMD_MAYBE_SHARED PMD_SECT_S #define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS) #else static inline bool __pure lpa2_is_enabled(void) { return read_tcr() & TCR_DS; } #define PTE_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PTE_SHARED) #define PMD_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PMD_SECT_S) #define PHYS_MASK_SHIFT (lpa2_is_enabled() ? CONFIG_ARM64_PA_BITS : 48) #endif /* * Highest possible physical address supported. */ #define PHYS_MASK ((UL(1) << PHYS_MASK_SHIFT) - 1) /* * If we have userspace only BTI we don't want to mark kernel pages * guarded even if the system does support BTI. */ #define PTE_MAYBE_GP (system_supports_bti_kernel() ? PTE_GP : 0) #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_KERNEL_ROX) #define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL_EXEC) #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_KERNEL_EXEC_CONT) #define PAGE_S2_MEMATTR(attr, has_fwb) \ ({ \ u64 __val; \ if (has_fwb) \ __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr); \ else \ __val = PTE_S2_MEMATTR(MT_S2_ ## attr); \ __val; \ }) #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PRESENT_INVALID | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */ #define PAGE_SHARED __pgprot(_PAGE_SHARED) #define PAGE_SHARED_EXEC __pgprot(_PAGE_SHARED_EXEC) #define PAGE_READONLY __pgprot(_PAGE_READONLY) #define PAGE_READONLY_EXEC __pgprot(_PAGE_READONLY_EXEC) #define PAGE_EXECONLY __pgprot(_PAGE_EXECONLY) #endif /* __ASSEMBLY__ */ #define pte_pi_index(pte) ( \ ((pte & BIT(PTE_PI_IDX_3)) >> (PTE_PI_IDX_3 - 3)) | \ ((pte & BIT(PTE_PI_IDX_2)) >> (PTE_PI_IDX_2 - 2)) | \ ((pte & BIT(PTE_PI_IDX_1)) >> (PTE_PI_IDX_1 - 1)) | \ ((pte & BIT(PTE_PI_IDX_0)) >> (PTE_PI_IDX_0 - 0))) /* * Page types used via Permission Indirection Extension (PIE). PIE uses * the USER, DBM, PXN and UXN bits to to generate an index which is used * to look up the actual permission in PIR_ELx and PIRE0_EL1. We define * combinations we use on non-PIE systems with the same encoding, for * convenience these are listed here as comments as are the unallocated * encodings. */ /* 0: PAGE_DEFAULT */ /* 1: PTE_USER */ /* 2: PTE_WRITE */ /* 3: PTE_WRITE | PTE_USER */ /* 4: PAGE_EXECONLY PTE_PXN */ /* 5: PAGE_READONLY_EXEC PTE_PXN | PTE_USER */ /* 6: PTE_PXN | PTE_WRITE */ /* 7: PAGE_SHARED_EXEC PTE_PXN | PTE_WRITE | PTE_USER */ /* 8: PAGE_KERNEL_ROX PTE_UXN */ /* 9: PAGE_GCS_RO PTE_UXN | PTE_USER */ /* a: PAGE_KERNEL_EXEC PTE_UXN | PTE_WRITE */ /* b: PAGE_GCS PTE_UXN | PTE_WRITE | PTE_USER */ /* c: PAGE_KERNEL_RO PTE_UXN | PTE_PXN */ /* d: PAGE_READONLY PTE_UXN | PTE_PXN | PTE_USER */ /* e: PAGE_KERNEL PTE_UXN | PTE_PXN | PTE_WRITE */ /* f: PAGE_SHARED PTE_UXN | PTE_PXN | PTE_WRITE | PTE_USER */ #define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER) #define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER) #define PAGE_GCS __pgprot(_PAGE_GCS) #define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO) #define PIE_E0 ( \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) #define PIE_E1 ( \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL), PIE_RW)) #endif /* __ASM_PGTABLE_PROT_H */
366 215 233 234 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2022 Christian Brauner <brauner@kernel.org> */ #include <linux/cred.h> #include <linux/fs.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/user_namespace.h> #include <linux/seq_file.h> #include "internal.h" /* * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t, * never from raw values. These are just internal helpers. */ #define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val } #define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val } struct mnt_idmap { struct uid_gid_map uid_map; struct uid_gid_map gid_map; refcount_t count; }; /* * Carries the initial idmapping of 0:0:4294967295 which is an identity * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. */ struct mnt_idmap nop_mnt_idmap = { .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(nop_mnt_idmap); /* * Carries the invalid idmapping of a full 0-4294967295 {g,u}id range. * This means that all {g,u}ids are mapped to INVALID_VFS{G,U}ID. */ struct mnt_idmap invalid_mnt_idmap = { .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(invalid_mnt_idmap); /** * initial_idmapping - check whether this is the initial mapping * @ns: idmapping to check * * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1, * [...], 1000 to 1000 [...]. * * Return: true if this is the initial mapping, false if not. */ static inline bool initial_idmapping(const struct user_namespace *ns) { return ns == &init_user_ns; } /** * make_vfsuid - map a filesystem kuid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kuid : kuid to be mapped * * Take a @kuid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kuid to be reported to userspace. * * If initial_idmapping() determines that this is not an idmapped mount * we can simply return @kuid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kuid won't change when calling * from_kuid() so we can simply retrieve the value via __kuid_val() * directly. * * Return: @kuid mapped according to @idmap. * If @kuid has no mapping in either @idmap or @fs_userns INVALID_UID is * returned. */ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kuid_t kuid) { uid_t uid; if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); if (idmap == &invalid_mnt_idmap) return INVALID_VFSUID; if (initial_idmapping(fs_userns)) uid = __kuid_val(kuid); else uid = from_kuid(fs_userns, kuid); if (uid == (uid_t)-1) return INVALID_VFSUID; return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid)); } EXPORT_SYMBOL_GPL(make_vfsuid); /** * make_vfsgid - map a filesystem kgid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kgid : kgid to be mapped * * Take a @kgid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kgid to be reported to userspace. * * If initial_idmapping() determines that this is not an idmapped mount * we can simply return @kgid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kgid won't change when calling * from_kgid() so we can simply retrieve the value via __kgid_val() * directly. * * Return: @kgid mapped according to @idmap. * If @kgid has no mapping in either @idmap or @fs_userns INVALID_GID is * returned. */ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid) { gid_t gid; if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); if (idmap == &invalid_mnt_idmap) return INVALID_VFSGID; if (initial_idmapping(fs_userns)) gid = __kgid_val(kgid); else gid = from_kgid(fs_userns, kgid); if (gid == (gid_t)-1) return INVALID_VFSGID; return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid)); } EXPORT_SYMBOL_GPL(make_vfsgid); /** * from_vfsuid - map a vfsuid into the filesystem idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsuid : vfsuid to be mapped * * Map @vfsuid into the filesystem idmapping. This function has to be used in * order to e.g. write @vfsuid to inode->i_uid. * * Return: @vfsuid mapped into the filesystem idmapping */ kuid_t from_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { uid_t uid; if (idmap == &nop_mnt_idmap) return AS_KUIDT(vfsuid); if (idmap == &invalid_mnt_idmap) return INVALID_UID; uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid)); if (uid == (uid_t)-1) return INVALID_UID; if (initial_idmapping(fs_userns)) return KUIDT_INIT(uid); return make_kuid(fs_userns, uid); } EXPORT_SYMBOL_GPL(from_vfsuid); /** * from_vfsgid - map a vfsgid into the filesystem idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsgid : vfsgid to be mapped * * Map @vfsgid into the filesystem idmapping. This function has to be used in * order to e.g. write @vfsgid to inode->i_gid. * * Return: @vfsgid mapped into the filesystem idmapping */ kgid_t from_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { gid_t gid; if (idmap == &nop_mnt_idmap) return AS_KGIDT(vfsgid); if (idmap == &invalid_mnt_idmap) return INVALID_GID; gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid)); if (gid == (gid_t)-1) return INVALID_GID; if (initial_idmapping(fs_userns)) return KGIDT_INIT(gid); return make_kgid(fs_userns, gid); } EXPORT_SYMBOL_GPL(from_vfsgid); #ifdef CONFIG_MULTIUSER /** * vfsgid_in_group_p() - check whether a vfsuid matches the caller's groups * @vfsgid: the mnt gid to match * * This function can be used to determine whether @vfsuid matches any of the * caller's groups. * * Return: 1 if vfsuid matches caller's groups, 0 if not. */ int vfsgid_in_group_p(vfsgid_t vfsgid) { return in_group_p(AS_KGIDT(vfsgid)); } #else int vfsgid_in_group_p(vfsgid_t vfsgid) { return 1; } #endif EXPORT_SYMBOL_GPL(vfsgid_in_group_p); static int copy_mnt_idmap(struct uid_gid_map *map_from, struct uid_gid_map *map_to) { struct uid_gid_extent *forward, *reverse; u32 nr_extents = READ_ONCE(map_from->nr_extents); /* Pairs with smp_wmb() when writing the idmapping. */ smp_rmb(); /* * Don't blindly copy @map_to into @map_from if nr_extents is * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we * read @nr_extents someone could have written an idmapping and * then we might end up with inconsistent data. So just don't do * anything at all. */ if (nr_extents == 0) return -EINVAL; /* * Here we know that nr_extents is greater than zero which means * a map has been written. Since idmappings can't be changed * once they have been written we know that we can safely copy * from @map_to into @map_from. */ if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { *map_to = *map_from; return 0; } forward = kmemdup_array(map_from->forward, nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL_ACCOUNT); if (!forward) return -ENOMEM; reverse = kmemdup_array(map_from->reverse, nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL_ACCOUNT); if (!reverse) { kfree(forward); return -ENOMEM; } /* * The idmapping isn't exposed anywhere so we don't need to care * about ordering between extent pointers and @nr_extents * initialization. */ map_to->forward = forward; map_to->reverse = reverse; map_to->nr_extents = nr_extents; return 0; } static void free_mnt_idmap(struct mnt_idmap *idmap) { if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(idmap->uid_map.forward); kfree(idmap->uid_map.reverse); } if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(idmap->gid_map.forward); kfree(idmap->gid_map.reverse); } kfree(idmap); } struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) { struct mnt_idmap *idmap; int ret; idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); if (!idmap) return ERR_PTR(-ENOMEM); refcount_set(&idmap->count, 1); ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map); if (!ret) ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map); if (ret) { free_mnt_idmap(idmap); idmap = ERR_PTR(ret); } return idmap; } /** * mnt_idmap_get - get a reference to an idmapping * @idmap: the idmap to bump the reference on * * If @idmap is not the @nop_mnt_idmap bump the reference count. * * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. */ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) { if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap) refcount_inc(&idmap->count); return idmap; } EXPORT_SYMBOL_GPL(mnt_idmap_get); /** * mnt_idmap_put - put a reference to an idmapping * @idmap: the idmap to put the reference on * * If this is a non-initial idmapping, put the reference count when a mount is * released and free it if we're the last user. */ void mnt_idmap_put(struct mnt_idmap *idmap) { if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap && refcount_dec_and_test(&idmap->count)) free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map) { struct uid_gid_map *map, *map_up; u32 idx, nr_mappings; if (!is_valid_mnt_idmap(idmap)) return 0; /* * Idmappings are shown relative to the caller's idmapping. * This is both the most intuitive and most useful solution. */ if (uid_map) { map = &idmap->uid_map; map_up = &current_user_ns()->uid_map; } else { map = &idmap->gid_map; map_up = &current_user_ns()->gid_map; } for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) { uid_t lower; struct uid_gid_extent *extent; if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = &map->extent[idx]; else extent = &map->forward[idx]; /* * Verify that the whole range of the mapping can be * resolved in the caller's idmapping. If it cannot be * resolved skip the mapping. */ lower = map_id_range_up(map_up, extent->lower_first, extent->count); if (lower == (uid_t) -1) continue; seq_printf(seq, "%u %u %u", extent->first, lower, extent->count); seq->count++; /* mappings are separated by \0 */ if (seq_has_overflowed(seq)) return -EAGAIN; nr_mappings++; } return nr_mappings; }
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 /* Copyright (c) 2018, Mellanox Technologies All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <crypto/aead.h> #include <linux/highmem.h> #include <linux/module.h> #include <linux/netdevice.h> #include <net/dst.h> #include <net/inet_connection_sock.h> #include <net/tcp.h> #include <net/tls.h> #include <linux/skbuff_ref.h> #include "tls.h" #include "trace.h" /* device_offload_lock is used to synchronize tls_dev_add * against NETDEV_DOWN notifications. */ static DECLARE_RWSEM(device_offload_lock); static struct workqueue_struct *destruct_wq __read_mostly; static LIST_HEAD(tls_device_list); static LIST_HEAD(tls_device_down_list); static DEFINE_SPINLOCK(tls_device_lock); static struct page *dummy_page; static void tls_device_free_ctx(struct tls_context *ctx) { if (ctx->tx_conf == TLS_HW) kfree(tls_offload_ctx_tx(ctx)); if (ctx->rx_conf == TLS_HW) kfree(tls_offload_ctx_rx(ctx)); tls_ctx_free(NULL, ctx); } static void tls_device_tx_del_task(struct work_struct *work) { struct tls_offload_context_tx *offload_ctx = container_of(work, struct tls_offload_context_tx, destruct_work); struct tls_context *ctx = offload_ctx->ctx; struct net_device *netdev; /* Safe, because this is the destroy flow, refcount is 0, so * tls_device_down can't store this field in parallel. */ netdev = rcu_dereference_protected(ctx->netdev, !refcount_read(&ctx->refcount)); netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); dev_put(netdev); ctx->netdev = NULL; tls_device_free_ctx(ctx); } static void tls_device_queue_ctx_destruction(struct tls_context *ctx) { struct net_device *netdev; unsigned long flags; bool async_cleanup; spin_lock_irqsave(&tls_device_lock, flags); if (unlikely(!refcount_dec_and_test(&ctx->refcount))) { spin_unlock_irqrestore(&tls_device_lock, flags); return; } list_del(&ctx->list); /* Remove from tls_device_list / tls_device_down_list */ /* Safe, because this is the destroy flow, refcount is 0, so * tls_device_down can't store this field in parallel. */ netdev = rcu_dereference_protected(ctx->netdev, !refcount_read(&ctx->refcount)); async_cleanup = netdev && ctx->tx_conf == TLS_HW; if (async_cleanup) { struct tls_offload_context_tx *offload_ctx = tls_offload_ctx_tx(ctx); /* queue_work inside the spinlock * to make sure tls_device_down waits for that work. */ queue_work(destruct_wq, &offload_ctx->destruct_work); } spin_unlock_irqrestore(&tls_device_lock, flags); if (!async_cleanup) tls_device_free_ctx(ctx); } /* We assume that the socket is already connected */ static struct net_device *get_netdev_for_sock(struct sock *sk) { struct dst_entry *dst = sk_dst_get(sk); struct net_device *netdev = NULL; if (likely(dst)) { netdev = netdev_sk_get_lowest_dev(dst->dev, sk); dev_hold(netdev); } dst_release(dst); return netdev; } static void destroy_record(struct tls_record_info *record) { int i; for (i = 0; i < record->num_frags; i++) __skb_frag_unref(&record->frags[i], false); kfree(record); } static void delete_all_records(struct tls_offload_context_tx *offload_ctx) { struct tls_record_info *info, *temp; list_for_each_entry_safe(info, temp, &offload_ctx->records_list, list) { list_del(&info->list); destroy_record(info); } offload_ctx->retransmit_hint = NULL; } static void tls_tcp_clean_acked(struct sock *sk, u32 acked_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_record_info *info, *temp; struct tls_offload_context_tx *ctx; u64 deleted_records = 0; unsigned long flags; if (!tls_ctx) return; ctx = tls_offload_ctx_tx(tls_ctx); spin_lock_irqsave(&ctx->lock, flags); info = ctx->retransmit_hint; if (info && !before(acked_seq, info->end_seq)) ctx->retransmit_hint = NULL; list_for_each_entry_safe(info, temp, &ctx->records_list, list) { if (before(acked_seq, info->end_seq)) break; list_del(&info->list); destroy_record(info); deleted_records++; } ctx->unacked_record_sn += deleted_records; spin_unlock_irqrestore(&ctx->lock, flags); } /* At this point, there should be no references on this * socket and no in-flight SKBs associated with this * socket, so it is safe to free all the resources. */ void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); tls_ctx->sk_destruct(sk); if (tls_ctx->tx_conf == TLS_HW) { if (ctx->open_record) destroy_record(ctx->open_record); delete_all_records(ctx); crypto_free_aead(ctx->aead_send); clean_acked_data_disable(tcp_sk(sk)); } tls_device_queue_ctx_destruction(tls_ctx); } EXPORT_SYMBOL_GPL(tls_device_sk_destruct); void tls_device_free_resources_tx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); tls_free_partial_record(sk, tls_ctx); } void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); trace_tls_device_tx_resync_req(sk, got_seq, exp_seq); WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags)); } EXPORT_SYMBOL_GPL(tls_offload_tx_resync_request); static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx, u32 seq) { struct net_device *netdev; int err = 0; u8 *rcd_sn; tcp_write_collapse_fence(sk); rcd_sn = tls_ctx->tx.rec_seq; trace_tls_device_tx_resync_send(sk, seq, rcd_sn); down_read(&device_offload_lock); netdev = rcu_dereference_protected(tls_ctx->netdev, lockdep_is_held(&device_offload_lock)); if (netdev) err = netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_TX); up_read(&device_offload_lock); if (err) return; clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags); } static void tls_append_frag(struct tls_record_info *record, struct page_frag *pfrag, int size) { skb_frag_t *frag; frag = &record->frags[record->num_frags - 1]; if (skb_frag_page(frag) == pfrag->page && skb_frag_off(frag) + skb_frag_size(frag) == pfrag->offset) { skb_frag_size_add(frag, size); } else { ++frag; skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, size); ++record->num_frags; get_page(pfrag->page); } pfrag->offset += size; record->len += size; } static int tls_push_record(struct sock *sk, struct tls_context *ctx, struct tls_offload_context_tx *offload_ctx, struct tls_record_info *record, int flags) { struct tls_prot_info *prot = &ctx->prot_info; struct tcp_sock *tp = tcp_sk(sk); skb_frag_t *frag; int i; record->end_seq = tp->write_seq + record->len; list_add_tail_rcu(&record->list, &offload_ctx->records_list); offload_ctx->open_record = NULL; if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags)) tls_device_resync_tx(sk, ctx, tp->write_seq); tls_advance_record_sn(sk, prot, &ctx->tx); for (i = 0; i < record->num_frags; i++) { frag = &record->frags[i]; sg_unmark_end(&offload_ctx->sg_tx_data[i]); sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag), skb_frag_size(frag), skb_frag_off(frag)); sk_mem_charge(sk, skb_frag_size(frag)); get_page(skb_frag_page(frag)); } sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]); /* all ready, send */ return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); } static void tls_device_record_close(struct sock *sk, struct tls_context *ctx, struct tls_record_info *record, struct page_frag *pfrag, unsigned char record_type) { struct tls_prot_info *prot = &ctx->prot_info; struct page_frag dummy_tag_frag; /* append tag * device will fill in the tag, we just need to append a placeholder * use socket memory to improve coalescing (re-using a single buffer * increases frag count) * if we can't allocate memory now use the dummy page */ if (unlikely(pfrag->size - pfrag->offset < prot->tag_size) && !skb_page_frag_refill(prot->tag_size, pfrag, sk->sk_allocation)) { dummy_tag_frag.page = dummy_page; dummy_tag_frag.offset = 0; pfrag = &dummy_tag_frag; } tls_append_frag(record, pfrag, prot->tag_size); /* fill prepend */ tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]), record->len - prot->overhead_size, record_type); } static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { struct tls_record_info *record; skb_frag_t *frag; record = kmalloc(sizeof(*record), GFP_KERNEL); if (!record) return -ENOMEM; frag = &record->frags[0]; skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset, prepend_size); get_page(pfrag->page); pfrag->offset += prepend_size; record->num_frags = 1; record->len = prepend_size; offload_ctx->open_record = record; return 0; } static int tls_do_allocation(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { int ret; if (!offload_ctx->open_record) { if (unlikely(!skb_page_frag_refill(prepend_size, pfrag, sk->sk_allocation))) { READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return -ENOMEM; } ret = tls_create_new_record(offload_ctx, pfrag, prepend_size); if (ret) return ret; if (pfrag->size > pfrag->offset) return 0; } if (!sk_page_frag_refill(sk, pfrag)) return -ENOMEM; return 0; } static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) { size_t pre_copy, nocache; pre_copy = ~((unsigned long)addr - 1) & (SMP_CACHE_BYTES - 1); if (pre_copy) { pre_copy = min(pre_copy, bytes); if (copy_from_iter(addr, pre_copy, i) != pre_copy) return -EFAULT; bytes -= pre_copy; addr += pre_copy; } nocache = round_down(bytes, SMP_CACHE_BYTES); if (copy_from_iter_nocache(addr, nocache, i) != nocache) return -EFAULT; bytes -= nocache; addr += nocache; if (bytes && copy_from_iter(addr, bytes, i) != bytes) return -EFAULT; return 0; } static int tls_push_data(struct sock *sk, struct iov_iter *iter, size_t size, int flags, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); struct tls_record_info *record; int tls_push_record_flags; struct page_frag *pfrag; size_t orig_size = size; u32 max_open_record_len; bool more = false; bool done = false; int copy, rc = 0; long timeo; if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SPLICE_PAGES | MSG_EOR)) return -EOPNOTSUPP; if ((flags & (MSG_MORE | MSG_EOR)) == (MSG_MORE | MSG_EOR)) return -EINVAL; if (unlikely(sk->sk_err)) return -sk->sk_err; flags |= MSG_SENDPAGE_DECRYPTED; tls_push_record_flags = flags | MSG_MORE; timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); if (tls_is_partially_sent_record(tls_ctx)) { rc = tls_push_partial_record(sk, tls_ctx, flags); if (rc < 0) return rc; } pfrag = sk_page_frag(sk); /* TLS_HEADER_SIZE is not counted as part of the TLS record, and * we need to leave room for an authentication tag. */ max_open_record_len = TLS_MAX_PAYLOAD_SIZE + prot->prepend_size; do { rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); if (unlikely(rc)) { rc = sk_stream_wait_memory(sk, &timeo); if (!rc) continue; record = ctx->open_record; if (!record) break; handle_error: if (record_type != TLS_RECORD_TYPE_DATA) { /* avoid sending partial * record with type != * application_data */ size = orig_size; destroy_record(record); ctx->open_record = NULL; } else if (record->len > prot->prepend_size) { goto last_record; } break; } record = ctx->open_record; copy = min_t(size_t, size, max_open_record_len - record->len); if (copy && (flags & MSG_SPLICE_PAGES)) { struct page_frag zc_pfrag; struct page **pages = &zc_pfrag.page; size_t off; rc = iov_iter_extract_pages(iter, &pages, copy, 1, 0, &off); if (rc <= 0) { if (rc == 0) rc = -EIO; goto handle_error; } copy = rc; if (WARN_ON_ONCE(!sendpage_ok(zc_pfrag.page))) { iov_iter_revert(iter, copy); rc = -EIO; goto handle_error; } zc_pfrag.offset = off; zc_pfrag.size = copy; tls_append_frag(record, &zc_pfrag, copy); } else if (copy) { copy = min_t(size_t, copy, pfrag->size - pfrag->offset); rc = tls_device_copy_data(page_address(pfrag->page) + pfrag->offset, copy, iter); if (rc) goto handle_error; tls_append_frag(record, pfrag, copy); } size -= copy; if (!size) { last_record: tls_push_record_flags = flags; if (flags & MSG_MORE) { more = true; break; } done = true; } if (done || record->len >= max_open_record_len || (record->num_frags >= MAX_SKB_FRAGS - 1)) { tls_device_record_close(sk, tls_ctx, record, pfrag, record_type); rc = tls_push_record(sk, tls_ctx, ctx, record, tls_push_record_flags); if (rc < 0) break; } } while (!done); tls_ctx->pending_open_record_frags = more; if (orig_size - size > 0) rc = orig_size - size; return rc; } int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { unsigned char record_type = TLS_RECORD_TYPE_DATA; struct tls_context *tls_ctx = tls_get_ctx(sk); int rc; if (!tls_ctx->zerocopy_sendfile) msg->msg_flags &= ~MSG_SPLICE_PAGES; mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (unlikely(msg->msg_controllen)) { rc = tls_process_cmsg(sk, msg, &record_type); if (rc) goto out; } rc = tls_push_data(sk, &msg->msg_iter, size, msg->msg_flags, record_type); out: release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); return rc; } void tls_device_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct iov_iter iter = {}; if (!tls_is_partially_sent_record(tls_ctx)) return; mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (tls_is_partially_sent_record(tls_ctx)) { iov_iter_bvec(&iter, ITER_SOURCE, NULL, 0, 0); tls_push_data(sk, &iter, 0, 0, TLS_RECORD_TYPE_DATA); } release_sock(sk); mutex_unlock(&tls_ctx->tx_lock); } struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { u64 record_sn = context->hint_record_sn; struct tls_record_info *info, *last; info = context->retransmit_hint; if (!info || before(seq, info->end_seq - info->len)) { /* if retransmit_hint is irrelevant start * from the beginning of the list */ info = list_first_entry_or_null(&context->records_list, struct tls_record_info, list); if (!info) return NULL; /* send the start_marker record if seq number is before the * tls offload start marker sequence number. This record is * required to handle TCP packets which are before TLS offload * started. * And if it's not start marker, look if this seq number * belongs to the list. */ if (likely(!tls_record_is_start_marker(info))) { /* we have the first record, get the last record to see * if this seq number belongs to the list. */ last = list_last_entry(&context->records_list, struct tls_record_info, list); if (!between(seq, tls_record_start_seq(info), last->end_seq)) return NULL; } record_sn = context->unacked_record_sn; } /* We just need the _rcu for the READ_ONCE() */ rcu_read_lock(); list_for_each_entry_from_rcu(info, &context->records_list, list) { if (before(seq, info->end_seq)) { if (!context->retransmit_hint || after(info->end_seq, context->retransmit_hint->end_seq)) { context->hint_record_sn = record_sn; context->retransmit_hint = info; } *p_record_sn = record_sn; goto exit_rcu_unlock; } record_sn++; } info = NULL; exit_rcu_unlock: rcu_read_unlock(); return info; } EXPORT_SYMBOL(tls_get_record); static int tls_device_push_pending_record(struct sock *sk, int flags) { struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, NULL, 0, 0); return tls_push_data(sk, &iter, 0, flags, TLS_RECORD_TYPE_DATA); } void tls_device_write_space(struct sock *sk, struct tls_context *ctx) { if (tls_is_partially_sent_record(ctx)) { gfp_t sk_allocation = sk->sk_allocation; WARN_ON_ONCE(sk->sk_write_pending); sk->sk_allocation = GFP_ATOMIC; tls_push_partial_record(sk, ctx, MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_DECRYPTED); sk->sk_allocation = sk_allocation; } } static void tls_device_resync_rx(struct tls_context *tls_ctx, struct sock *sk, u32 seq, u8 *rcd_sn) { struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); struct net_device *netdev; trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type); rcu_read_lock(); netdev = rcu_dereference(tls_ctx->netdev); if (netdev) netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn, TLS_OFFLOAD_CTX_DIR_RX); rcu_read_unlock(); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC); } static bool tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async, s64 resync_req, u32 *seq, u16 *rcd_delta) { u32 is_async = resync_req & RESYNC_REQ_ASYNC; u32 req_seq = resync_req >> 32; u32 req_end = req_seq + ((resync_req >> 16) & 0xffff); u16 i; *rcd_delta = 0; if (is_async) { /* shouldn't get to wraparound: * too long in async stage, something bad happened */ if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) return false; /* asynchronous stage: log all headers seq such that * req_seq <= seq <= end_seq, and wait for real resync request */ if (before(*seq, req_seq)) return false; if (!after(*seq, req_end) && resync_async->loglen < TLS_DEVICE_RESYNC_ASYNC_LOGMAX) resync_async->log[resync_async->loglen++] = *seq; resync_async->rcd_delta++; return false; } /* synchronous stage: check against the logged entries and * proceed to check the next entries if no match was found */ for (i = 0; i < resync_async->loglen; i++) if (req_seq == resync_async->log[i] && atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) { *rcd_delta = resync_async->rcd_delta - i; *seq = req_seq; resync_async->loglen = 0; resync_async->rcd_delta = 0; return true; } resync_async->loglen = 0; resync_async->rcd_delta = 0; if (req_seq == *seq && atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) return true; return false; } void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; u16 rcd_delta; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) return; if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) return; prot = &tls_ctx->prot_info; rx_ctx = tls_offload_ctx_rx(tls_ctx); memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); switch (rx_ctx->resync_type) { case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ: resync_req = atomic64_read(&rx_ctx->resync_req); req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; is_req_pending = resync_req; if (likely(!is_req_pending) || req_seq != seq || !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) return; break; case TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT: if (likely(!rx_ctx->resync_nh_do_now)) return; /* head of next rec is already in, note that the sock_inq will * include the currently parsed message when called from parser */ sock_data = tcp_inq(sk); if (sock_data > rcd_len) { trace_tls_device_rx_resync_nh_delay(sk, sock_data, rcd_len); return; } rx_ctx->resync_nh_do_now = 0; seq += rcd_len; tls_bigint_increment(rcd_sn, prot->rec_seq_size); break; case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC: resync_req = atomic64_read(&rx_ctx->resync_async->req); is_req_pending = resync_req; if (likely(!is_req_pending)) return; if (!tls_device_rx_resync_async(rx_ctx->resync_async, resync_req, &seq, &rcd_delta)) return; tls_bigint_subtract(rcd_sn, rcd_delta); break; } tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); } static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, struct tls_offload_context_rx *ctx, struct sock *sk, struct sk_buff *skb) { struct strp_msg *rxm; /* device will request resyncs by itself based on stream scan */ if (ctx->resync_type != TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT) return; /* already scheduled */ if (ctx->resync_nh_do_now) return; /* seen decrypted fragments since last fully-failed record */ if (ctx->resync_nh_reset) { ctx->resync_nh_reset = 0; ctx->resync_nh.decrypted_failed = 1; ctx->resync_nh.decrypted_tgt = TLS_DEVICE_RESYNC_NH_START_IVAL; return; } if (++ctx->resync_nh.decrypted_failed <= ctx->resync_nh.decrypted_tgt) return; /* doing resync, bump the next target in case it fails */ if (ctx->resync_nh.decrypted_tgt < TLS_DEVICE_RESYNC_NH_MAX_IVAL) ctx->resync_nh.decrypted_tgt *= 2; else ctx->resync_nh.decrypted_tgt += TLS_DEVICE_RESYNC_NH_MAX_IVAL; rxm = strp_msg(skb); /* head of next rec is already in, parser will sync for us */ if (tcp_inq(sk) > rxm->full_len) { trace_tls_device_rx_resync_nh_schedule(sk); ctx->resync_nh_do_now = 1; } else { struct tls_prot_info *prot = &tls_ctx->prot_info; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); tls_bigint_increment(rcd_sn, prot->rec_seq_size); tls_device_resync_rx(tls_ctx, sk, tcp_sk(sk)->copied_seq, rcd_sn); } } static int tls_device_reencrypt(struct sock *sk, struct tls_context *tls_ctx) { struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); const struct tls_cipher_desc *cipher_desc; int err, offset, copy, data_len, pos; struct sk_buff *skb, *skb_iter; struct scatterlist sg[1]; struct strp_msg *rxm; char *orig_buf, *buf; cipher_desc = get_cipher_desc(tls_ctx->crypto_recv.info.cipher_type); DEBUG_NET_WARN_ON_ONCE(!cipher_desc || !cipher_desc->offloadable); rxm = strp_msg(tls_strp_msg(sw_ctx)); orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv, sk->sk_allocation); if (!orig_buf) return -ENOMEM; buf = orig_buf; err = tls_strp_msg_cow(sw_ctx); if (unlikely(err)) goto free_buf; skb = tls_strp_msg(sw_ctx); rxm = strp_msg(skb); offset = rxm->offset; sg_init_table(sg, 1); sg_set_buf(&sg[0], buf, rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv); err = skb_copy_bits(skb, offset, buf, TLS_HEADER_SIZE + cipher_desc->iv); if (err) goto free_buf; /* We are interested only in the decrypted data not the auth */ err = decrypt_skb(sk, sg); if (err != -EBADMSG) goto free_buf; else err = 0; data_len = rxm->full_len - cipher_desc->tag; if (skb_pagelen(skb) > offset) { copy = min_t(int, skb_pagelen(skb) - offset, data_len); if (skb->decrypted) { err = skb_store_bits(skb, offset, buf, copy); if (err) goto free_buf; } offset += copy; buf += copy; } pos = skb_pagelen(skb); skb_walk_frags(skb, skb_iter) { int frag_pos; /* Practically all frags must belong to msg if reencrypt * is needed with current strparser and coalescing logic, * but strparser may "get optimized", so let's be safe. */ if (pos + skb_iter->len <= offset) goto done_with_frag; if (pos >= data_len + rxm->offset) break; frag_pos = offset - pos; copy = min_t(int, skb_iter->len - frag_pos, data_len + rxm->offset - offset); if (skb_iter->decrypted) { err = skb_store_bits(skb_iter, frag_pos, buf, copy); if (err) goto free_buf; } offset += copy; buf += copy; done_with_frag: pos += skb_iter->len; } free_buf: kfree(orig_buf); return err; } int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) { struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); struct sk_buff *skb = tls_strp_msg(sw_ctx); struct strp_msg *rxm = strp_msg(skb); int is_decrypted, is_encrypted; if (!tls_strp_msg_mixed_decrypted(sw_ctx)) { is_decrypted = skb->decrypted; is_encrypted = !is_decrypted; } else { is_decrypted = 0; is_encrypted = 0; } trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len, tls_ctx->rx.rec_seq, rxm->full_len, is_encrypted, is_decrypted); if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) { if (likely(is_encrypted || is_decrypted)) return is_decrypted; /* After tls_device_down disables the offload, the next SKB will * likely have initial fragments decrypted, and final ones not * decrypted. We need to reencrypt that single SKB. */ return tls_device_reencrypt(sk, tls_ctx); } /* Return immediately if the record is either entirely plaintext or * entirely ciphertext. Otherwise handle reencrypt partially decrypted * record. */ if (is_decrypted) { ctx->resync_nh_reset = 1; return is_decrypted; } if (is_encrypted) { tls_device_core_ctrl_rx_resync(tls_ctx, ctx, sk, skb); return 0; } ctx->resync_nh_reset = 1; return tls_device_reencrypt(sk, tls_ctx); } static void tls_device_attach(struct tls_context *ctx, struct sock *sk, struct net_device *netdev) { if (sk->sk_destruct != tls_device_sk_destruct) { refcount_set(&ctx->refcount, 1); dev_hold(netdev); RCU_INIT_POINTER(ctx->netdev, netdev); spin_lock_irq(&tls_device_lock); list_add_tail(&ctx->list, &tls_device_list); spin_unlock_irq(&tls_device_lock); ctx->sk_destruct = sk->sk_destruct; smp_store_release(&sk->sk_destruct, tls_device_sk_destruct); } } static struct tls_offload_context_tx *alloc_offload_ctx_tx(struct tls_context *ctx) { struct tls_offload_context_tx *offload_ctx; __be64 rcd_sn; offload_ctx = kzalloc(sizeof(*offload_ctx), GFP_KERNEL); if (!offload_ctx) return NULL; INIT_WORK(&offload_ctx->destruct_work, tls_device_tx_del_task); INIT_LIST_HEAD(&offload_ctx->records_list); spin_lock_init(&offload_ctx->lock); sg_init_table(offload_ctx->sg_tx_data, ARRAY_SIZE(offload_ctx->sg_tx_data)); /* start at rec_seq - 1 to account for the start marker record */ memcpy(&rcd_sn, ctx->tx.rec_seq, sizeof(rcd_sn)); offload_ctx->unacked_record_sn = be64_to_cpu(rcd_sn) - 1; offload_ctx->ctx = ctx; return offload_ctx; } int tls_set_device_offload(struct sock *sk) { struct tls_record_info *start_marker_record; struct tls_offload_context_tx *offload_ctx; const struct tls_cipher_desc *cipher_desc; struct tls_crypto_info *crypto_info; struct tls_prot_info *prot; struct net_device *netdev; struct tls_context *ctx; char *iv, *rec_seq; int rc; ctx = tls_get_ctx(sk); prot = &ctx->prot_info; if (ctx->priv_ctx_tx) return -EEXIST; netdev = get_netdev_for_sock(sk); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); return -EINVAL; } if (!(netdev->features & NETIF_F_HW_TLS_TX)) { rc = -EOPNOTSUPP; goto release_netdev; } crypto_info = &ctx->crypto_send.info; if (crypto_info->version != TLS_1_2_VERSION) { rc = -EOPNOTSUPP; goto release_netdev; } cipher_desc = get_cipher_desc(crypto_info->cipher_type); if (!cipher_desc || !cipher_desc->offloadable) { rc = -EINVAL; goto release_netdev; } rc = init_prot_info(prot, crypto_info, cipher_desc); if (rc) goto release_netdev; iv = crypto_info_iv(crypto_info, cipher_desc); rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc); memcpy(ctx->tx.iv + cipher_desc->salt, iv, cipher_desc->iv); memcpy(ctx->tx.rec_seq, rec_seq, cipher_desc->rec_seq); start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL); if (!start_marker_record) { rc = -ENOMEM; goto release_netdev; } offload_ctx = alloc_offload_ctx_tx(ctx); if (!offload_ctx) { rc = -ENOMEM; goto free_marker_record; } rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info); if (rc) goto free_offload_ctx; start_marker_record->end_seq = tcp_sk(sk)->write_seq; start_marker_record->len = 0; start_marker_record->num_frags = 0; list_add_tail(&start_marker_record->list, &offload_ctx->records_list); clean_acked_data_enable(tcp_sk(sk), &tls_tcp_clean_acked); ctx->push_pending_record = tls_device_push_pending_record; /* TLS offload is greatly simplified if we don't send * SKBs where only part of the payload needs to be encrypted. * So mark the last skb in the write queue as end of record. */ tcp_write_collapse_fence(sk); /* Avoid offloading if the device is down * We don't want to offload new flows after * the NETDEV_DOWN event * * device_offload_lock is taken in tls_devices's NETDEV_DOWN * handler thus protecting from the device going down before * ctx was added to tls_device_list. */ down_read(&device_offload_lock); if (!(netdev->flags & IFF_UP)) { rc = -EINVAL; goto release_lock; } ctx->priv_ctx_tx = offload_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, &ctx->crypto_send.info, tcp_sk(sk)->write_seq); trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_TX, tcp_sk(sk)->write_seq, rec_seq, rc); if (rc) goto release_lock; tls_device_attach(ctx, sk, netdev); up_read(&device_offload_lock); /* following this assignment tls_is_skb_tx_device_offloaded * will return true and the context might be accessed * by the netdev's xmit function. */ smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb); dev_put(netdev); return 0; release_lock: up_read(&device_offload_lock); clean_acked_data_disable(tcp_sk(sk)); crypto_free_aead(offload_ctx->aead_send); free_offload_ctx: kfree(offload_ctx); ctx->priv_ctx_tx = NULL; free_marker_record: kfree(start_marker_record); release_netdev: dev_put(netdev); return rc; } int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) { struct tls12_crypto_info_aes_gcm_128 *info; struct tls_offload_context_rx *context; struct net_device *netdev; int rc = 0; if (ctx->crypto_recv.info.version != TLS_1_2_VERSION) return -EOPNOTSUPP; netdev = get_netdev_for_sock(sk); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); return -EINVAL; } if (!(netdev->features & NETIF_F_HW_TLS_RX)) { rc = -EOPNOTSUPP; goto release_netdev; } /* Avoid offloading if the device is down * We don't want to offload new flows after * the NETDEV_DOWN event * * device_offload_lock is taken in tls_devices's NETDEV_DOWN * handler thus protecting from the device going down before * ctx was added to tls_device_list. */ down_read(&device_offload_lock); if (!(netdev->flags & IFF_UP)) { rc = -EINVAL; goto release_lock; } context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) { rc = -ENOMEM; goto release_lock; } context->resync_nh_reset = 1; ctx->priv_ctx_rx = context; rc = tls_set_sw_offload(sk, 0, NULL); if (rc) goto release_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); info = (void *)&ctx->crypto_recv.info; trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_RX, tcp_sk(sk)->copied_seq, info->rec_seq, rc); if (rc) goto free_sw_resources; tls_device_attach(ctx, sk, netdev); up_read(&device_offload_lock); dev_put(netdev); return 0; free_sw_resources: up_read(&device_offload_lock); tls_sw_free_resources_rx(sk); down_read(&device_offload_lock); release_ctx: ctx->priv_ctx_rx = NULL; release_lock: up_read(&device_offload_lock); release_netdev: dev_put(netdev); return rc; } void tls_device_offload_cleanup_rx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct net_device *netdev; down_read(&device_offload_lock); netdev = rcu_dereference_protected(tls_ctx->netdev, lockdep_is_held(&device_offload_lock)); if (!netdev) goto out; netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, TLS_OFFLOAD_CTX_DIR_RX); if (tls_ctx->tx_conf != TLS_HW) { dev_put(netdev); rcu_assign_pointer(tls_ctx->netdev, NULL); } else { set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags); } out: up_read(&device_offload_lock); tls_sw_release_resources_rx(sk); } static int tls_device_down(struct net_device *netdev) { struct tls_context *ctx, *tmp; unsigned long flags; LIST_HEAD(list); /* Request a write lock to block new offload attempts */ down_write(&device_offload_lock); spin_lock_irqsave(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) { struct net_device *ctx_netdev = rcu_dereference_protected(ctx->netdev, lockdep_is_held(&device_offload_lock)); if (ctx_netdev != netdev || !refcount_inc_not_zero(&ctx->refcount)) continue; list_move(&ctx->list, &list); } spin_unlock_irqrestore(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &list, list) { /* Stop offloaded TX and switch to the fallback. * tls_is_skb_tx_device_offloaded will return false. */ WRITE_ONCE(ctx->sk->sk_validate_xmit_skb, tls_validate_xmit_skb_sw); /* Stop the RX and TX resync. * tls_dev_resync must not be called after tls_dev_del. */ rcu_assign_pointer(ctx->netdev, NULL); /* Start skipping the RX resync logic completely. */ set_bit(TLS_RX_DEV_DEGRADED, &ctx->flags); /* Sync with inflight packets. After this point: * TX: no non-encrypted packets will be passed to the driver. * RX: resync requests from the driver will be ignored. */ synchronize_net(); /* Release the offload context on the driver side. */ if (ctx->tx_conf == TLS_HW) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); if (ctx->rx_conf == TLS_HW && !test_bit(TLS_RX_DEV_CLOSED, &ctx->flags)) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_RX); dev_put(netdev); /* Move the context to a separate list for two reasons: * 1. When the context is deallocated, list_del is called. * 2. It's no longer an offloaded context, so we don't want to * run offload-specific code on this context. */ spin_lock_irqsave(&tls_device_lock, flags); list_move_tail(&ctx->list, &tls_device_down_list); spin_unlock_irqrestore(&tls_device_lock, flags); /* Device contexts for RX and TX will be freed in on sk_destruct * by tls_device_free_ctx. rx_conf and tx_conf stay in TLS_HW. * Now release the ref taken above. */ if (refcount_dec_and_test(&ctx->refcount)) { /* sk_destruct ran after tls_device_down took a ref, and * it returned early. Complete the destruction here. */ list_del(&ctx->list); tls_device_free_ctx(ctx); } } up_write(&device_offload_lock); flush_workqueue(destruct_wq); return NOTIFY_DONE; } static int tls_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!dev->tlsdev_ops && !(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: if (netif_is_bond_master(dev)) return NOTIFY_DONE; if ((dev->features & NETIF_F_HW_TLS_RX) && !dev->tlsdev_ops->tls_dev_resync) return NOTIFY_BAD; if (dev->tlsdev_ops && dev->tlsdev_ops->tls_dev_add && dev->tlsdev_ops->tls_dev_del) return NOTIFY_DONE; else return NOTIFY_BAD; case NETDEV_DOWN: return tls_device_down(dev); } return NOTIFY_DONE; } static struct notifier_block tls_dev_notifier = { .notifier_call = tls_dev_event, }; int __init tls_device_init(void) { int err; dummy_page = alloc_page(GFP_KERNEL); if (!dummy_page) return -ENOMEM; destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0); if (!destruct_wq) { err = -ENOMEM; goto err_free_dummy; } err = register_netdevice_notifier(&tls_dev_notifier); if (err) goto err_destroy_wq; return 0; err_destroy_wq: destroy_workqueue(destruct_wq); err_free_dummy: put_page(dummy_page); return err; } void __exit tls_device_cleanup(void) { unregister_netdevice_notifier(&tls_dev_notifier); destroy_workqueue(destruct_wq); clean_acked_data_flush(); put_page(dummy_page); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_IP6_ROUTE_H #define _NET_IP6_ROUTE_H #include <net/addrconf.h> #include <net/flow.h> #include <net/ip6_fib.h> #include <net/sock.h> #include <net/lwtunnel.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/route.h> #include <net/nexthop.h> struct route_info { __u8 type; __u8 length; __u8 prefix_len; #if defined(__BIG_ENDIAN_BITFIELD) __u8 reserved_h:3, route_pref:2, reserved_l:3; #elif defined(__LITTLE_ENDIAN_BITFIELD) __u8 reserved_l:3, route_pref:2, reserved_h:3; #endif __be32 lifetime; __u8 prefix[]; /* 0,8 or 16 */ }; #define RT6_LOOKUP_F_IFACE 0x00000001 #define RT6_LOOKUP_F_REACHABLE 0x00000002 #define RT6_LOOKUP_F_HAS_SADDR 0x00000004 #define RT6_LOOKUP_F_SRCPREF_TMP 0x00000008 #define RT6_LOOKUP_F_SRCPREF_PUBLIC 0x00000010 #define RT6_LOOKUP_F_SRCPREF_COA 0x00000020 #define RT6_LOOKUP_F_IGNORE_LINKSTATE 0x00000040 #define RT6_LOOKUP_F_DST_NOREF 0x00000080 /* We do not (yet ?) support IPv6 jumbograms (RFC 2675) * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header */ #define IP6_MAX_MTU (0xFFFF + sizeof(struct ipv6hdr)) /* * rt6_srcprefs2flags() and rt6_flags2srcprefs() translate * between IPV6_ADDR_PREFERENCES socket option values * IPV6_PREFER_SRC_TMP = 0x1 * IPV6_PREFER_SRC_PUBLIC = 0x2 * IPV6_PREFER_SRC_COA = 0x4 * and above RT6_LOOKUP_F_SRCPREF_xxx flags. */ static inline int rt6_srcprefs2flags(unsigned int srcprefs) { return (srcprefs & IPV6_PREFER_SRC_MASK) << 3; } static inline unsigned int rt6_flags2srcprefs(int flags) { return (flags >> 3) & IPV6_PREFER_SRC_MASK; } static inline bool rt6_need_strict(const struct in6_addr *daddr) { return ipv6_addr_type(daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } /* fib entries using a nexthop object can not be coalesced into * a multipath route */ static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { /* the RTF_ADDRCONF flag filters out RA's */ return !(f6i->fib6_flags & RTF_ADDRCONF) && !f6i->nh && f6i->fib6_nh->fib_nh_gw_family; } void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, struct flowi6 *fl6, const struct sk_buff *skb, int flags); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); static inline struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, struct flowi6 *fl6) { return ip6_route_output_flags(net, sk, fl6, 0); } /* Only conditionally release dst if flags indicates * !RT6_LOOKUP_F_DST_NOREF or dst is in uncached_list. */ static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags) { if (!(flags & RT6_LOOKUP_F_DST_NOREF) || !list_empty(&rt->dst.rt_uncached)) ip6_rt_put(rt); } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags); struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int ifindex, struct flowi6 *fl6, const struct sk_buff *skb, int flags); void ip6_route_init_special_entries(void); int ip6_route_init(void); void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct fib6_info *f6i); int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify); void rt6_flush_exceptions(struct fib6_info *f6i); void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now); static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, const struct in6_addr *daddr, unsigned int prefs, int l3mdev_index, struct in6_addr *saddr) { struct net_device *l3mdev; struct net_device *dev; bool same_vrf; int err = 0; rcu_read_lock(); l3mdev = dev_get_by_index_rcu(net, l3mdev_index); if (!f6i || !f6i->fib6_prefsrc.plen || l3mdev) dev = f6i ? fib6_info_nh_dev(f6i) : NULL; same_vrf = !l3mdev || l3mdev_master_dev_rcu(dev) == l3mdev; if (f6i && f6i->fib6_prefsrc.plen && same_vrf) *saddr = f6i->fib6_prefsrc.addr; else err = ipv6_dev_get_saddr(net, same_vrf ? dev : l3mdev, daddr, prefs, saddr); rcu_read_unlock(); return err; } struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, const struct sk_buff *skb, int flags); u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *hkeys); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags, struct netlink_ext_ack *extack); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); /* * support functions for ND * */ struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev); struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, u32 defrtr_usr_metric, int lifetime); void rt6_purge_dflt_routers(struct net *net); int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr); void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark, kuid_t uid); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, kuid_t uid); void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif); void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk); struct netlink_callback; struct rt6_rtnl_dump_arg { struct sk_buff *skb; struct netlink_callback *cb; struct net *net; struct fib_dump_filter filter; }; int rt6_dump_route(struct fib6_info *f6i, void *p_arg, unsigned int skip); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned char nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); void rt6_multipath_rebalance(struct fib6_info *f6i); void rt6_uncached_list_add(struct rt6_info *rt); void rt6_uncached_list_del(struct rt6_info *rt); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { const struct dst_entry *dst = skb_dst(skb); if (dst) return dst_rt6_info(dst); return NULL; } /* * Store a destination cache entry in a socket */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct ipv6_pinfo *np = inet6_sk(sk); np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); sk_setup_caps(sk, dst); np->daddr_cache = daddr; #ifdef CONFIG_IPV6_SUBTREES np->saddr_cache = saddr; #endif } void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, const struct flowi6 *fl6); static inline bool ipv6_unicast_destination(const struct sk_buff *skb) { const struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); return rt->rt6i_flags & RTF_LOCAL; } static inline bool ipv6_anycast_destination(const struct dst_entry *dst, const struct in6_addr *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); return rt->rt6i_flags & RTF_ANYCAST || (rt->rt6i_dst.plen < 127 && !(rt->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)); } int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb) { const struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; const struct dst_entry *dst = skb_dst(skb); unsigned int mtu; if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) { mtu = READ_ONCE(dst->dev->mtu); mtu -= lwtunnel_headroom(dst->lwtstate, mtu); } else { mtu = dst_mtu(dst); } return mtu; } static inline bool ip6_sk_accept_pmtu(const struct sock *sk) { u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc); return pmtudisc != IPV6_PMTUDISC_INTERFACE && pmtudisc != IPV6_PMTUDISC_OMIT; } static inline bool ip6_sk_ignore_df(const struct sock *sk) { u8 pmtudisc = READ_ONCE(inet6_sk(sk)->pmtudisc); return pmtudisc < IPV6_PMTUDISC_DO || pmtudisc == IPV6_PMTUDISC_OMIT; } static inline const struct in6_addr *rt6_nexthop(const struct rt6_info *rt, const struct in6_addr *daddr) { if (rt->rt6i_flags & RTF_GATEWAY) return &rt->rt6i_gateway; else if (unlikely(rt->rt6i_flags & RTF_CACHE)) return &rt->rt6i_dst.addr; else return daddr; } static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { struct fib6_nh *nha, *nhb; if (a->nh || b->nh) return nexthop_cmp(a->nh, b->nh); nha = a->fib6_nh; nhb = b->fib6_nh; return nha->fib_nh_dev == nhb->fib_nh_dev && ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) && !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws); } static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { struct inet6_dev *idev; unsigned int mtu; if (!forwarding || dst_metric_locked(dst, RTAX_MTU)) { mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) goto out; } mtu = IPV6_MIN_MTU; rcu_read_lock(); idev = __in6_dev_get(dst->dev); if (idev) mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); out: return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } u32 ip6_mtu_from_fib6(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr); struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr); #endif
2 2 254 254 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 /* * Compatibility functions which bloat the callers too much to make inline. * All of the callers of these functions should be converted to use folios * eventually. */ #include <linux/migrate.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include "internal.h" void unlock_page(struct page *page) { return folio_unlock(page_folio(page)); } EXPORT_SYMBOL(unlock_page); void end_page_writeback(struct page *page) { return folio_end_writeback(page_folio(page)); } EXPORT_SYMBOL(end_page_writeback); void wait_on_page_writeback(struct page *page) { return folio_wait_writeback(page_folio(page)); } EXPORT_SYMBOL_GPL(wait_on_page_writeback); void mark_page_accessed(struct page *page) { folio_mark_accessed(page_folio(page)); } EXPORT_SYMBOL(mark_page_accessed); void set_page_writeback(struct page *page) { folio_start_writeback(page_folio(page)); } EXPORT_SYMBOL(set_page_writeback); bool set_page_dirty(struct page *page) { return folio_mark_dirty(page_folio(page)); } EXPORT_SYMBOL(set_page_dirty); int set_page_dirty_lock(struct page *page) { return folio_mark_dirty_lock(page_folio(page)); } EXPORT_SYMBOL(set_page_dirty_lock); bool clear_page_dirty_for_io(struct page *page) { return folio_clear_dirty_for_io(page_folio(page)); } EXPORT_SYMBOL(clear_page_dirty_for_io); bool redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) { return folio_redirty_for_writepage(wbc, page_folio(page)); } EXPORT_SYMBOL(redirty_page_for_writepage); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp) { return filemap_add_folio(mapping, page_folio(page), index, gfp); } EXPORT_SYMBOL(add_to_page_cache_lru); noinline struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp) { struct folio *folio; folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); if (IS_ERR(folio)) return NULL; return folio_file_page(folio, index); } EXPORT_SYMBOL(pagecache_get_page);
6 6 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 // SPDX-License-Identifier: GPL-2.0 /* net/sched/sch_taprio.c Time Aware Priority Scheduler * * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com> * */ #include <linux/ethtool.h> #include <linux/ethtool_netlink.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/list.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/math64.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/time.h> #include <net/gso.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/sch_generic.h> #include <net/sock.h> #include <net/tcp.h> #define TAPRIO_STAT_NOT_SET (~0ULL) #include "sch_mqprio_lib.h" static LIST_HEAD(taprio_list); static struct static_key_false taprio_have_broken_mqprio; static struct static_key_false taprio_have_working_mqprio; #define TAPRIO_ALL_GATES_OPEN -1 #define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) #define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_SUPPORTED_FLAGS \ (TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { /* Durations between this GCL entry and the GCL entry where the * respective traffic class gate closes */ u64 gate_duration[TC_MAX_QUEUE]; atomic_t budget[TC_MAX_QUEUE]; /* The qdisc makes some effort so that no packet leaves * after this time */ ktime_t gate_close_time[TC_MAX_QUEUE]; struct list_head list; /* Used to calculate when to advance the schedule */ ktime_t end_time; ktime_t next_txtime; int index; u32 gate_mask; u32 interval; u8 command; }; struct sched_gate_list { /* Longest non-zero contiguous gate durations per traffic class, * or 0 if a traffic class gate never opens during the schedule. */ u64 max_open_gate_duration[TC_MAX_QUEUE]; u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ u32 max_sdu[TC_MAX_QUEUE]; /* for dump */ struct rcu_head rcu; struct list_head entries; size_t num_entries; ktime_t cycle_end_time; s64 cycle_time; s64 cycle_time_extension; s64 base_time; }; struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; u32 flags; enum tk_offsets tk_offset; int clockid; bool offloaded; bool detected_mqprio; bool broken_mqprio; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ /* Protects the update side of the RCU protected current_entry */ spinlock_t current_entry_lock; struct sched_entry __rcu *current_entry; struct sched_gate_list __rcu *oper_sched; struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; int cur_txq[TC_MAX_QUEUE]; u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */ u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */ u32 txtime_delay; }; struct __tc_taprio_qopt_offload { refcount_t users; struct tc_taprio_qopt_offload offload; }; static void taprio_calculate_gate_durations(struct taprio_sched *q, struct sched_gate_list *sched) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *cur; int tc; list_for_each_entry(entry, &sched->entries, list) { u32 gates_still_open = entry->gate_mask; /* For each traffic class, calculate each open gate duration, * starting at this schedule entry and ending at the schedule * entry containing a gate close event for that TC. */ cur = entry; do { if (!gates_still_open) break; for (tc = 0; tc < num_tc; tc++) { if (!(gates_still_open & BIT(tc))) continue; if (cur->gate_mask & BIT(tc)) entry->gate_duration[tc] += cur->interval; else gates_still_open &= ~BIT(tc); } cur = list_next_entry_circular(cur, &sched->entries, list); } while (cur != entry); /* Keep track of the maximum gate duration for each traffic * class, taking care to not confuse a traffic class which is * temporarily closed with one that is always closed. */ for (tc = 0; tc < num_tc; tc++) if (entry->gate_duration[tc] && sched->max_open_gate_duration[tc] < entry->gate_duration[tc]) sched->max_open_gate_duration[tc] = entry->gate_duration[tc]; } } static bool taprio_entry_allows_tx(ktime_t skb_end_time, struct sched_entry *entry, int tc) { return ktime_before(skb_end_time, entry->gate_close_time[tc]); } static ktime_t sched_base_time(const struct sched_gate_list *sched) { if (!sched) return KTIME_MAX; return ns_to_ktime(sched->base_time); } static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) { /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); switch (tk_offset) { case TK_OFFS_MAX: return mono; default: return ktime_mono_to_any(mono, tk_offset); } } static ktime_t taprio_get_time(const struct taprio_sched *q) { return taprio_mono_to_any(q, ktime_get()); } static void taprio_free_sched_cb(struct rcu_head *head) { struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); struct sched_entry *entry, *n; list_for_each_entry_safe(entry, n, &sched->entries, list) { list_del(&entry->list); kfree(entry); } kfree(sched); } static void switch_schedules(struct taprio_sched *q, struct sched_gate_list **admin, struct sched_gate_list **oper) { rcu_assign_pointer(q->oper_sched, *admin); rcu_assign_pointer(q->admin_sched, NULL); if (*oper) call_rcu(&(*oper)->rcu, taprio_free_sched_cb); *oper = *admin; *admin = NULL; } /* Get how much time has been already elapsed in the current cycle. */ static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time) { ktime_t time_since_sched_start; s32 time_elapsed; time_since_sched_start = ktime_sub(time, sched->base_time); div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed); return time_elapsed; } static ktime_t get_interval_end_time(struct sched_gate_list *sched, struct sched_gate_list *admin, struct sched_entry *entry, ktime_t intv_start) { s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start); ktime_t intv_end, cycle_ext_end, cycle_end; cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed); intv_end = ktime_add_ns(intv_start, entry->interval); cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension); if (ktime_before(intv_end, cycle_end)) return intv_end; else if (admin && admin != sched && ktime_after(admin->base_time, cycle_end) && ktime_before(admin->base_time, cycle_ext_end)) return admin->base_time; else return cycle_end; } static int length_to_duration(struct taprio_sched *q, int len) { return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC); } static int duration_to_length(struct taprio_sched *q, u64 duration) { return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); } /* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the * q->max_sdu[] requested by the user and the max_sdu dynamically determined by * the maximum open gate durations at the given link speed. */ static void taprio_update_queue_max_sdu(struct taprio_sched *q, struct sched_gate_list *sched, struct qdisc_size_table *stab) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); u32 max_sdu_from_user; u32 max_sdu_dynamic; u32 max_sdu; int tc; for (tc = 0; tc < num_tc; tc++) { max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX; /* TC gate never closes => keep the queueMaxSDU * selected by the user */ if (sched->max_open_gate_duration[tc] == sched->cycle_time) { max_sdu_dynamic = U32_MAX; } else { u32 max_frm_len; max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]); /* Compensate for L1 overhead from size table, * but don't let the frame size go negative */ if (stab) { max_frm_len -= stab->szopts.overhead; max_frm_len = max_t(int, max_frm_len, dev->hard_header_len + 1); } max_sdu_dynamic = max_frm_len - dev->hard_header_len; if (max_sdu_dynamic > dev->max_mtu) max_sdu_dynamic = U32_MAX; } max_sdu = min(max_sdu_dynamic, max_sdu_from_user); if (max_sdu != U32_MAX) { sched->max_frm_len[tc] = max_sdu + dev->hard_header_len; sched->max_sdu[tc] = max_sdu; } else { sched->max_frm_len[tc] = U32_MAX; /* never oversized */ sched->max_sdu[tc] = 0; } } } /* Returns the entry corresponding to next available interval. If * validate_interval is set, it only validates whether the timestamp occurs * when the gate corresponding to the skb's traffic class is open. */ static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, struct Qdisc *sch, struct sched_gate_list *sched, struct sched_gate_list *admin, ktime_t time, ktime_t *interval_start, ktime_t *interval_end, bool validate_interval) { ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time; ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time; struct sched_entry *entry = NULL, *entry_found = NULL; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); bool entry_available = false; s32 cycle_elapsed; int tc, n; tc = netdev_get_prio_tc_map(dev, skb->priority); packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb)); *interval_start = 0; *interval_end = 0; if (!sched) return NULL; cycle = sched->cycle_time; cycle_elapsed = get_cycle_time_elapsed(sched, time); curr_intv_end = ktime_sub_ns(time, cycle_elapsed); cycle_end = ktime_add_ns(curr_intv_end, cycle); list_for_each_entry(entry, &sched->entries, list) { curr_intv_start = curr_intv_end; curr_intv_end = get_interval_end_time(sched, admin, entry, curr_intv_start); if (ktime_after(curr_intv_start, cycle_end)) break; if (!(entry->gate_mask & BIT(tc)) || packet_transmit_time > entry->interval) continue; txtime = entry->next_txtime; if (ktime_before(txtime, time) || validate_interval) { transmit_end_time = ktime_add_ns(time, packet_transmit_time); if ((ktime_before(curr_intv_start, time) && ktime_before(transmit_end_time, curr_intv_end)) || (ktime_after(curr_intv_start, time) && !validate_interval)) { entry_found = entry; *interval_start = curr_intv_start; *interval_end = curr_intv_end; break; } else if (!entry_available && !validate_interval) { /* Here, we are just trying to find out the * first available interval in the next cycle. */ entry_available = true; entry_found = entry; *interval_start = ktime_add_ns(curr_intv_start, cycle); *interval_end = ktime_add_ns(curr_intv_end, cycle); } } else if (ktime_before(txtime, earliest_txtime) && !entry_available) { earliest_txtime = txtime; entry_found = entry; n = div_s64(ktime_sub(txtime, curr_intv_start), cycle); *interval_start = ktime_add(curr_intv_start, n * cycle); *interval_end = ktime_add(curr_intv_end, n * cycle); } } return entry_found; } static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t interval_start, interval_end; struct sched_entry *entry; rcu_read_lock(); sched = rcu_dereference(q->oper_sched); admin = rcu_dereference(q->admin_sched); entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp, &interval_start, &interval_end, true); rcu_read_unlock(); return entry; } /* This returns the tstamp value set by TCP in terms of the set clock. */ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) { unsigned int offset = skb_network_offset(skb); const struct ipv6hdr *ipv6h; const struct iphdr *iph; struct ipv6hdr _ipv6h; ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h) return 0; if (ipv6h->version == 4) { iph = (struct iphdr *)ipv6h; offset += iph->ihl * 4; /* special-case 6in4 tunnelling, as that is a common way to get * v6 connectivity in the home */ if (iph->protocol == IPPROTO_IPV6) { ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) return 0; } else if (iph->protocol != IPPROTO_TCP) { return 0; } } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) { return 0; } return taprio_mono_to_any(q, skb->skb_mstamp_ns); } /* There are a few scenarios where we will have to modify the txtime from * what is read from next_txtime in sched_entry. They are: * 1. If txtime is in the past, * a. The gate for the traffic class is currently open and packet can be * transmitted before it closes, schedule the packet right away. * b. If the gate corresponding to the traffic class is going to open later * in the cycle, set the txtime of packet to the interval start. * 2. If txtime is in the future, there are packets corresponding to the * current traffic class waiting to be transmitted. So, the following * possibilities exist: * a. We can transmit the packet before the window containing the txtime * closes. * b. The window might close before the transmission can be completed * successfully. So, schedule the packet in the next open window. */ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) { ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp; struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t minimum_time, now, txtime; int len, packet_transmit_time; struct sched_entry *entry; bool sched_changed; now = taprio_get_time(q); minimum_time = ktime_add_ns(now, q->txtime_delay); tcp_tstamp = get_tcp_tstamp(q, skb); minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp); rcu_read_lock(); admin = rcu_dereference(q->admin_sched); sched = rcu_dereference(q->oper_sched); if (admin && ktime_after(minimum_time, admin->base_time)) switch_schedules(q, &admin, &sched); /* Until the schedule starts, all the queues are open */ if (!sched || ktime_before(minimum_time, sched->base_time)) { txtime = minimum_time; goto done; } len = qdisc_pkt_len(skb); packet_transmit_time = length_to_duration(q, len); do { sched_changed = false; entry = find_entry_to_transmit(skb, sch, sched, admin, minimum_time, &interval_start, &interval_end, false); if (!entry) { txtime = 0; goto done; } txtime = entry->next_txtime; txtime = max_t(ktime_t, txtime, minimum_time); txtime = max_t(ktime_t, txtime, interval_start); if (admin && admin != sched && ktime_after(txtime, admin->base_time)) { sched = admin; sched_changed = true; continue; } transmit_end_time = ktime_add(txtime, packet_transmit_time); minimum_time = transmit_end_time; /* Update the txtime of current entry to the next time it's * interval starts. */ if (ktime_after(transmit_end_time, interval_end)) entry->next_txtime = ktime_add(interval_start, sched->cycle_time); } while (sched_changed || ktime_after(transmit_end_time, interval_end)); entry->next_txtime = transmit_end_time; done: rcu_read_unlock(); return txtime; } /* Devices with full offload are expected to honor this in hardware */ static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *sched; int prio = skb->priority; bool exceeds = false; u8 tc; tc = netdev_get_prio_tc_map(dev, prio); rcu_read_lock(); sched = rcu_dereference(q->oper_sched); if (sched && skb->len > sched->max_frm_len[tc]) exceeds = true; rcu_read_unlock(); return exceeds; } static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); /* sk_flags are only safe to use on full sockets. */ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) return qdisc_drop(skb, sch, to_free); } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { skb->tstamp = get_packet_txtime(skb, sch); if (!skb->tstamp) return qdisc_drop(skb, sch, to_free); } qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return qdisc_enqueue(skb, child, to_free); } static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); netdev_features_t features = netif_skb_features(skb); struct sk_buff *segs, *nskb; int ret; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; slen += segs->len; /* FIXME: we should be segmenting to a smaller size * rather than dropping these */ if (taprio_skb_exceeds_queue_max_sdu(sch, segs)) ret = qdisc_drop(segs, sch, to_free); else ret = taprio_enqueue_one(segs, sch, child, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); } else { numsegs++; } } if (numsegs > 1) qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); consume_skb(skb); return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); struct Qdisc *child; int queue; queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) { /* Large packets might not be transmitted when the transmission * duration exceeds any configured interval. Therefore, segment * the skb into smaller chunks. Drivers with full offload are * expected to handle this in hardware. */ if (skb_is_gso(skb)) return taprio_enqueue_segmented(skb, sch, child, to_free); return qdisc_drop(skb, sch, to_free); } return taprio_enqueue_one(skb, sch, child, to_free); } static struct sk_buff *taprio_peek(struct Qdisc *sch) { WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented"); return NULL; } static void taprio_set_budgets(struct taprio_sched *q, struct sched_gate_list *sched, struct sched_entry *entry) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); int tc, budget; for (tc = 0; tc < num_tc; tc++) { /* Traffic classes which never close have infinite budget */ if (entry->gate_duration[tc] == sched->cycle_time) budget = INT_MAX; else budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); atomic_set(&entry->budget[tc], budget); } } /* When an skb is sent, it consumes from the budget of all traffic classes */ static int taprio_update_budgets(struct sched_entry *entry, size_t len, int tc_consumed, int num_tc) { int tc, budget, new_budget = 0; for (tc = 0; tc < num_tc; tc++) { budget = atomic_read(&entry->budget[tc]); /* Don't consume from infinite budget */ if (budget == INT_MAX) { if (tc == tc_consumed) new_budget = budget; continue; } if (tc == tc_consumed) new_budget = atomic_sub_return(len, &entry->budget[tc]); else atomic_sub(len, &entry->budget[tc]); } return new_budget; } static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct Qdisc *child = q->qdiscs[txq]; int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; ktime_t guard; int prio; int len; u8 tc; if (unlikely(!child)) return NULL; if (TXTIME_ASSIST_IS_ENABLED(q->flags)) goto skip_peek_checks; skb = child->ops->peek(child); if (!skb) return NULL; prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); if (!(gate_mask & BIT(tc))) return NULL; len = qdisc_pkt_len(skb); guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); /* In the case that there's no gate entry, there's no * guard band ... */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && !taprio_entry_allows_tx(guard, entry, tc)) return NULL; /* ... and no budget. */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && taprio_update_budgets(entry, len, tc, num_tc) < 0) return NULL; skip_peek_checks: skb = child->ops->dequeue(child); if (unlikely(!skb)) return NULL; qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq) { int offset = dev->tc_to_txq[tc].offset; int count = dev->tc_to_txq[tc].count; (*txq)++; if (*txq == offset + count) *txq = offset; } /* Prioritize higher traffic classes, and select among TXQs belonging to the * same TC using round robin */ static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; int tc; for (tc = num_tc - 1; tc >= 0; tc--) { int first_txq = q->cur_txq[tc]; if (!(gate_mask & BIT(tc))) continue; do { skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc], entry, gate_mask); taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]); if (q->cur_txq[tc] >= dev->num_tx_queues) q->cur_txq[tc] = first_txq; if (skb) return skb; } while (q->cur_txq[tc] != first_txq); } return NULL; } /* Broken way of prioritizing smaller TXQ indices and ignoring the traffic * class other than to determine whether the gate is open or not */ static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct net_device *dev = qdisc_dev(sch); struct sk_buff *skb; int i; for (i = 0; i < dev->num_tx_queues; i++) { skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); if (skb) return skb; } return NULL; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct sched_entry *entry; u32 gate_mask; rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't * start yet, so force all gates to be open, this is in * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5 * "AdminGateStates" */ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; if (!gate_mask) goto done; if (static_branch_unlikely(&taprio_have_broken_mqprio) && !static_branch_likely(&taprio_have_working_mqprio)) { /* Single NIC kind which is broken */ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); } else if (static_branch_likely(&taprio_have_working_mqprio) && !static_branch_unlikely(&taprio_have_broken_mqprio)) { /* Single NIC kind which prioritizes properly */ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } else { /* Mixed NIC kinds present in system, need dynamic testing */ if (q->broken_mqprio) skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); else skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } done: rcu_read_unlock(); return skb; } static bool should_restart_cycle(const struct sched_gate_list *oper, const struct sched_entry *entry) { if (list_is_last(&entry->list, &oper->entries)) return true; if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0) return true; return false; } static bool should_change_schedules(const struct sched_gate_list *admin, const struct sched_gate_list *oper, ktime_t end_time) { ktime_t next_base_time, extension_time; if (!admin) return false; next_base_time = sched_base_time(admin); /* This is the simple case, the end_time would fall after * the next schedule base_time. */ if (ktime_compare(next_base_time, end_time) <= 0) return true; /* This is the cycle_time_extension case, if the end_time * plus the amount that can be extended would fall after the * next schedule base_time, we can extend the current schedule * for that amount. */ extension_time = ktime_add_ns(end_time, oper->cycle_time_extension); /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about * how precisely the extension should be made. So after * conformance testing, this logic may change. */ if (ktime_compare(next_base_time, extension_time) <= 0) return true; return false; } static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, advance_timer); struct net_device *dev = qdisc_dev(q->root); struct sched_gate_list *oper, *admin; int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *next; struct Qdisc *sch = q->root; ktime_t end_time; int tc; spin_lock(&q->current_entry_lock); entry = rcu_dereference_protected(q->current_entry, lockdep_is_held(&q->current_entry_lock)); oper = rcu_dereference_protected(q->oper_sched, lockdep_is_held(&q->current_entry_lock)); admin = rcu_dereference_protected(q->admin_sched, lockdep_is_held(&q->current_entry_lock)); if (!oper) switch_schedules(q, &admin, &oper); /* This can happen in two cases: 1. this is the very first run * of this function (i.e. we weren't running any schedule * previously); 2. The previous schedule just ended. The first * entry of all schedules are pre-calculated during the * schedule initialization. */ if (unlikely(!entry || entry->end_time == oper->base_time)) { next = list_first_entry(&oper->entries, struct sched_entry, list); end_time = next->end_time; goto first_run; } if (should_restart_cycle(oper, entry)) { next = list_first_entry(&oper->entries, struct sched_entry, list); oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time, oper->cycle_time); } else { next = list_next_entry(entry, list); } end_time = ktime_add_ns(entry->end_time, next->interval); end_time = min_t(ktime_t, end_time, oper->cycle_end_time); for (tc = 0; tc < num_tc; tc++) { if (next->gate_duration[tc] == oper->cycle_time) next->gate_close_time[tc] = KTIME_MAX; else next->gate_close_time[tc] = ktime_add_ns(entry->end_time, next->gate_duration[tc]); } if (should_change_schedules(admin, oper, end_time)) { /* Set things so the next time this runs, the new * schedule runs. */ end_time = sched_base_time(admin); switch_schedules(q, &admin, &oper); } next->end_time = end_time; taprio_set_budgets(q, oper, next); first_run: rcu_assign_pointer(q->current_entry, next); spin_unlock(&q->current_entry_lock); hrtimer_set_expires(&q->advance_timer, end_time); rcu_read_lock(); __netif_schedule(sch); rcu_read_unlock(); return HRTIMER_RESTART; } static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 }, [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, }; static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { [TCA_TAPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, TC_QOPT_MAX_QUEUE), [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), }; static const struct netlink_range_validation_signed taprio_cycle_time_range = { .min = 0, .max = INT_MAX, }; static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) }, [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range), [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_FLAGS] = NLA_POLICY_MASK(NLA_U32, TAPRIO_SUPPORTED_FLAGS), [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED }, }; static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb, struct sched_entry *entry, struct netlink_ext_ack *extack) { int min_duration = length_to_duration(q, ETH_ZLEN); u32 interval = 0; if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD]) entry->command = nla_get_u8( tb[TCA_TAPRIO_SCHED_ENTRY_CMD]); if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]) entry->gate_mask = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]); if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]) interval = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]); /* The interval should allow at least the minimum ethernet * frame to go out. */ if (interval < min_duration) { NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL; } entry->interval = interval; return 0; } static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n, struct sched_entry *entry, int index, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; int err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, entry_policy, NULL); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; } entry->index = index; return fill_sched_entry(q, tb, entry, extack); } static int parse_sched_list(struct taprio_sched *q, struct nlattr *list, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { struct nlattr *n; int err, rem; int i = 0; if (!list) return -EINVAL; nla_for_each_nested(n, list, rem) { struct sched_entry *entry; if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) { NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'"); continue; } entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); return -ENOMEM; } err = parse_sched_entry(q, n, entry, i, extack); if (err < 0) { kfree(entry); return err; } list_add_tail(&entry->list, &sched->entries); i++; } sched->num_entries = i; return i; } static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, struct sched_gate_list *new, struct netlink_ext_ack *extack) { int err = 0; if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) { NL_SET_ERR_MSG(extack, "Adding a single entry is not supported"); return -ENOTSUPP; } if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]) new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]) new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack); if (err < 0) return err; if (!new->cycle_time) { struct sched_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); if (cycle < 0 || cycle > INT_MAX) { NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); return -EINVAL; } new->cycle_time = cycle; } if (new->cycle_time < new->num_entries * length_to_duration(q, ETH_ZLEN)) { NL_SET_ERR_MSG(extack, "'cycle_time' is too small"); return -EINVAL; } taprio_calculate_gate_durations(q, new); return 0; } static int taprio_parse_mqprio_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, struct netlink_ext_ack *extack, u32 taprio_flags) { bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags); if (!qopt) { if (!dev->num_tc) { NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); return -EINVAL; } return 0; } /* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) { NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); return -EINVAL; } /* For some reason, in txtime-assist mode, we allow TXQ ranges for * different TCs to overlap, and just validate the TXQ ranges. */ return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs, extack); } static int taprio_get_start_time(struct Qdisc *sch, struct sched_gate_list *sched, ktime_t *start) { struct taprio_sched *q = qdisc_priv(sch); ktime_t now, base, cycle; s64 n; base = sched_base_time(sched); now = taprio_get_time(q); if (ktime_after(base, now)) { *start = base; return 0; } cycle = sched->cycle_time; /* The qdisc is expected to have at least one sched_entry. Moreover, * any entry must have 'interval' > 0. Thus if the cycle time is zero, * something went really wrong. In that case, we should warn about this * inconsistent state and return error. */ if (WARN_ON(!cycle)) return -EFAULT; /* Schedule the start time for the beginning of the next * cycle. */ n = div64_s64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); return 0; } static void setup_first_end_time(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *first; ktime_t cycle; int tc; first = list_first_entry(&sched->entries, struct sched_entry, list); cycle = sched->cycle_time; /* FIXME: find a better place to do this */ sched->cycle_end_time = ktime_add_ns(base, cycle); first->end_time = ktime_add_ns(base, first->interval); taprio_set_budgets(q, sched, first); for (tc = 0; tc < num_tc; tc++) { if (first->gate_duration[tc] == sched->cycle_time) first->gate_close_time[tc] = KTIME_MAX; else first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]); } rcu_assign_pointer(q->current_entry, NULL); } static void taprio_start_sched(struct Qdisc *sch, ktime_t start, struct sched_gate_list *new) { struct taprio_sched *q = qdisc_priv(sch); ktime_t expires; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) return; expires = hrtimer_get_expires(&q->advance_timer); if (expires == 0) expires = KTIME_MAX; /* If the new schedule starts before the next expiration, we * reprogram it to the earliest one, so we change the admin * schedule to the operational one at the right time. */ start = min_t(ktime_t, start, expires); hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); } static void taprio_set_picos_per_byte(struct net_device *dev, struct taprio_sched *q) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; int picos_per_byte; int err; err = __ethtool_get_link_ksettings(dev, &ecmd); if (err < 0) goto skip; if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) speed = ecmd.base.speed; skip: picos_per_byte = (USEC_PER_SEC * 8) / speed; atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", dev->name, (long long)atomic64_read(&q->picos_per_byte), ecmd.base.speed); } static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct sched_gate_list *oper, *admin; struct qdisc_size_table *stab; struct taprio_sched *q; ASSERT_RTNL(); if (event != NETDEV_UP && event != NETDEV_CHANGE) return NOTIFY_DONE; list_for_each_entry(q, &taprio_list, taprio_list) { if (dev != qdisc_dev(q->root)) continue; taprio_set_picos_per_byte(dev, q); stab = rtnl_dereference(q->root->stab); rcu_read_lock(); oper = rcu_dereference(q->oper_sched); if (oper) taprio_update_queue_max_sdu(q, oper, stab); admin = rcu_dereference(q->admin_sched); if (admin) taprio_update_queue_max_sdu(q, admin, stab); rcu_read_unlock(); break; } return NOTIFY_DONE; } static void setup_txtime(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct sched_entry *entry; u64 interval = 0; list_for_each_entry(entry, &sched->entries, list) { entry->next_txtime = ktime_add_ns(base, interval); interval += entry->interval; } } static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) { struct __tc_taprio_qopt_offload *__offload; __offload = kzalloc(struct_size(__offload, offload.entries, num_entries), GFP_KERNEL); if (!__offload) return NULL; refcount_set(&__offload->users, 1); return &__offload->offload; } struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); refcount_inc(&__offload->users); return offload; } EXPORT_SYMBOL_GPL(taprio_offload_get); void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); if (!refcount_dec_and_test(&__offload->users)) return; kfree(__offload); } EXPORT_SYMBOL_GPL(taprio_offload_free); /* The function will only serve to keep the pointers to the "oper" and "admin" * schedules valid in relation to their base times, so when calling dump() the * users looks at the right schedules. * When using full offload, the admin configuration is promoted to oper at the * base_time in the PHC time domain. But because the system time is not * necessarily in sync with that, we can't just trigger a hrtimer to call * switch_schedules at the right hardware time. * At the moment we call this by hand right away from taprio, but in the future * it will be useful to create a mechanism for drivers to notify taprio of the * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump(). * This is left as TODO. */ static void taprio_offload_config_changed(struct taprio_sched *q) { struct sched_gate_list *oper, *admin; oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); switch_schedules(q, &admin, &oper); } static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) { u32 i, queue_mask = 0; for (i = 0; i < dev->num_tc; i++) { u32 offset, count; if (!(tc_mask & BIT(i))) continue; offset = dev->tc_to_txq[i].offset; count = dev->tc_to_txq[i].count; queue_mask |= GENMASK(offset + count - 1, offset); } return queue_mask; } static void taprio_sched_to_offload(struct net_device *dev, struct sched_gate_list *sched, struct tc_taprio_qopt_offload *offload, const struct tc_taprio_caps *caps) { struct sched_entry *entry; int i = 0; offload->base_time = sched->base_time; offload->cycle_time = sched->cycle_time; offload->cycle_time_extension = sched->cycle_time_extension; list_for_each_entry(entry, &sched->entries, list) { struct tc_taprio_sched_entry *e = &offload->entries[i]; e->command = entry->command; e->interval = entry->interval; if (caps->gate_mask_per_txq) e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask); else e->gate_mask = entry->gate_mask; i++; } offload->num_entries = i; } static void taprio_detect_broken_mqprio(struct taprio_sched *q) { struct net_device *dev = qdisc_dev(q->root); struct tc_taprio_caps caps; qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); q->broken_mqprio = caps.broken_mqprio; if (q->broken_mqprio) static_branch_inc(&taprio_have_broken_mqprio); else static_branch_inc(&taprio_have_working_mqprio); q->detected_mqprio = true; } static void taprio_cleanup_broken_mqprio(struct taprio_sched *q) { if (!q->detected_mqprio) return; if (q->broken_mqprio) static_branch_dec(&taprio_have_broken_mqprio); else static_branch_dec(&taprio_have_working_mqprio); } static int taprio_enable_offload(struct net_device *dev, struct taprio_sched *q, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; struct tc_taprio_caps caps; int tc, err = 0; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Device does not support taprio offload"); return -EOPNOTSUPP; } qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); if (!caps.supports_queue_max_sdu) { for (tc = 0; tc < TC_MAX_QUEUE; tc++) { if (q->max_sdu[tc]) { NL_SET_ERR_MSG_MOD(extack, "Device does not handle queueMaxSDU"); return -EOPNOTSUPP; } } } offload = taprio_offload_alloc(sched->num_entries); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory for enabling offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_REPLACE; offload->extack = extack; mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt); offload->mqprio.extack = extack; taprio_sched_to_offload(dev, sched, offload, &caps); mqprio_fp_to_offload(q->fp, &offload->mqprio); for (tc = 0; tc < TC_MAX_QUEUE; tc++) offload->max_sdu[tc] = q->max_sdu[tc]; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG_WEAK(extack, "Device failed to setup taprio offload"); goto done; } q->offloaded = true; done: /* The offload structure may linger around via a reference taken by the * device driver, so clear up the netlink extack pointer so that the * driver isn't tempted to dereference data which stopped being valid */ offload->extack = NULL; offload->mqprio.extack = NULL; taprio_offload_free(offload); return err; } static int taprio_disable_offload(struct net_device *dev, struct taprio_sched *q, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; int err; if (!q->offloaded) return 0; offload = taprio_offload_alloc(0); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory to disable offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_DESTROY; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG(extack, "Device failed to disable offload"); goto out; } q->offloaded = false; out: taprio_offload_free(offload); return err; } /* If full offload is enabled, the only possible clockid is the net device's * PHC. For that reason, specifying a clockid through netlink is incorrect. * For txtime-assist, it is implicitly assumed that the device's PHC is kept * in sync with the specified clockid via a user space daemon such as phc2sys. * For both software taprio and txtime-assist, the clockid is used for the * hrtimer that advances the schedule and hence mandatory. */ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int err = -EINVAL; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { const struct ethtool_ops *ops = dev->ethtool_ops; struct kernel_ethtool_ts_info info = { .cmd = ETHTOOL_GET_TS_INFO, .phc_index = -1, }; if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { NL_SET_ERR_MSG(extack, "The 'clockid' cannot be specified for full offload"); goto out; } if (ops && ops->get_ts_info) err = ops->get_ts_info(dev, &info); if (err || info.phc_index < 0) { NL_SET_ERR_MSG(extack, "Device does not have a PTP clock"); err = -ENOTSUPP; goto out; } } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); enum tk_offsets tk_offset; /* We only support static clockids and we don't allow * for it to be modified after the first init. */ if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) { NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported"); err = -ENOTSUPP; goto out; } switch (clockid) { case CLOCK_REALTIME: tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); err = -EINVAL; goto out; } /* This pairs with READ_ONCE() in taprio_mono_to_any */ WRITE_ONCE(q->tk_offset, tk_offset); q->clockid = clockid; } else { NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); goto out; } /* Everything went ok, return success. */ err = 0; out: return err; } static int taprio_parse_tc_entry(struct Qdisc *sch, struct nlattr *opt, u32 max_sdu[TC_QOPT_MAX_QUEUE], u32 fp[TC_QOPT_MAX_QUEUE], unsigned long *seen_tcs, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { }; struct net_device *dev = qdisc_dev(sch); int err, tc; u32 val; err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt, taprio_tc_policy, extack); if (err < 0) return err; if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) { NL_SET_ERR_MSG_MOD(extack, "TC entry index missing"); return -EINVAL; } tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); if (tc >= TC_QOPT_MAX_QUEUE) { NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); return -ERANGE; } if (*seen_tcs & BIT(tc)) { NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry"); return -EINVAL; } *seen_tcs |= BIT(tc); if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) { val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]); if (val > dev->max_mtu) { NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU"); return -ERANGE; } max_sdu[tc] = val; } if (tb[TCA_TAPRIO_TC_ENTRY_FP]) fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]); return 0; } static int taprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); u32 max_sdu[TC_QOPT_MAX_QUEUE]; bool have_preemption = false; unsigned long seen_tcs = 0; u32 fp[TC_QOPT_MAX_QUEUE]; struct nlattr *n; int tc, rem; int err = 0; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { max_sdu[tc] = q->max_sdu[tc]; fp[tc] = q->fp[tc]; } nla_for_each_nested_type(n, TCA_TAPRIO_ATTR_TC_ENTRY, opt, rem) { err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs, extack); if (err) return err; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { q->max_sdu[tc] = max_sdu[tc]; q->fp[tc] = fp[tc]; if (fp[tc] != TC_FP_EXPRESS) have_preemption = true; } if (have_preemption) { if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG(extack, "Preemption only supported with full offload"); return -EOPNOTSUPP; } if (!ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG(extack, "Device does not support preemption"); return -EOPNOTSUPP; } } return err; } static int taprio_mqprio_cmp(const struct net_device *dev, const struct tc_mqprio_qopt *mqprio) { int i; if (!mqprio || mqprio->num_tc != dev->num_tc) return -1; for (i = 0; i < mqprio->num_tc; i++) if (dev->tc_to_txq[i].count != mqprio->count[i] || dev->tc_to_txq[i].offset != mqprio->offset[i]) return -1; for (i = 0; i <= TC_BITMASK; i++) if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i]) return -1; return 0; } static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct qdisc_size_table *stab = rtnl_dereference(sch->stab); struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; struct sched_gate_list *oper, *admin, *new_admin; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; unsigned long flags; u32 taprio_flags; ktime_t start; int i, err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt, taprio_policy, extack); if (err < 0) return err; if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); /* The semantics of the 'flags' argument in relation to 'change()' * requests, are interpreted following two rules (which are applied in * this order): (1) an omitted 'flags' argument is interpreted as * zero; (2) the 'flags' of a "running" taprio instance cannot be * changed. */ taprio_flags = nla_get_u32_default(tb[TCA_TAPRIO_ATTR_FLAGS], 0); /* txtime-assist and full offload are mutually exclusive */ if ((taprio_flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && (taprio_flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_ATTR_FLAGS], "TXTIME_ASSIST and FULL_OFFLOAD are mutually exclusive"); return -EINVAL; } if (q->flags != TAPRIO_FLAGS_INVALID && q->flags != taprio_flags) { NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); return -EOPNOTSUPP; } q->flags = taprio_flags; /* Needed for length_to_duration() during netlink attribute parsing */ taprio_set_picos_per_byte(dev, q); err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) return err; err = taprio_parse_tc_entries(sch, opt, extack); if (err) return err; new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); if (!new_admin) { NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); return -ENOMEM; } INIT_LIST_HEAD(&new_admin->entries); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); /* no changes - no new mqprio settings */ if (!taprio_mqprio_cmp(dev, mqprio)) mqprio = NULL; if (mqprio && (oper || admin)) { NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported"); err = -ENOTSUPP; goto free_sched; } if (mqprio) { err = netdev_set_num_tc(dev, mqprio->num_tc); if (err) goto free_sched; for (i = 0; i < mqprio->num_tc; i++) { netdev_set_tc_queue(dev, i, mqprio->count[i], mqprio->offset[i]); q->cur_txq[i] = mqprio->offset[i]; } /* Always use supplied priority mappings */ for (i = 0; i <= TC_BITMASK; i++) netdev_set_prio_tc_map(dev, i, mqprio->prio_tc_map[i]); } err = parse_taprio_schedule(q, tb, new_admin, extack); if (err < 0) goto free_sched; if (new_admin->num_entries == 0) { NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule"); err = -EINVAL; goto free_sched; } err = taprio_parse_clockid(sch, tb, extack); if (err < 0) goto free_sched; taprio_update_queue_max_sdu(q, new_admin, stab); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) err = taprio_enable_offload(dev, q, new_admin, extack); else err = taprio_disable_offload(dev, q, extack); if (err) goto free_sched; /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled"); err = -EINVAL; goto unlock; } q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); } if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { hrtimer_setup(&q->advance_timer, advance_sched, q->clockid, HRTIMER_MODE_ABS); } err = taprio_get_start_time(sch, new_admin, &start); if (err < 0) { NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); goto unlock; } setup_txtime(q, new_admin, start); if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { if (!oper) { rcu_assign_pointer(q->oper_sched, new_admin); err = 0; new_admin = NULL; goto unlock; } /* Not going to race against advance_sched(), but still */ admin = rcu_replace_pointer(q->admin_sched, new_admin, lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); } else { setup_first_end_time(q, new_admin, start); /* Protects against advance_sched() */ spin_lock_irqsave(&q->current_entry_lock, flags); taprio_start_sched(sch, start, new_admin); admin = rcu_replace_pointer(q->admin_sched, new_admin, lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); spin_unlock_irqrestore(&q->current_entry_lock, flags); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) taprio_offload_config_changed(q); } new_admin = NULL; err = 0; if (!stab) NL_SET_ERR_MSG_MOD(extack, "Size table not specified, frame length estimations may be inaccurate"); unlock: spin_unlock_bh(qdisc_lock(sch)); free_sched: if (new_admin) call_rcu(&new_admin->rcu, taprio_free_sched_cb); return err; } static void taprio_reset(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i; hrtimer_cancel(&q->advance_timer); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) if (q->qdiscs[i]) qdisc_reset(q->qdiscs[i]); } } static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; unsigned int i; list_del(&q->taprio_list); /* Note that taprio_reset() might not be called if an error * happens in qdisc_create(), after taprio_init() has been called. */ hrtimer_cancel(&q->advance_timer); qdisc_synchronize(sch); taprio_disable_offload(dev, q, NULL); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) qdisc_put(q->qdiscs[i]); kfree(q->qdiscs); } q->qdiscs = NULL; netdev_reset_tc(dev); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); if (oper) call_rcu(&oper->rcu, taprio_free_sched_cb); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); taprio_cleanup_broken_mqprio(q); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i, tc; spin_lock_init(&q->current_entry_lock); hrtimer_setup(&q->advance_timer, advance_sched, CLOCK_TAI, HRTIMER_MODE_ABS); q->root = sch; /* We only support static clockids. Use an invalid value as default * and get the valid one on taprio_change(). */ q->clockid = -1; q->flags = TAPRIO_FLAGS_INVALID; list_add(&q->taprio_list, &taprio_list); if (sch->parent != TC_H_ROOT) { NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc"); return -EOPNOTSUPP; } if (!netif_is_multiqueue(dev)) { NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required"); return -EOPNOTSUPP; } q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]), GFP_KERNEL); if (!q->qdiscs) return -ENOMEM; if (!opt) return -EINVAL; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; struct Qdisc *qdisc; dev_queue = netdev_get_tx_queue(dev, i); qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1)), extack); if (!qdisc) return -ENOMEM; if (i < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); q->qdiscs[i] = qdisc; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) q->fp[tc] = TC_FP_EXPRESS; taprio_detect_broken_mqprio(q); return taprio_change(sch, opt, extack); } static void taprio_attach(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx; /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); struct Qdisc *old, *dev_queue_qdisc; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { struct Qdisc *qdisc = q->qdiscs[ntx]; /* In offload mode, the root taprio qdisc is bypassed * and the netdev TX queues see the children directly */ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue_qdisc = qdisc; } else { /* In software mode, attach the root taprio qdisc * to all netdev TX queues, so that dev_qdisc_enqueue() * goes through taprio_enqueue(). */ dev_queue_qdisc = sch; } old = dev_graft_qdisc(dev_queue, dev_queue_qdisc); /* The qdisc's refcount requires to be elevated once * for each netdev TX queue it is grafted onto */ qdisc_refcount_inc(dev_queue_qdisc); if (old) qdisc_put(old); } } static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return netdev_get_tx_queue(dev, ntx); } static int taprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); if (!dev_queue) return -EINVAL; if (dev->flags & IFF_UP) dev_deactivate(dev); /* In offload mode, the child Qdisc is directly attached to the netdev * TX queue, and thus, we need to keep its refcount elevated in order * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue. * However, save the reference to the new qdisc in the private array in * both software and offload cases, to have an up-to-date reference to * our children. */ *old = q->qdiscs[cl - 1]; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old); if (new) qdisc_refcount_inc(new); if (*old) qdisc_put(*old); } q->qdiscs[cl - 1] = new; if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; } static int dump_entry(struct sk_buff *msg, const struct sched_entry *entry) { struct nlattr *item; item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY); if (!item) return -ENOSPC; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, entry->gate_mask)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL, entry->interval)) goto nla_put_failure; return nla_nest_end(msg, item); nla_put_failure: nla_nest_cancel(msg, item); return -1; } static int dump_schedule(struct sk_buff *msg, const struct sched_gate_list *root) { struct nlattr *entry_list; struct sched_entry *entry; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, root->base_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, root->cycle_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, root->cycle_time_extension, TCA_TAPRIO_PAD)) return -1; entry_list = nla_nest_start_noflag(msg, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); if (!entry_list) goto error_nest; list_for_each_entry(entry, &root->entries, list) { if (dump_entry(msg, entry) < 0) goto error_nest; } nla_nest_end(msg, entry_list); return 0; error_nest: nla_nest_cancel(msg, entry_list); return -1; } static int taprio_dump_tc_entries(struct sk_buff *skb, struct taprio_sched *q, struct sched_gate_list *sched) { struct nlattr *n; int tc; for (tc = 0; tc < TC_MAX_QUEUE; tc++) { n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY); if (!n) return -EMSGSIZE; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc)) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU, sched->max_sdu[tc])) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc])) goto nla_put_failure; nla_nest_end(skb, n); } return 0; nla_put_failure: nla_nest_cancel(skb, n); return -EMSGSIZE; } static int taprio_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) { if (val == TAPRIO_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, TCA_TAPRIO_OFFLOAD_STATS_PAD)) return -EMSGSIZE; return 0; } static int taprio_dump_xstats(struct Qdisc *sch, struct gnet_dump *d, struct tc_taprio_qopt_offload *offload, struct tc_taprio_qopt_stats *stats) { struct net_device *dev = qdisc_dev(sch); const struct net_device_ops *ops; struct sk_buff *skb = d->skb; struct nlattr *xstats; int err; ops = qdisc_dev(sch)->netdev_ops; /* FIXME I could use qdisc_offload_dump_helper(), but that messes * with sch->flags depending on whether the device reports taprio * stats, and I'm not sure whether that's a good idea, considering * that stats are optional to the offload itself */ if (!ops->ndo_setup_tc) return 0; memset(stats, 0xff, sizeof(*stats)); err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err == -EOPNOTSUPP) return 0; if (err) return err; xstats = nla_nest_start(skb, TCA_STATS_APP); if (!xstats) goto err; if (taprio_put_stat(skb, stats->window_drops, TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) || taprio_put_stat(skb, stats->tx_overruns, TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS)) goto err_cancel; nla_nest_end(skb, xstats); return 0; err_cancel: nla_nest_cancel(skb, xstats); err: return -EMSGSIZE; } static int taprio_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_STATS, }; return taprio_dump_xstats(sch, d, &offload, &offload.stats); } static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; struct tc_mqprio_qopt opt = { 0 }; struct nlattr *nest, *sched_nest; mqprio_qopt_reconstruct(dev, &opt); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto start_error; if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) goto options_error; if (!FULL_OFFLOAD_IS_ENABLED(q->flags) && nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) goto options_error; if (q->txtime_delay && nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; rcu_read_lock(); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); if (oper && taprio_dump_tc_entries(skb, q, oper)) goto options_error_rcu; if (oper && dump_schedule(skb, oper)) goto options_error_rcu; if (!admin) goto done; sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED); if (!sched_nest) goto options_error_rcu; if (dump_schedule(skb, admin)) goto admin_error; nla_nest_end(skb, sched_nest); done: rcu_read_unlock(); return nla_nest_end(skb, nest); admin_error: nla_nest_cancel(skb, sched_nest); options_error_rcu: rcu_read_unlock(); options_error: nla_nest_cancel(skb, nest); start_error: return -ENOSPC; } static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return q->qdiscs[ntx]; } static unsigned long taprio_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); if (!taprio_queue_get(sch, ntx)) return 0; return ntx; } static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct Qdisc *child = taprio_leaf(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = child->handle; return 0; } static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) __releases(d->lock) __acquires(d->lock) { struct Qdisc *child = taprio_leaf(sch, cl); struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_QUEUE_STATS, .queue_stats = { .queue = cl - 1, }, }; if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 || qdisc_qstats_copy(d, child) < 0) return -1; return taprio_dump_xstats(sch, d, &offload, &offload.queue_stats.stats); } static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx; if (arg->stop) return; arg->count = arg->skip; for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { if (!tc_qdisc_stats_dump(sch, ntx + 1, arg)) break; } } static struct netdev_queue *taprio_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static const struct Qdisc_class_ops taprio_class_ops = { .graft = taprio_graft, .leaf = taprio_leaf, .find = taprio_find, .walk = taprio_walk, .dump = taprio_dump_class, .dump_stats = taprio_dump_class_stats, .select_queue = taprio_select_queue, }; static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .cl_ops = &taprio_class_ops, .id = "taprio", .priv_size = sizeof(struct taprio_sched), .init = taprio_init, .change = taprio_change, .destroy = taprio_destroy, .reset = taprio_reset, .attach = taprio_attach, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, .dump = taprio_dump, .dump_stats = taprio_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("taprio"); static struct notifier_block taprio_device_notifier = { .notifier_call = taprio_dev_notifier, }; static int __init taprio_module_init(void) { int err = register_netdevice_notifier(&taprio_device_notifier); if (err) return err; return register_qdisc(&taprio_qdisc_ops); } static void __exit taprio_module_exit(void) { unregister_qdisc(&taprio_qdisc_ops); unregister_netdevice_notifier(&taprio_device_notifier); } module_init(taprio_module_init); module_exit(taprio_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Time Aware Priority qdisc");
8 8 8 8 8 26 26 539 539 178 177 9 169 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2005-2010 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> * Kylene Hall <kjhall@us.ibm.com> * * File: evm_main.c * implements evm_inode_setxattr, evm_inode_post_setxattr, * evm_inode_removexattr, evm_verifyxattr, and evm_inode_set_acl. */ #define pr_fmt(fmt) "EVM: "fmt #include <linux/init.h> #include <linux/audit.h> #include <linux/xattr.h> #include <linux/integrity.h> #include <linux/evm.h> #include <linux/magic.h> #include <linux/posix_acl_xattr.h> #include <linux/lsm_hooks.h> #include <crypto/hash.h> #include <crypto/hash_info.h> #include <crypto/utils.h> #include "evm.h" int evm_initialized; static const char * const integrity_status_msg[] = { "pass", "pass_immutable", "fail", "fail_immutable", "no_label", "no_xattrs", "unknown" }; int evm_hmac_attrs; static struct xattr_list evm_config_default_xattrnames[] = { { .name = XATTR_NAME_SELINUX, .enabled = IS_ENABLED(CONFIG_SECURITY_SELINUX) }, { .name = XATTR_NAME_SMACK, .enabled = IS_ENABLED(CONFIG_SECURITY_SMACK) }, { .name = XATTR_NAME_SMACKEXEC, .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS) }, { .name = XATTR_NAME_SMACKTRANSMUTE, .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS) }, { .name = XATTR_NAME_SMACKMMAP, .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS) }, { .name = XATTR_NAME_APPARMOR, .enabled = IS_ENABLED(CONFIG_SECURITY_APPARMOR) }, { .name = XATTR_NAME_IMA, .enabled = IS_ENABLED(CONFIG_IMA_APPRAISE) }, { .name = XATTR_NAME_CAPS, .enabled = true }, }; LIST_HEAD(evm_config_xattrnames); static int evm_fixmode __ro_after_init; static int __init evm_set_fixmode(char *str) { if (strncmp(str, "fix", 3) == 0) evm_fixmode = 1; else pr_err("invalid \"%s\" mode", str); return 1; } __setup("evm=", evm_set_fixmode); static void __init evm_init_config(void) { int i, xattrs; xattrs = ARRAY_SIZE(evm_config_default_xattrnames); pr_info("Initialising EVM extended attributes:\n"); for (i = 0; i < xattrs; i++) { pr_info("%s%s\n", evm_config_default_xattrnames[i].name, !evm_config_default_xattrnames[i].enabled ? " (disabled)" : ""); list_add_tail(&evm_config_default_xattrnames[i].list, &evm_config_xattrnames); } #ifdef CONFIG_EVM_ATTR_FSUUID evm_hmac_attrs |= EVM_ATTR_FSUUID; #endif pr_info("HMAC attrs: 0x%x\n", evm_hmac_attrs); } static bool evm_key_loaded(void) { return (bool)(evm_initialized & EVM_KEY_MASK); } /* * This function determines whether or not it is safe to ignore verification * errors, based on the ability of EVM to calculate HMACs. If the HMAC key * is not loaded, and it cannot be loaded in the future due to the * EVM_SETUP_COMPLETE initialization flag, allowing an operation despite the * attrs/xattrs being found invalid will not make them valid. */ static bool evm_hmac_disabled(void) { if (evm_initialized & EVM_INIT_HMAC) return false; if (!(evm_initialized & EVM_SETUP_COMPLETE)) return false; return true; } static int evm_find_protected_xattrs(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); struct xattr_list *xattr; int error; int count = 0; if (!(inode->i_opflags & IOP_XATTR)) return -EOPNOTSUPP; list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) { error = __vfs_getxattr(dentry, inode, xattr->name, NULL, 0); if (error < 0) { if (error == -ENODATA) continue; return error; } count++; } return count; } static int is_unsupported_hmac_fs(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); if (inode->i_sb->s_iflags & SB_I_EVM_HMAC_UNSUPPORTED) { pr_info_once("%s not supported\n", inode->i_sb->s_type->name); return 1; } return 0; } /* * evm_verify_hmac - calculate and compare the HMAC with the EVM xattr * * Compute the HMAC on the dentry's protected set of extended attributes * and compare it against the stored security.evm xattr. * * For performance: * - use the previously retrieved xattr value and length to calculate the * HMAC.) * - cache the verification result in the iint, when available. * * Returns integrity status */ static enum integrity_status evm_verify_hmac(struct dentry *dentry, const char *xattr_name, char *xattr_value, size_t xattr_value_len) { struct evm_ima_xattr_data *xattr_data = NULL; struct signature_v2_hdr *hdr; enum integrity_status evm_status = INTEGRITY_PASS; struct evm_digest digest; struct inode *inode = d_backing_inode(dentry); struct evm_iint_cache *iint = evm_iint_inode(inode); int rc, xattr_len, evm_immutable = 0; if (iint && (iint->evm_status == INTEGRITY_PASS || iint->evm_status == INTEGRITY_PASS_IMMUTABLE)) return iint->evm_status; /* * On unsupported filesystems without EVM_INIT_X509 enabled, skip * signature verification. */ if (!(evm_initialized & EVM_INIT_X509) && is_unsupported_hmac_fs(dentry)) return INTEGRITY_UNKNOWN; /* if status is not PASS, try to check again - against -ENOMEM */ /* first need to know the sig type */ rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0) { evm_status = INTEGRITY_FAIL; if (rc == -ENODATA) { rc = evm_find_protected_xattrs(dentry); if (rc > 0) evm_status = INTEGRITY_NOLABEL; else if (rc == 0) evm_status = INTEGRITY_NOXATTRS; /* new file */ } else if (rc == -EOPNOTSUPP) { evm_status = INTEGRITY_UNKNOWN; } goto out; } xattr_len = rc; /* check value type */ switch (xattr_data->type) { case EVM_XATTR_HMAC: if (xattr_len != sizeof(struct evm_xattr)) { evm_status = INTEGRITY_FAIL; goto out; } digest.hdr.algo = HASH_ALGO_SHA1; rc = evm_calc_hmac(dentry, xattr_name, xattr_value, xattr_value_len, &digest, iint); if (rc) break; rc = crypto_memneq(xattr_data->data, digest.digest, SHA1_DIGEST_SIZE); if (rc) rc = -EINVAL; break; case EVM_XATTR_PORTABLE_DIGSIG: evm_immutable = 1; fallthrough; case EVM_IMA_XATTR_DIGSIG: /* accept xattr with non-empty signature field */ if (xattr_len <= sizeof(struct signature_v2_hdr)) { evm_status = INTEGRITY_FAIL; goto out; } hdr = (struct signature_v2_hdr *)xattr_data; digest.hdr.algo = hdr->hash_algo; rc = evm_calc_hash(dentry, xattr_name, xattr_value, xattr_value_len, xattr_data->type, &digest, iint); if (rc) break; rc = integrity_digsig_verify(INTEGRITY_KEYRING_EVM, (const char *)xattr_data, xattr_len, digest.digest, digest.hdr.length); if (!rc) { if (xattr_data->type == EVM_XATTR_PORTABLE_DIGSIG) { if (iint) iint->flags |= EVM_IMMUTABLE_DIGSIG; evm_status = INTEGRITY_PASS_IMMUTABLE; } else if (!IS_RDONLY(inode) && !(inode->i_sb->s_readonly_remount) && !IS_IMMUTABLE(inode) && !is_unsupported_hmac_fs(dentry)) { evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); } } break; default: rc = -EINVAL; break; } if (rc) { if (rc == -ENODATA) evm_status = INTEGRITY_NOXATTRS; else if (evm_immutable) evm_status = INTEGRITY_FAIL_IMMUTABLE; else evm_status = INTEGRITY_FAIL; } pr_debug("digest: (%d) [%*phN]\n", digest.hdr.length, digest.hdr.length, digest.digest); out: if (iint) iint->evm_status = evm_status; kfree(xattr_data); return evm_status; } static int evm_protected_xattr_common(const char *req_xattr_name, bool all_xattrs) { int namelen; int found = 0; struct xattr_list *xattr; namelen = strlen(req_xattr_name); list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) { if (!all_xattrs && !xattr->enabled) continue; if ((strlen(xattr->name) == namelen) && (strncmp(req_xattr_name, xattr->name, namelen) == 0)) { found = 1; break; } if (strncmp(req_xattr_name, xattr->name + XATTR_SECURITY_PREFIX_LEN, strlen(req_xattr_name)) == 0) { found = 1; break; } } return found; } int evm_protected_xattr(const char *req_xattr_name) { return evm_protected_xattr_common(req_xattr_name, false); } int evm_protected_xattr_if_enabled(const char *req_xattr_name) { return evm_protected_xattr_common(req_xattr_name, true); } /** * evm_read_protected_xattrs - read EVM protected xattr names, lengths, values * @dentry: dentry of the read xattrs * @buffer: buffer xattr names, lengths or values are copied to * @buffer_size: size of buffer * @type: n: names, l: lengths, v: values * @canonical_fmt: data format (true: little endian, false: native format) * * Read protected xattr names (separated by |), lengths (u32) or values for a * given dentry and return the total size of copied data. If buffer is NULL, * just return the total size. * * Returns the total size on success, a negative value on error. */ int evm_read_protected_xattrs(struct dentry *dentry, u8 *buffer, int buffer_size, char type, bool canonical_fmt) { struct xattr_list *xattr; int rc, size, total_size = 0; list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) { rc = __vfs_getxattr(dentry, d_backing_inode(dentry), xattr->name, NULL, 0); if (rc < 0 && rc == -ENODATA) continue; else if (rc < 0) return rc; switch (type) { case 'n': size = strlen(xattr->name) + 1; if (buffer) { if (total_size) *(buffer + total_size - 1) = '|'; memcpy(buffer + total_size, xattr->name, size); } break; case 'l': size = sizeof(u32); if (buffer) { if (canonical_fmt) rc = (__force int)cpu_to_le32(rc); *(u32 *)(buffer + total_size) = rc; } break; case 'v': size = rc; if (buffer) { rc = __vfs_getxattr(dentry, d_backing_inode(dentry), xattr->name, buffer + total_size, buffer_size - total_size); if (rc < 0) return rc; } break; default: return -EINVAL; } total_size += size; } return total_size; } /** * evm_verifyxattr - verify the integrity of the requested xattr * @dentry: object of the verify xattr * @xattr_name: requested xattr * @xattr_value: requested xattr value * @xattr_value_len: requested xattr value length * * Calculate the HMAC for the given dentry and verify it against the stored * security.evm xattr. For performance, use the xattr value and length * previously retrieved to calculate the HMAC. * * Returns the xattr integrity status. * * This function requires the caller to lock the inode's i_mutex before it * is executed. */ enum integrity_status evm_verifyxattr(struct dentry *dentry, const char *xattr_name, void *xattr_value, size_t xattr_value_len) { if (!evm_key_loaded() || !evm_protected_xattr(xattr_name)) return INTEGRITY_UNKNOWN; return evm_verify_hmac(dentry, xattr_name, xattr_value, xattr_value_len); } EXPORT_SYMBOL_GPL(evm_verifyxattr); /* * evm_verify_current_integrity - verify the dentry's metadata integrity * @dentry: pointer to the affected dentry * * Verify and return the dentry's metadata integrity. The exceptions are * before EVM is initialized or in 'fix' mode. */ static enum integrity_status evm_verify_current_integrity(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); if (!evm_key_loaded() || !S_ISREG(inode->i_mode) || evm_fixmode) return INTEGRITY_PASS; return evm_verify_hmac(dentry, NULL, NULL, 0); } /* * evm_xattr_change - check if passed xattr value differs from current value * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: requested xattr * @xattr_value: requested xattr value * @xattr_value_len: requested xattr value length * * Check if passed xattr value differs from current value. * * Returns 1 if passed xattr value differs from current value, 0 otherwise. */ static int evm_xattr_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { char *xattr_data = NULL; int rc = 0; rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, xattr_name, &xattr_data, 0, GFP_NOFS); if (rc < 0) { rc = 1; goto out; } if (rc == xattr_value_len) rc = !!memcmp(xattr_value, xattr_data, rc); else rc = 1; out: kfree(xattr_data); return rc; } /* * evm_protect_xattr - protect the EVM extended attribute * * Prevent security.evm from being modified or removed without the * necessary permissions or when the existing value is invalid. * * The posix xattr acls are 'system' prefixed, which normally would not * affect security.evm. An interesting side affect of writing posix xattr * acls is their modifying of the i_mode, which is included in security.evm. * For posix xattr acls only, permit security.evm, even if it currently * doesn't exist, to be updated unless the EVM signature is immutable. */ static int evm_protect_xattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { enum integrity_status evm_status; if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (is_unsupported_hmac_fs(dentry)) return -EPERM; } else if (!evm_protected_xattr(xattr_name)) { if (!posix_xattr_acl(xattr_name)) return 0; if (is_unsupported_hmac_fs(dentry)) return 0; evm_status = evm_verify_current_integrity(dentry); if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS)) return 0; goto out; } else if (is_unsupported_hmac_fs(dentry)) return 0; evm_status = evm_verify_current_integrity(dentry); if (evm_status == INTEGRITY_NOXATTRS) { struct evm_iint_cache *iint; /* Exception if the HMAC is not going to be calculated. */ if (evm_hmac_disabled()) return 0; iint = evm_iint_inode(d_backing_inode(dentry)); if (iint && (iint->flags & EVM_NEW_FILE)) return 0; /* exception for pseudo filesystems */ if (dentry->d_sb->s_magic == TMPFS_MAGIC || dentry->d_sb->s_magic == SYSFS_MAGIC) return 0; integrity_audit_msg(AUDIT_INTEGRITY_METADATA, dentry->d_inode, dentry->d_name.name, "update_metadata", integrity_status_msg[evm_status], -EPERM, 0); } out: /* Exception if the HMAC is not going to be calculated. */ if (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL || evm_status == INTEGRITY_UNKNOWN)) return 0; /* * Writing other xattrs is safe for portable signatures, as portable * signatures are immutable and can never be updated. */ if (evm_status == INTEGRITY_FAIL_IMMUTABLE) return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && !evm_xattr_change(idmap, dentry, xattr_name, xattr_value, xattr_value_len)) return 0; if (evm_status != INTEGRITY_PASS && evm_status != INTEGRITY_PASS_IMMUTABLE) integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); return evm_status == INTEGRITY_PASS ? 0 : -EPERM; } /** * evm_inode_setxattr - protect the EVM extended attribute * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * @xattr_value: pointer to the new extended attribute value * @xattr_value_len: pointer to the new extended attribute value length * @flags: flags to pass into filesystem operations * * Before allowing the 'security.evm' protected xattr to be updated, * verify the existing value is valid. As only the kernel should have * access to the EVM encrypted key needed to calculate the HMAC, prevent * userspace from writing HMAC value. Writing 'security.evm' requires * requires CAP_SYS_ADMIN privileges. */ static int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xattr_data = xattr_value; /* Policy permits modification of the protected xattrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) { if (!xattr_value_len) return -EINVAL; if (xattr_data->type != EVM_IMA_XATTR_DIGSIG && xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) return -EPERM; } return evm_protect_xattr(idmap, dentry, xattr_name, xattr_value, xattr_value_len); } /** * evm_inode_removexattr - protect the EVM extended attribute * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * * Removing 'security.evm' requires CAP_SYS_ADMIN privileges and that * the current value is valid. */ static int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { /* Policy permits modification of the protected xattrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; return evm_protect_xattr(idmap, dentry, xattr_name, NULL, 0); } #ifdef CONFIG_FS_POSIX_ACL static int evm_inode_set_acl_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *kacl) { int rc; umode_t mode; struct inode *inode = d_backing_inode(dentry); if (!kacl) return 1; rc = posix_acl_update_mode(idmap, inode, &mode, &kacl); if (rc || (inode->i_mode != mode)) return 1; return 0; } #else static inline int evm_inode_set_acl_change(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *kacl) { return 0; } #endif /** * evm_inode_set_acl - protect the EVM extended attribute from posix acls * @idmap: idmap of the idmapped mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * @kacl: pointer to the posix acls * * Prevent modifying posix acls causing the EVM HMAC to be re-calculated * and 'security.evm' xattr updated, unless the existing 'security.evm' is * valid. * * Return: zero on success, -EPERM on failure. */ static int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { enum integrity_status evm_status; /* Policy permits modification of the protected xattrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; evm_status = evm_verify_current_integrity(dentry); if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS)) return 0; /* Exception if the HMAC is not going to be calculated. */ if (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL || evm_status == INTEGRITY_UNKNOWN)) return 0; /* * Writing other xattrs is safe for portable signatures, as portable * signatures are immutable and can never be updated. */ if (evm_status == INTEGRITY_FAIL_IMMUTABLE) return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && !evm_inode_set_acl_change(idmap, dentry, acl_name, kacl)) return 0; if (evm_status != INTEGRITY_PASS_IMMUTABLE) integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); return -EPERM; } /** * evm_inode_remove_acl - Protect the EVM extended attribute from posix acls * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * * Prevent removing posix acls causing the EVM HMAC to be re-calculated * and 'security.evm' xattr updated, unless the existing 'security.evm' is * valid. * * Return: zero on success, -EPERM on failure. */ static int evm_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return evm_inode_set_acl(idmap, dentry, acl_name, NULL); } static void evm_reset_status(struct inode *inode) { struct evm_iint_cache *iint; iint = evm_iint_inode(inode); if (iint) iint->evm_status = INTEGRITY_UNKNOWN; } /** * evm_metadata_changed: Detect changes to the metadata * @inode: a file's inode * @metadata_inode: metadata inode * * On a stacked filesystem detect whether the metadata has changed. If this is * the case reset the evm_status associated with the inode that represents the * file. */ bool evm_metadata_changed(struct inode *inode, struct inode *metadata_inode) { struct evm_iint_cache *iint = evm_iint_inode(inode); bool ret = false; if (iint) { ret = (!IS_I_VERSION(metadata_inode) || integrity_inode_attrs_changed(&iint->metadata_inode, metadata_inode)); if (ret) iint->evm_status = INTEGRITY_UNKNOWN; } return ret; } /** * evm_revalidate_status - report whether EVM status re-validation is necessary * @xattr_name: pointer to the affected extended attribute name * * Report whether callers of evm_verifyxattr() should re-validate the * EVM status. * * Return true if re-validation is necessary, false otherwise. */ bool evm_revalidate_status(const char *xattr_name) { if (!evm_key_loaded()) return false; /* evm_inode_post_setattr() passes NULL */ if (!xattr_name) return true; if (!evm_protected_xattr(xattr_name) && !posix_xattr_acl(xattr_name) && strcmp(xattr_name, XATTR_NAME_EVM)) return false; return true; } /** * evm_inode_post_setxattr - update 'security.evm' to reflect the changes * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * @xattr_value: pointer to the new extended attribute value * @xattr_value_len: pointer to the new extended attribute value length * @flags: flags to pass into filesystem operations * * Update the HMAC stored in 'security.evm' to reflect the change. * * No need to take the i_mutex lock here, as this function is called from * __vfs_setxattr_noperm(). The caller of which has taken the inode's * i_mutex lock. */ static void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { if (!evm_revalidate_status(xattr_name)) return; evm_reset_status(dentry->d_inode); if (!strcmp(xattr_name, XATTR_NAME_EVM)) return; if (!(evm_initialized & EVM_INIT_HMAC)) return; if (is_unsupported_hmac_fs(dentry)) return; evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); } /** * evm_inode_post_set_acl - Update the EVM extended attribute from posix acls * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * @kacl: pointer to the posix acls * * Update the 'security.evm' xattr with the EVM HMAC re-calculated after setting * posix acls. */ static void evm_inode_post_set_acl(struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { return evm_inode_post_setxattr(dentry, acl_name, NULL, 0, 0); } /** * evm_inode_post_removexattr - update 'security.evm' after removing the xattr * @dentry: pointer to the affected dentry * @xattr_name: pointer to the affected extended attribute name * * Update the HMAC stored in 'security.evm' to reflect removal of the xattr. * * No need to take the i_mutex lock here, as this function is called from * vfs_removexattr() which takes the i_mutex. */ static void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name) { if (!evm_revalidate_status(xattr_name)) return; evm_reset_status(dentry->d_inode); if (!strcmp(xattr_name, XATTR_NAME_EVM)) return; if (!(evm_initialized & EVM_INIT_HMAC)) return; evm_update_evmxattr(dentry, xattr_name, NULL, 0); } /** * evm_inode_post_remove_acl - Update the EVM extended attribute from posix acls * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @acl_name: name of the posix acl * * Update the 'security.evm' xattr with the EVM HMAC re-calculated after * removing posix acls. */ static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { evm_inode_post_removexattr(dentry, acl_name); } static int evm_attr_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_backing_inode(dentry); unsigned int ia_valid = attr->ia_valid; if (!i_uid_needs_update(idmap, attr, inode) && !i_gid_needs_update(idmap, attr, inode) && (!(ia_valid & ATTR_MODE) || attr->ia_mode == inode->i_mode)) return 0; return 1; } /** * evm_inode_setattr - prevent updating an invalid EVM extended attribute * @idmap: idmap of the mount * @dentry: pointer to the affected dentry * @attr: iattr structure containing the new file attributes * * Permit update of file attributes when files have a valid EVM signature, * except in the case of them having an immutable portable signature. */ static int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; enum integrity_status evm_status; /* Policy permits modification of the protected attrs even though * there's no HMAC key loaded */ if (evm_initialized & EVM_ALLOW_METADATA_WRITES) return 0; if (is_unsupported_hmac_fs(dentry)) return 0; if (!(ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) return 0; evm_status = evm_verify_current_integrity(dentry); /* * Writing attrs is safe for portable signatures, as portable signatures * are immutable and can never be updated. */ if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS) || (evm_status == INTEGRITY_FAIL_IMMUTABLE) || (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL || evm_status == INTEGRITY_UNKNOWN))) return 0; if (evm_status == INTEGRITY_PASS_IMMUTABLE && !evm_attr_change(idmap, dentry, attr)) return 0; integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); return -EPERM; } /** * evm_inode_post_setattr - update 'security.evm' after modifying metadata * @idmap: idmap of the idmapped mount * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * For now, update the HMAC stored in 'security.evm' to reflect UID/GID * changes. * * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ static void evm_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { if (!evm_revalidate_status(NULL)) return; evm_reset_status(dentry->d_inode); if (!(evm_initialized & EVM_INIT_HMAC)) return; if (is_unsupported_hmac_fs(dentry)) return; if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) evm_update_evmxattr(dentry, NULL, NULL, 0); } static int evm_inode_copy_up_xattr(struct dentry *src, const char *name) { struct evm_ima_xattr_data *xattr_data = NULL; int rc; if (strcmp(name, XATTR_NAME_EVM) != 0) return -EOPNOTSUPP; /* first need to know the sig type */ rc = vfs_getxattr_alloc(&nop_mnt_idmap, src, XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0) return -EPERM; if (rc < offsetof(struct evm_ima_xattr_data, type) + sizeof(xattr_data->type)) return -EPERM; switch (xattr_data->type) { case EVM_XATTR_PORTABLE_DIGSIG: rc = 0; /* allow copy-up */ break; case EVM_XATTR_HMAC: case EVM_IMA_XATTR_DIGSIG: default: rc = -ECANCELED; /* discard */ } kfree(xattr_data); return rc; } /* * evm_inode_init_security - initializes security.evm HMAC value */ int evm_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count) { struct evm_xattr *xattr_data; struct xattr *xattr, *evm_xattr; bool evm_protected_xattrs = false; int rc; if (!(evm_initialized & EVM_INIT_HMAC) || !xattrs) return 0; /* * security_inode_init_security() makes sure that the xattrs array is * contiguous, there is enough space for security.evm, and that there is * a terminator at the end of the array. */ for (xattr = xattrs; xattr->name; xattr++) { if (evm_protected_xattr(xattr->name)) evm_protected_xattrs = true; } /* EVM xattr not needed. */ if (!evm_protected_xattrs) return 0; evm_xattr = lsm_get_xattr_slot(xattrs, xattr_count); /* * Array terminator (xattr name = NULL) must be the first non-filled * xattr slot. */ WARN_ONCE(evm_xattr != xattr, "%s: xattrs terminator is not the first non-filled slot\n", __func__); xattr_data = kzalloc(sizeof(*xattr_data), GFP_NOFS); if (!xattr_data) return -ENOMEM; xattr_data->data.type = EVM_XATTR_HMAC; rc = evm_init_hmac(inode, xattrs, xattr_data->digest); if (rc < 0) goto out; evm_xattr->value = xattr_data; evm_xattr->value_len = sizeof(*xattr_data); evm_xattr->name = XATTR_EVM_SUFFIX; return 0; out: kfree(xattr_data); return rc; } EXPORT_SYMBOL_GPL(evm_inode_init_security); static int evm_inode_alloc_security(struct inode *inode) { struct evm_iint_cache *iint = evm_iint_inode(inode); /* Called by security_inode_alloc(), it cannot be NULL. */ iint->flags = 0UL; iint->evm_status = INTEGRITY_UNKNOWN; return 0; } static void evm_file_release(struct file *file) { struct inode *inode = file_inode(file); struct evm_iint_cache *iint = evm_iint_inode(inode); fmode_t mode = file->f_mode; if (!S_ISREG(inode->i_mode) || !(mode & FMODE_WRITE)) return; if (iint && iint->flags & EVM_NEW_FILE && atomic_read(&inode->i_writecount) == 1) iint->flags &= ~EVM_NEW_FILE; } static void evm_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); struct evm_iint_cache *iint = evm_iint_inode(inode); if (!S_ISREG(inode->i_mode)) return; if (iint) iint->flags |= EVM_NEW_FILE; } #ifdef CONFIG_EVM_LOAD_X509 void __init evm_load_x509(void) { int rc; rc = integrity_load_x509(INTEGRITY_KEYRING_EVM, CONFIG_EVM_X509_PATH); if (!rc) evm_initialized |= EVM_INIT_X509; } #endif static int __init init_evm(void) { int error; struct list_head *pos, *q; evm_init_config(); error = integrity_init_keyring(INTEGRITY_KEYRING_EVM); if (error) goto error; error = evm_init_secfs(); if (error < 0) { pr_info("Error registering secfs\n"); goto error; } error: if (error != 0) { if (!list_empty(&evm_config_xattrnames)) { list_for_each_safe(pos, q, &evm_config_xattrnames) list_del(pos); } } return error; } static struct security_hook_list evm_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_setattr, evm_inode_setattr), LSM_HOOK_INIT(inode_post_setattr, evm_inode_post_setattr), LSM_HOOK_INIT(inode_copy_up_xattr, evm_inode_copy_up_xattr), LSM_HOOK_INIT(inode_setxattr, evm_inode_setxattr), LSM_HOOK_INIT(inode_post_setxattr, evm_inode_post_setxattr), LSM_HOOK_INIT(inode_set_acl, evm_inode_set_acl), LSM_HOOK_INIT(inode_post_set_acl, evm_inode_post_set_acl), LSM_HOOK_INIT(inode_remove_acl, evm_inode_remove_acl), LSM_HOOK_INIT(inode_post_remove_acl, evm_inode_post_remove_acl), LSM_HOOK_INIT(inode_removexattr, evm_inode_removexattr), LSM_HOOK_INIT(inode_post_removexattr, evm_inode_post_removexattr), LSM_HOOK_INIT(inode_init_security, evm_inode_init_security), LSM_HOOK_INIT(inode_alloc_security, evm_inode_alloc_security), LSM_HOOK_INIT(file_release, evm_file_release), LSM_HOOK_INIT(path_post_mknod, evm_post_path_mknod), }; static const struct lsm_id evm_lsmid = { .name = "evm", .id = LSM_ID_EVM, }; static int __init init_evm_lsm(void) { security_add_hooks(evm_hooks, ARRAY_SIZE(evm_hooks), &evm_lsmid); return 0; } struct lsm_blob_sizes evm_blob_sizes __ro_after_init = { .lbs_inode = sizeof(struct evm_iint_cache), .lbs_xattr_count = 1, }; DEFINE_LSM(evm) = { .name = "evm", .init = init_evm_lsm, .order = LSM_ORDER_LAST, .blobs = &evm_blob_sizes, }; late_initcall(init_evm);
18 21 21 21 16 16 12 10 12 10 14 14 14 14 14 1 14 16 14 14 16 16 2 16 16 21 21 21 21 15 13 21 21 21 21 21 21 16 20 20 20 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 // SPDX-License-Identifier: GPL-2.0-only #undef DEBUG /* * ARM performance counter support. * * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles * Copyright (C) 2010 ARM Ltd., Will Deacon <will.deacon@arm.com> * * This code is based on the sparc64 perf event code, which is in turn based * on the x86 code. */ #define pr_fmt(fmt) "hw perfevents: " fmt #include <linux/bitmap.h> #include <linux/cpumask.h> #include <linux/cpu_pm.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/perf/arm_pmu.h> #include <linux/slab.h> #include <linux/sched/clock.h> #include <linux/spinlock.h> #include <linux/irq.h> #include <linux/irqdesc.h> #include <asm/irq_regs.h> static int armpmu_count_irq_users(const int irq); struct pmu_irq_ops { void (*enable_pmuirq)(unsigned int irq); void (*disable_pmuirq)(unsigned int irq); void (*free_pmuirq)(unsigned int irq, int cpu, void __percpu *devid); }; static void armpmu_free_pmuirq(unsigned int irq, int cpu, void __percpu *devid) { free_irq(irq, per_cpu_ptr(devid, cpu)); } static const struct pmu_irq_ops pmuirq_ops = { .enable_pmuirq = enable_irq, .disable_pmuirq = disable_irq_nosync, .free_pmuirq = armpmu_free_pmuirq }; static void armpmu_free_pmunmi(unsigned int irq, int cpu, void __percpu *devid) { free_nmi(irq, per_cpu_ptr(devid, cpu)); } static const struct pmu_irq_ops pmunmi_ops = { .enable_pmuirq = enable_nmi, .disable_pmuirq = disable_nmi_nosync, .free_pmuirq = armpmu_free_pmunmi }; static void armpmu_enable_percpu_pmuirq(unsigned int irq) { enable_percpu_irq(irq, IRQ_TYPE_NONE); } static void armpmu_free_percpu_pmuirq(unsigned int irq, int cpu, void __percpu *devid) { if (armpmu_count_irq_users(irq) == 1) free_percpu_irq(irq, devid); } static const struct pmu_irq_ops percpu_pmuirq_ops = { .enable_pmuirq = armpmu_enable_percpu_pmuirq, .disable_pmuirq = disable_percpu_irq, .free_pmuirq = armpmu_free_percpu_pmuirq }; static void armpmu_enable_percpu_pmunmi(unsigned int irq) { if (!prepare_percpu_nmi(irq)) enable_percpu_nmi(irq, IRQ_TYPE_NONE); } static void armpmu_disable_percpu_pmunmi(unsigned int irq) { disable_percpu_nmi(irq); teardown_percpu_nmi(irq); } static void armpmu_free_percpu_pmunmi(unsigned int irq, int cpu, void __percpu *devid) { if (armpmu_count_irq_users(irq) == 1) free_percpu_nmi(irq, devid); } static const struct pmu_irq_ops percpu_pmunmi_ops = { .enable_pmuirq = armpmu_enable_percpu_pmunmi, .disable_pmuirq = armpmu_disable_percpu_pmunmi, .free_pmuirq = armpmu_free_percpu_pmunmi }; static DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu); static DEFINE_PER_CPU(int, cpu_irq); static DEFINE_PER_CPU(const struct pmu_irq_ops *, cpu_irq_ops); static bool has_nmi; static inline u64 arm_pmu_event_max_period(struct perf_event *event) { if (event->hw.flags & ARMPMU_EVT_64BIT) return GENMASK_ULL(63, 0); else if (event->hw.flags & ARMPMU_EVT_63BIT) return GENMASK_ULL(62, 0); else if (event->hw.flags & ARMPMU_EVT_47BIT) return GENMASK_ULL(46, 0); else return GENMASK_ULL(31, 0); } static int armpmu_map_cache_event(const unsigned (*cache_map) [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX], u64 config) { unsigned int cache_type, cache_op, cache_result, ret; cache_type = (config >> 0) & 0xff; if (cache_type >= PERF_COUNT_HW_CACHE_MAX) return -EINVAL; cache_op = (config >> 8) & 0xff; if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) return -EINVAL; cache_result = (config >> 16) & 0xff; if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) return -EINVAL; if (!cache_map) return -ENOENT; ret = (int)(*cache_map)[cache_type][cache_op][cache_result]; if (ret == CACHE_OP_UNSUPPORTED) return -ENOENT; return ret; } static int armpmu_map_hw_event(const unsigned (*event_map)[PERF_COUNT_HW_MAX], u64 config) { int mapping; if (config >= PERF_COUNT_HW_MAX) return -EINVAL; if (!event_map) return -ENOENT; mapping = (*event_map)[config]; return mapping == HW_OP_UNSUPPORTED ? -ENOENT : mapping; } static int armpmu_map_raw_event(u32 raw_event_mask, u64 config) { return (int)(config & raw_event_mask); } int armpmu_map_event(struct perf_event *event, const unsigned (*event_map)[PERF_COUNT_HW_MAX], const unsigned (*cache_map) [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX], u32 raw_event_mask) { u64 config = event->attr.config; int type = event->attr.type; if (type == event->pmu->type) return armpmu_map_raw_event(raw_event_mask, config); switch (type) { case PERF_TYPE_HARDWARE: return armpmu_map_hw_event(event_map, config); case PERF_TYPE_HW_CACHE: return armpmu_map_cache_event(cache_map, config); case PERF_TYPE_RAW: return armpmu_map_raw_event(raw_event_mask, config); } return -ENOENT; } int armpmu_event_set_period(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; u64 max_period; int ret = 0; max_period = arm_pmu_event_max_period(event); if (unlikely(left <= -period)) { left = period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } /* * Limit the maximum period to prevent the counter value * from overtaking the one we are about to program. In * effect we are reducing max_period to account for * interrupt latency (and we are being very conservative). */ if (left > (max_period >> 1)) left = (max_period >> 1); local64_set(&hwc->prev_count, (u64)-left); armpmu->write_counter(event, (u64)(-left) & max_period); perf_event_update_userpage(event); return ret; } u64 armpmu_event_update(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; u64 delta, prev_raw_count, new_raw_count; u64 max_period = arm_pmu_event_max_period(event); again: prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = armpmu->read_counter(event); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; delta = (new_raw_count - prev_raw_count) & max_period; local64_add(delta, &event->count); local64_sub(delta, &hwc->period_left); return new_raw_count; } static void armpmu_read(struct perf_event *event) { armpmu_event_update(event); } static void armpmu_stop(struct perf_event *event, int flags) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; /* * ARM pmu always has to update the counter, so ignore * PERF_EF_UPDATE, see comments in armpmu_start(). */ if (!(hwc->state & PERF_HES_STOPPED)) { armpmu->disable(event); armpmu_event_update(event); hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; } } static void armpmu_start(struct perf_event *event, int flags) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; /* * ARM pmu always has to reprogram the period, so ignore * PERF_EF_RELOAD, see the comment below. */ if (flags & PERF_EF_RELOAD) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; /* * Set the period again. Some counters can't be stopped, so when we * were stopped we simply disabled the IRQ source and the counter * may have been left counting. If we don't do this step then we may * get an interrupt too soon or *way* too late if the overflow has * happened since disabling. */ armpmu_event_set_period(event); armpmu->enable(event); } static void armpmu_del(struct perf_event *event, int flags) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; armpmu_stop(event, PERF_EF_UPDATE); hw_events->events[idx] = NULL; armpmu->clear_event_idx(hw_events, event); perf_event_update_userpage(event); /* Clear the allocated counter */ hwc->idx = -1; } static int armpmu_add(struct perf_event *event, int flags) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); struct hw_perf_event *hwc = &event->hw; int idx; /* An event following a process won't be stopped earlier */ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return -ENOENT; /* If we don't have a space for the counter then finish early. */ idx = armpmu->get_event_idx(hw_events, event); if (idx < 0) return idx; /* The newly-allocated counter should be empty */ WARN_ON_ONCE(hw_events->events[idx]); event->hw.idx = idx; hw_events->events[idx] = event; hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; if (flags & PERF_EF_START) armpmu_start(event, PERF_EF_RELOAD); /* Propagate our changes to the userspace mapping. */ perf_event_update_userpage(event); return 0; } static int validate_event(struct pmu *pmu, struct pmu_hw_events *hw_events, struct perf_event *event) { struct arm_pmu *armpmu; if (is_software_event(event)) return 1; /* * Reject groups spanning multiple HW PMUs (e.g. CPU + CCI). The * core perf code won't check that the pmu->ctx == leader->ctx * until after pmu->event_init(event). */ if (event->pmu != pmu) return 0; if (event->state < PERF_EVENT_STATE_OFF) return 1; if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec) return 1; armpmu = to_arm_pmu(event->pmu); return armpmu->get_event_idx(hw_events, event) >= 0; } static int validate_group(struct perf_event *event) { struct perf_event *sibling, *leader = event->group_leader; struct pmu_hw_events fake_pmu; /* * Initialise the fake PMU. We only need to populate the * used_mask for the purposes of validation. */ memset(&fake_pmu.used_mask, 0, sizeof(fake_pmu.used_mask)); if (!validate_event(event->pmu, &fake_pmu, leader)) return -EINVAL; if (event == leader) return 0; for_each_sibling_event(sibling, leader) { if (!validate_event(event->pmu, &fake_pmu, sibling)) return -EINVAL; } if (!validate_event(event->pmu, &fake_pmu, event)) return -EINVAL; return 0; } static irqreturn_t armpmu_dispatch_irq(int irq, void *dev) { struct arm_pmu *armpmu; int ret; u64 start_clock, finish_clock; /* * we request the IRQ with a (possibly percpu) struct arm_pmu**, but * the handlers expect a struct arm_pmu*. The percpu_irq framework will * do any necessary shifting, we just need to perform the first * dereference. */ armpmu = *(void **)dev; if (WARN_ON_ONCE(!armpmu)) return IRQ_NONE; start_clock = sched_clock(); ret = armpmu->handle_irq(armpmu); finish_clock = sched_clock(); perf_sample_event_took(finish_clock - start_clock); return ret; } static int __hw_perf_event_init(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; int mapping, ret; hwc->flags = 0; mapping = armpmu->map_event(event); if (mapping < 0) { pr_debug("event %x:%llx not supported\n", event->attr.type, event->attr.config); return mapping; } /* * We don't assign an index until we actually place the event onto * hardware. Use -1 to signify that we haven't decided where to put it * yet. For SMP systems, each core has it's own PMU so we can't do any * clever allocation or constraints checking at this point. */ hwc->idx = -1; hwc->config_base = 0; hwc->config = 0; hwc->event_base = 0; /* * Check whether we need to exclude the counter from certain modes. */ if (armpmu->set_event_filter) { ret = armpmu->set_event_filter(hwc, &event->attr); if (ret) return ret; } /* * Store the event encoding into the config_base field. */ hwc->config_base |= (unsigned long)mapping; if (!is_sampling_event(event)) { /* * For non-sampling runs, limit the sample_period to half * of the counter width. That way, the new counter value * is far less likely to overtake the previous one unless * you have some serious IRQ latency issues. */ hwc->sample_period = arm_pmu_event_max_period(event) >> 1; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); } return validate_group(event); } static int armpmu_event_init(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); /* * Reject CPU-affine events for CPUs that are of a different class to * that which this PMU handles. Process-following events (where * event->cpu == -1) can be migrated between CPUs, and thus we have to * reject them later (in armpmu_add) if they're scheduled on a * different class of CPU. */ if (event->cpu != -1 && !cpumask_test_cpu(event->cpu, &armpmu->supported_cpus)) return -ENOENT; /* does not support taken branch sampling */ if (has_branch_stack(event)) return -EOPNOTSUPP; return __hw_perf_event_init(event); } static void armpmu_enable(struct pmu *pmu) { struct arm_pmu *armpmu = to_arm_pmu(pmu); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); bool enabled = !bitmap_empty(hw_events->used_mask, ARMPMU_MAX_HWEVENTS); /* For task-bound events we may be called on other CPUs */ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return; if (enabled) armpmu->start(armpmu); } static void armpmu_disable(struct pmu *pmu) { struct arm_pmu *armpmu = to_arm_pmu(pmu); /* For task-bound events we may be called on other CPUs */ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return; armpmu->stop(armpmu); } /* * In heterogeneous systems, events are specific to a particular * microarchitecture, and aren't suitable for another. Thus, only match CPUs of * the same microarchitecture. */ static bool armpmu_filter(struct pmu *pmu, int cpu) { struct arm_pmu *armpmu = to_arm_pmu(pmu); return !cpumask_test_cpu(cpu, &armpmu->supported_cpus); } static ssize_t cpus_show(struct device *dev, struct device_attribute *attr, char *buf) { struct arm_pmu *armpmu = to_arm_pmu(dev_get_drvdata(dev)); return cpumap_print_to_pagebuf(true, buf, &armpmu->supported_cpus); } static DEVICE_ATTR_RO(cpus); static struct attribute *armpmu_common_attrs[] = { &dev_attr_cpus.attr, NULL, }; static const struct attribute_group armpmu_common_attr_group = { .attrs = armpmu_common_attrs, }; static int armpmu_count_irq_users(const int irq) { int cpu, count = 0; for_each_possible_cpu(cpu) { if (per_cpu(cpu_irq, cpu) == irq) count++; } return count; } static const struct pmu_irq_ops *armpmu_find_irq_ops(int irq) { const struct pmu_irq_ops *ops = NULL; int cpu; for_each_possible_cpu(cpu) { if (per_cpu(cpu_irq, cpu) != irq) continue; ops = per_cpu(cpu_irq_ops, cpu); if (ops) break; } return ops; } void armpmu_free_irq(int irq, int cpu) { if (per_cpu(cpu_irq, cpu) == 0) return; if (WARN_ON(irq != per_cpu(cpu_irq, cpu))) return; per_cpu(cpu_irq_ops, cpu)->free_pmuirq(irq, cpu, &cpu_armpmu); per_cpu(cpu_irq, cpu) = 0; per_cpu(cpu_irq_ops, cpu) = NULL; } int armpmu_request_irq(int irq, int cpu) { int err = 0; const irq_handler_t handler = armpmu_dispatch_irq; const struct pmu_irq_ops *irq_ops; if (!irq) return 0; if (!irq_is_percpu_devid(irq)) { unsigned long irq_flags; err = irq_force_affinity(irq, cpumask_of(cpu)); if (err && num_possible_cpus() > 1) { pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n", irq, cpu); goto err_out; } irq_flags = IRQF_PERCPU | IRQF_NOBALANCING | IRQF_NO_AUTOEN | IRQF_NO_THREAD; err = request_nmi(irq, handler, irq_flags, "arm-pmu", per_cpu_ptr(&cpu_armpmu, cpu)); /* If cannot get an NMI, get a normal interrupt */ if (err) { err = request_irq(irq, handler, irq_flags, "arm-pmu", per_cpu_ptr(&cpu_armpmu, cpu)); irq_ops = &pmuirq_ops; } else { has_nmi = true; irq_ops = &pmunmi_ops; } } else if (armpmu_count_irq_users(irq) == 0) { err = request_percpu_nmi(irq, handler, "arm-pmu", &cpu_armpmu); /* If cannot get an NMI, get a normal interrupt */ if (err) { err = request_percpu_irq(irq, handler, "arm-pmu", &cpu_armpmu); irq_ops = &percpu_pmuirq_ops; } else { has_nmi = true; irq_ops = &percpu_pmunmi_ops; } } else { /* Per cpudevid irq was already requested by another CPU */ irq_ops = armpmu_find_irq_ops(irq); if (WARN_ON(!irq_ops)) err = -EINVAL; } if (err) goto err_out; per_cpu(cpu_irq, cpu) = irq; per_cpu(cpu_irq_ops, cpu) = irq_ops; return 0; err_out: pr_err("unable to request IRQ%d for ARM PMU counters\n", irq); return err; } static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu) { struct pmu_hw_events __percpu *hw_events = pmu->hw_events; return per_cpu(hw_events->irq, cpu); } bool arm_pmu_irq_is_nmi(void) { return has_nmi; } /* * PMU hardware loses all context when a CPU goes offline. * When a CPU is hotplugged back in, since some hardware registers are * UNKNOWN at reset, the PMU must be explicitly reset to avoid reading * junk values out of them. */ static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node) { struct arm_pmu *pmu = hlist_entry_safe(node, struct arm_pmu, node); int irq; if (!cpumask_test_cpu(cpu, &pmu->supported_cpus)) return 0; if (pmu->reset) pmu->reset(pmu); per_cpu(cpu_armpmu, cpu) = pmu; irq = armpmu_get_cpu_irq(pmu, cpu); if (irq) per_cpu(cpu_irq_ops, cpu)->enable_pmuirq(irq); return 0; } static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node) { struct arm_pmu *pmu = hlist_entry_safe(node, struct arm_pmu, node); int irq; if (!cpumask_test_cpu(cpu, &pmu->supported_cpus)) return 0; irq = armpmu_get_cpu_irq(pmu, cpu); if (irq) per_cpu(cpu_irq_ops, cpu)->disable_pmuirq(irq); per_cpu(cpu_armpmu, cpu) = NULL; return 0; } #ifdef CONFIG_CPU_PM static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd) { struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); struct perf_event *event; int idx; for_each_set_bit(idx, armpmu->cntr_mask, ARMPMU_MAX_HWEVENTS) { event = hw_events->events[idx]; if (!event) continue; switch (cmd) { case CPU_PM_ENTER: /* * Stop and update the counter */ armpmu_stop(event, PERF_EF_UPDATE); break; case CPU_PM_EXIT: case CPU_PM_ENTER_FAILED: /* * Restore and enable the counter. */ armpmu_start(event, PERF_EF_RELOAD); break; default: break; } } } static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd, void *v) { struct arm_pmu *armpmu = container_of(b, struct arm_pmu, cpu_pm_nb); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); bool enabled = !bitmap_empty(hw_events->used_mask, ARMPMU_MAX_HWEVENTS); if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return NOTIFY_DONE; /* * Always reset the PMU registers on power-up even if * there are no events running. */ if (cmd == CPU_PM_EXIT && armpmu->reset) armpmu->reset(armpmu); if (!enabled) return NOTIFY_OK; switch (cmd) { case CPU_PM_ENTER: armpmu->stop(armpmu); cpu_pm_pmu_setup(armpmu, cmd); break; case CPU_PM_EXIT: case CPU_PM_ENTER_FAILED: cpu_pm_pmu_setup(armpmu, cmd); armpmu->start(armpmu); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu) { cpu_pmu->cpu_pm_nb.notifier_call = cpu_pm_pmu_notify; return cpu_pm_register_notifier(&cpu_pmu->cpu_pm_nb); } static void cpu_pm_pmu_unregister(struct arm_pmu *cpu_pmu) { cpu_pm_unregister_notifier(&cpu_pmu->cpu_pm_nb); } #else static inline int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu) { return 0; } static inline void cpu_pm_pmu_unregister(struct arm_pmu *cpu_pmu) { } #endif static int cpu_pmu_init(struct arm_pmu *cpu_pmu) { int err; err = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_STARTING, &cpu_pmu->node); if (err) goto out; err = cpu_pm_pmu_register(cpu_pmu); if (err) goto out_unregister; return 0; out_unregister: cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING, &cpu_pmu->node); out: return err; } static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu) { cpu_pm_pmu_unregister(cpu_pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING, &cpu_pmu->node); } struct arm_pmu *armpmu_alloc(void) { struct arm_pmu *pmu; int cpu; pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); if (!pmu) goto out; pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, GFP_KERNEL); if (!pmu->hw_events) { pr_info("failed to allocate per-cpu PMU data.\n"); goto out_free_pmu; } pmu->pmu = (struct pmu) { .pmu_enable = armpmu_enable, .pmu_disable = armpmu_disable, .event_init = armpmu_event_init, .add = armpmu_add, .del = armpmu_del, .start = armpmu_start, .stop = armpmu_stop, .read = armpmu_read, .filter = armpmu_filter, .attr_groups = pmu->attr_groups, /* * This is a CPU PMU potentially in a heterogeneous * configuration (e.g. big.LITTLE) so * PERF_PMU_CAP_EXTENDED_HW_TYPE is required to open * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE events on a * specific PMU. */ .capabilities = PERF_PMU_CAP_EXTENDED_REGS | PERF_PMU_CAP_EXTENDED_HW_TYPE, }; pmu->attr_groups[ARMPMU_ATTR_GROUP_COMMON] = &armpmu_common_attr_group; for_each_possible_cpu(cpu) { struct pmu_hw_events *events; events = per_cpu_ptr(pmu->hw_events, cpu); events->percpu_pmu = pmu; } return pmu; out_free_pmu: kfree(pmu); out: return NULL; } void armpmu_free(struct arm_pmu *pmu) { free_percpu(pmu->hw_events); kfree(pmu); } int armpmu_register(struct arm_pmu *pmu) { int ret; ret = cpu_pmu_init(pmu); if (ret) return ret; if (!pmu->set_event_filter) pmu->pmu.capabilities |= PERF_PMU_CAP_NO_EXCLUDE; ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); if (ret) goto out_destroy; pr_info("enabled with %s PMU driver, %d (%*pb) counters available%s\n", pmu->name, bitmap_weight(pmu->cntr_mask, ARMPMU_MAX_HWEVENTS), ARMPMU_MAX_HWEVENTS, &pmu->cntr_mask, has_nmi ? ", using NMIs" : ""); kvm_host_pmu_init(pmu); return 0; out_destroy: cpu_pmu_destroy(pmu); return ret; } static int arm_pmu_hp_init(void) { int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_STARTING, "perf/arm/pmu:starting", arm_perf_starting_cpu, arm_perf_teardown_cpu); if (ret) pr_err("CPU hotplug notifier for ARM PMU could not be registered: %d\n", ret); return ret; } subsys_initcall(arm_pmu_hp_init);
296 296 295 296 4 4 4 296 296 295 295 4 26 26 26 26 26 26 26 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013-2017 ARM Limited, All Rights Reserved. * Author: Marc Zyngier <marc.zyngier@arm.com> */ #define pr_fmt(fmt) "GICv3: " fmt #include <linux/acpi.h> #include <linux/cpu.h> #include <linux/cpu_pm.h> #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/irqdomain.h> #include <linux/kernel.h> #include <linux/kstrtox.h> #include <linux/of.h> #include <linux/of_address.h> #include <linux/of_irq.h> #include <linux/percpu.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/iopoll.h> #include <linux/irqchip.h> #include <linux/irqchip/arm-gic-common.h> #include <linux/irqchip/arm-gic-v3.h> #include <linux/irqchip/arm-gic-v3-prio.h> #include <linux/irqchip/irq-partition-percpu.h> #include <linux/bitfield.h> #include <linux/bits.h> #include <linux/arm-smccc.h> #include <asm/cputype.h> #include <asm/exception.h> #include <asm/smp_plat.h> #include <asm/virt.h> #include "irq-gic-common.h" static u8 dist_prio_irq __ro_after_init = GICV3_PRIO_IRQ; static u8 dist_prio_nmi __ro_after_init = GICV3_PRIO_NMI; #define FLAGS_WORKAROUND_GICR_WAKER_MSM8996 (1ULL << 0) #define FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539 (1ULL << 1) #define FLAGS_WORKAROUND_ASR_ERRATUM_8601001 (1ULL << 2) #define FLAGS_WORKAROUND_INSECURE (1ULL << 3) #define GIC_IRQ_TYPE_PARTITION (GIC_IRQ_TYPE_LPI + 1) static struct cpumask broken_rdists __read_mostly __maybe_unused; struct redist_region { void __iomem *redist_base; phys_addr_t phys_base; bool single_redist; }; struct gic_chip_data { struct fwnode_handle *fwnode; phys_addr_t dist_phys_base; void __iomem *dist_base; struct redist_region *redist_regions; struct rdists rdists; struct irq_domain *domain; u64 redist_stride; u32 nr_redist_regions; u64 flags; bool has_rss; unsigned int ppi_nr; struct partition_desc **ppi_descs; }; #define T241_CHIPS_MAX 4 static void __iomem *t241_dist_base_alias[T241_CHIPS_MAX] __read_mostly; static DEFINE_STATIC_KEY_FALSE(gic_nvidia_t241_erratum); static DEFINE_STATIC_KEY_FALSE(gic_arm64_2941627_erratum); static struct gic_chip_data gic_data __read_mostly; static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key); #define GIC_ID_NR (1U << GICD_TYPER_ID_BITS(gic_data.rdists.gicd_typer)) #define GIC_LINE_NR min(GICD_TYPER_SPIS(gic_data.rdists.gicd_typer), 1020U) #define GIC_ESPI_NR GICD_TYPER_ESPIS(gic_data.rdists.gicd_typer) static bool nmi_support_forbidden; /* * There are 16 SGIs, though we only actually use 8 in Linux. The other 8 SGIs * are potentially stolen by the secure side. Some code, especially code dealing * with hwirq IDs, is simplified by accounting for all 16. */ #define SGI_NR 16 /* * The behaviours of RPR and PMR registers differ depending on the value of * SCR_EL3.FIQ, and the behaviour of non-secure priority registers of the * distributor and redistributors depends on whether security is enabled in the * GIC. * * When security is enabled, non-secure priority values from the (re)distributor * are presented to the GIC CPUIF as follow: * (GIC_(R)DIST_PRI[irq] >> 1) | 0x80; * * If SCR_EL3.FIQ == 1, the values written to/read from PMR and RPR at non-secure * EL1 are subject to a similar operation thus matching the priorities presented * from the (re)distributor when security is enabled. When SCR_EL3.FIQ == 0, * these values are unchanged by the GIC. * * see GICv3/GICv4 Architecture Specification (IHI0069D): * - section 4.8.1 Non-secure accesses to register fields for Secure interrupt * priorities. * - Figure 4-7 Secure read of the priority field for a Non-secure Group 1 * interrupt. */ static DEFINE_STATIC_KEY_FALSE(supports_pseudo_nmis); static u32 gic_get_pribits(void) { u32 pribits; pribits = gic_read_ctlr(); pribits &= ICC_CTLR_EL1_PRI_BITS_MASK; pribits >>= ICC_CTLR_EL1_PRI_BITS_SHIFT; pribits++; return pribits; } static bool gic_has_group0(void) { u32 val; u32 old_pmr; old_pmr = gic_read_pmr(); /* * Let's find out if Group0 is under control of EL3 or not by * setting the highest possible, non-zero priority in PMR. * * If SCR_EL3.FIQ is set, the priority gets shifted down in * order for the CPU interface to set bit 7, and keep the * actual priority in the non-secure range. In the process, it * looses the least significant bit and the actual priority * becomes 0x80. Reading it back returns 0, indicating that * we're don't have access to Group0. */ gic_write_pmr(BIT(8 - gic_get_pribits())); val = gic_read_pmr(); gic_write_pmr(old_pmr); return val != 0; } static inline bool gic_dist_security_disabled(void) { return readl_relaxed(gic_data.dist_base + GICD_CTLR) & GICD_CTLR_DS; } static bool cpus_have_security_disabled __ro_after_init; static bool cpus_have_group0 __ro_after_init; static void __init gic_prio_init(void) { bool ds; cpus_have_group0 = gic_has_group0(); ds = gic_dist_security_disabled(); if ((gic_data.flags & FLAGS_WORKAROUND_INSECURE) && !ds) { if (cpus_have_group0) { u32 val; val = readl_relaxed(gic_data.dist_base + GICD_CTLR); val |= GICD_CTLR_DS; writel_relaxed(val, gic_data.dist_base + GICD_CTLR); ds = gic_dist_security_disabled(); if (ds) pr_warn("Broken GIC integration, security disabled\n"); } else { pr_warn("Broken GIC integration, pNMI forbidden\n"); nmi_support_forbidden = true; } } cpus_have_security_disabled = ds; /* * How priority values are used by the GIC depends on two things: * the security state of the GIC (controlled by the GICD_CTRL.DS bit) * and if Group 0 interrupts can be delivered to Linux in the non-secure * world as FIQs (controlled by the SCR_EL3.FIQ bit). These affect the * way priorities are presented in ICC_PMR_EL1 and in the distributor: * * GICD_CTRL.DS | SCR_EL3.FIQ | ICC_PMR_EL1 | Distributor * ------------------------------------------------------- * 1 | - | unchanged | unchanged * ------------------------------------------------------- * 0 | 1 | non-secure | non-secure * ------------------------------------------------------- * 0 | 0 | unchanged | non-secure * * In the non-secure view reads and writes are modified: * * - A value written is right-shifted by one and the MSB is set, * forcing the priority into the non-secure range. * * - A value read is left-shifted by one. * * In the first two cases, where ICC_PMR_EL1 and the interrupt priority * are both either modified or unchanged, we can use the same set of * priorities. * * In the last case, where only the interrupt priorities are modified to * be in the non-secure range, we program the non-secure values into * the distributor to match the PMR values we want. */ if (cpus_have_group0 && !cpus_have_security_disabled) { dist_prio_irq = __gicv3_prio_to_ns(dist_prio_irq); dist_prio_nmi = __gicv3_prio_to_ns(dist_prio_nmi); } pr_info("GICD_CTRL.DS=%d, SCR_EL3.FIQ=%d\n", cpus_have_security_disabled, !cpus_have_group0); } /* rdist_nmi_refs[n] == number of cpus having the rdist interrupt n set as NMI */ static refcount_t *rdist_nmi_refs; static struct gic_kvm_info gic_v3_kvm_info __initdata; static DEFINE_PER_CPU(bool, has_rss); #define MPIDR_RS(mpidr) (((mpidr) & 0xF0UL) >> 4) #define gic_data_rdist() (this_cpu_ptr(gic_data.rdists.rdist)) #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) #define gic_data_rdist_sgi_base() (gic_data_rdist_rd_base() + SZ_64K) /* Our default, arbitrary priority value. Linux only uses one anyway. */ #define DEFAULT_PMR_VALUE 0xf0 enum gic_intid_range { SGI_RANGE, PPI_RANGE, SPI_RANGE, EPPI_RANGE, ESPI_RANGE, LPI_RANGE, __INVALID_RANGE__ }; static enum gic_intid_range __get_intid_range(irq_hw_number_t hwirq) { switch (hwirq) { case 0 ... 15: return SGI_RANGE; case 16 ... 31: return PPI_RANGE; case 32 ... 1019: return SPI_RANGE; case EPPI_BASE_INTID ... (EPPI_BASE_INTID + 63): return EPPI_RANGE; case ESPI_BASE_INTID ... (ESPI_BASE_INTID + 1023): return ESPI_RANGE; case 8192 ... GENMASK(23, 0): return LPI_RANGE; default: return __INVALID_RANGE__; } } static enum gic_intid_range get_intid_range(struct irq_data *d) { return __get_intid_range(d->hwirq); } static inline bool gic_irq_in_rdist(struct irq_data *d) { switch (get_intid_range(d)) { case SGI_RANGE: case PPI_RANGE: case EPPI_RANGE: return true; default: return false; } } static inline void __iomem *gic_dist_base_alias(struct irq_data *d) { if (static_branch_unlikely(&gic_nvidia_t241_erratum)) { irq_hw_number_t hwirq = irqd_to_hwirq(d); u32 chip; /* * For the erratum T241-FABRIC-4, read accesses to GICD_In{E} * registers are directed to the chip that owns the SPI. The * the alias region can also be used for writes to the * GICD_In{E} except GICD_ICENABLERn. Each chip has support * for 320 {E}SPIs. Mappings for all 4 chips: * Chip0 = 32-351 * Chip1 = 352-671 * Chip2 = 672-991 * Chip3 = 4096-4415 */ switch (__get_intid_range(hwirq)) { case SPI_RANGE: chip = (hwirq - 32) / 320; break; case ESPI_RANGE: chip = 3; break; default: unreachable(); } return t241_dist_base_alias[chip]; } return gic_data.dist_base; } static inline void __iomem *gic_dist_base(struct irq_data *d) { switch (get_intid_range(d)) { case SGI_RANGE: case PPI_RANGE: case EPPI_RANGE: /* SGI+PPI -> SGI_base for this CPU */ return gic_data_rdist_sgi_base(); case SPI_RANGE: case ESPI_RANGE: /* SPI -> dist_base */ return gic_data.dist_base; default: return NULL; } } static void gic_do_wait_for_rwp(void __iomem *base, u32 bit) { u32 val; int ret; ret = readl_relaxed_poll_timeout_atomic(base + GICD_CTLR, val, !(val & bit), 1, USEC_PER_SEC); if (ret == -ETIMEDOUT) pr_err_ratelimited("RWP timeout, gone fishing\n"); } /* Wait for completion of a distributor change */ static void gic_dist_wait_for_rwp(void) { gic_do_wait_for_rwp(gic_data.dist_base, GICD_CTLR_RWP); } /* Wait for completion of a redistributor change */ static void gic_redist_wait_for_rwp(void) { gic_do_wait_for_rwp(gic_data_rdist_rd_base(), GICR_CTLR_RWP); } static void gic_enable_redist(bool enable) { void __iomem *rbase; u32 val; int ret; if (gic_data.flags & FLAGS_WORKAROUND_GICR_WAKER_MSM8996) return; rbase = gic_data_rdist_rd_base(); val = readl_relaxed(rbase + GICR_WAKER); if (enable) /* Wake up this CPU redistributor */ val &= ~GICR_WAKER_ProcessorSleep; else val |= GICR_WAKER_ProcessorSleep; writel_relaxed(val, rbase + GICR_WAKER); if (!enable) { /* Check that GICR_WAKER is writeable */ val = readl_relaxed(rbase + GICR_WAKER); if (!(val & GICR_WAKER_ProcessorSleep)) return; /* No PM support in this redistributor */ } ret = readl_relaxed_poll_timeout_atomic(rbase + GICR_WAKER, val, enable ^ (bool)(val & GICR_WAKER_ChildrenAsleep), 1, USEC_PER_SEC); if (ret == -ETIMEDOUT) { pr_err_ratelimited("redistributor failed to %s...\n", enable ? "wakeup" : "sleep"); } } /* * Routines to disable, enable, EOI and route interrupts */ static u32 convert_offset_index(struct irq_data *d, u32 offset, u32 *index) { switch (get_intid_range(d)) { case SGI_RANGE: case PPI_RANGE: case SPI_RANGE: *index = d->hwirq; return offset; case EPPI_RANGE: /* * Contrary to the ESPI range, the EPPI range is contiguous * to the PPI range in the registers, so let's adjust the * displacement accordingly. Consistency is overrated. */ *index = d->hwirq - EPPI_BASE_INTID + 32; return offset; case ESPI_RANGE: *index = d->hwirq - ESPI_BASE_INTID; switch (offset) { case GICD_ISENABLER: return GICD_ISENABLERnE; case GICD_ICENABLER: return GICD_ICENABLERnE; case GICD_ISPENDR: return GICD_ISPENDRnE; case GICD_ICPENDR: return GICD_ICPENDRnE; case GICD_ISACTIVER: return GICD_ISACTIVERnE; case GICD_ICACTIVER: return GICD_ICACTIVERnE; case GICD_IPRIORITYR: return GICD_IPRIORITYRnE; case GICD_ICFGR: return GICD_ICFGRnE; case GICD_IROUTER: return GICD_IROUTERnE; default: break; } break; default: break; } WARN_ON(1); *index = d->hwirq; return offset; } static int gic_peek_irq(struct irq_data *d, u32 offset) { void __iomem *base; u32 index, mask; offset = convert_offset_index(d, offset, &index); mask = 1 << (index % 32); if (gic_irq_in_rdist(d)) base = gic_data_rdist_sgi_base(); else base = gic_dist_base_alias(d); return !!(readl_relaxed(base + offset + (index / 32) * 4) & mask); } static void gic_poke_irq(struct irq_data *d, u32 offset) { void __iomem *base; u32 index, mask; offset = convert_offset_index(d, offset, &index); mask = 1 << (index % 32); if (gic_irq_in_rdist(d)) base = gic_data_rdist_sgi_base(); else base = gic_data.dist_base; writel_relaxed(mask, base + offset + (index / 32) * 4); } static void gic_mask_irq(struct irq_data *d) { gic_poke_irq(d, GICD_ICENABLER); if (gic_irq_in_rdist(d)) gic_redist_wait_for_rwp(); else gic_dist_wait_for_rwp(); } static void gic_eoimode1_mask_irq(struct irq_data *d) { gic_mask_irq(d); /* * When masking a forwarded interrupt, make sure it is * deactivated as well. * * This ensures that an interrupt that is getting * disabled/masked will not get "stuck", because there is * noone to deactivate it (guest is being terminated). */ if (irqd_is_forwarded_to_vcpu(d)) gic_poke_irq(d, GICD_ICACTIVER); } static void gic_unmask_irq(struct irq_data *d) { gic_poke_irq(d, GICD_ISENABLER); } static inline bool gic_supports_nmi(void) { return IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && static_branch_likely(&supports_pseudo_nmis); } static int gic_irq_set_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool val) { u32 reg; if (d->hwirq >= 8192) /* SGI/PPI/SPI only */ return -EINVAL; switch (which) { case IRQCHIP_STATE_PENDING: reg = val ? GICD_ISPENDR : GICD_ICPENDR; break; case IRQCHIP_STATE_ACTIVE: reg = val ? GICD_ISACTIVER : GICD_ICACTIVER; break; case IRQCHIP_STATE_MASKED: if (val) { gic_mask_irq(d); return 0; } reg = GICD_ISENABLER; break; default: return -EINVAL; } gic_poke_irq(d, reg); /* * Force read-back to guarantee that the active state has taken * effect, and won't race with a guest-driven deactivation. */ if (reg == GICD_ISACTIVER) gic_peek_irq(d, reg); return 0; } static int gic_irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *val) { if (d->hwirq >= 8192) /* PPI/SPI only */ return -EINVAL; switch (which) { case IRQCHIP_STATE_PENDING: *val = gic_peek_irq(d, GICD_ISPENDR); break; case IRQCHIP_STATE_ACTIVE: *val = gic_peek_irq(d, GICD_ISACTIVER); break; case IRQCHIP_STATE_MASKED: *val = !gic_peek_irq(d, GICD_ISENABLER); break; default: return -EINVAL; } return 0; } static void gic_irq_set_prio(struct irq_data *d, u8 prio) { void __iomem *base = gic_dist_base(d); u32 offset, index; offset = convert_offset_index(d, GICD_IPRIORITYR, &index); writeb_relaxed(prio, base + offset + index); } static u32 __gic_get_ppi_index(irq_hw_number_t hwirq) { switch (__get_intid_range(hwirq)) { case PPI_RANGE: return hwirq - 16; case EPPI_RANGE: return hwirq - EPPI_BASE_INTID + 16; default: unreachable(); } } static u32 __gic_get_rdist_index(irq_hw_number_t hwirq) { switch (__get_intid_range(hwirq)) { case SGI_RANGE: case PPI_RANGE: return hwirq; case EPPI_RANGE: return hwirq - EPPI_BASE_INTID + 32; default: unreachable(); } } static u32 gic_get_rdist_index(struct irq_data *d) { return __gic_get_rdist_index(d->hwirq); } static int gic_irq_nmi_setup(struct irq_data *d) { struct irq_desc *desc = irq_to_desc(d->irq); if (!gic_supports_nmi()) return -EINVAL; if (gic_peek_irq(d, GICD_ISENABLER)) { pr_err("Cannot set NMI property of enabled IRQ %u\n", d->irq); return -EINVAL; } /* * A secondary irq_chip should be in charge of LPI request, * it should not be possible to get there */ if (WARN_ON(irqd_to_hwirq(d) >= 8192)) return -EINVAL; /* desc lock should already be held */ if (gic_irq_in_rdist(d)) { u32 idx = gic_get_rdist_index(d); /* * Setting up a percpu interrupt as NMI, only switch handler * for first NMI */ if (!refcount_inc_not_zero(&rdist_nmi_refs[idx])) { refcount_set(&rdist_nmi_refs[idx], 1); desc->handle_irq = handle_percpu_devid_fasteoi_nmi; } } else { desc->handle_irq = handle_fasteoi_nmi; } gic_irq_set_prio(d, dist_prio_nmi); return 0; } static void gic_irq_nmi_teardown(struct irq_data *d) { struct irq_desc *desc = irq_to_desc(d->irq); if (WARN_ON(!gic_supports_nmi())) return; if (gic_peek_irq(d, GICD_ISENABLER)) { pr_err("Cannot set NMI property of enabled IRQ %u\n", d->irq); return; } /* * A secondary irq_chip should be in charge of LPI request, * it should not be possible to get there */ if (WARN_ON(irqd_to_hwirq(d) >= 8192)) return; /* desc lock should already be held */ if (gic_irq_in_rdist(d)) { u32 idx = gic_get_rdist_index(d); /* Tearing down NMI, only switch handler for last NMI */ if (refcount_dec_and_test(&rdist_nmi_refs[idx])) desc->handle_irq = handle_percpu_devid_irq; } else { desc->handle_irq = handle_fasteoi_irq; } gic_irq_set_prio(d, dist_prio_irq); } static bool gic_arm64_erratum_2941627_needed(struct irq_data *d) { enum gic_intid_range range; if (!static_branch_unlikely(&gic_arm64_2941627_erratum)) return false; range = get_intid_range(d); /* * The workaround is needed if the IRQ is an SPI and * the target cpu is different from the one we are * executing on. */ return (range == SPI_RANGE || range == ESPI_RANGE) && !cpumask_test_cpu(raw_smp_processor_id(), irq_data_get_effective_affinity_mask(d)); } static void gic_eoi_irq(struct irq_data *d) { write_gicreg(irqd_to_hwirq(d), ICC_EOIR1_EL1); isb(); if (gic_arm64_erratum_2941627_needed(d)) { /* * Make sure the GIC stream deactivate packet * issued by ICC_EOIR1_EL1 has completed before * deactivating through GICD_IACTIVER. */ dsb(sy); gic_poke_irq(d, GICD_ICACTIVER); } } static void gic_eoimode1_eoi_irq(struct irq_data *d) { /* * No need to deactivate an LPI, or an interrupt that * is is getting forwarded to a vcpu. */ if (irqd_to_hwirq(d) >= 8192 || irqd_is_forwarded_to_vcpu(d)) return; if (!gic_arm64_erratum_2941627_needed(d)) gic_write_dir(irqd_to_hwirq(d)); else gic_poke_irq(d, GICD_ICACTIVER); } static int gic_set_type(struct irq_data *d, unsigned int type) { irq_hw_number_t irq = irqd_to_hwirq(d); enum gic_intid_range range; void __iomem *base; u32 offset, index; int ret; range = get_intid_range(d); /* Interrupt configuration for SGIs can't be changed */ if (range == SGI_RANGE) return type != IRQ_TYPE_EDGE_RISING ? -EINVAL : 0; /* SPIs have restrictions on the supported types */ if ((range == SPI_RANGE || range == ESPI_RANGE) && type != IRQ_TYPE_LEVEL_HIGH && type != IRQ_TYPE_EDGE_RISING) return -EINVAL; if (gic_irq_in_rdist(d)) base = gic_data_rdist_sgi_base(); else base = gic_dist_base_alias(d); offset = convert_offset_index(d, GICD_ICFGR, &index); ret = gic_configure_irq(index, type, base + offset); if (ret && (range == PPI_RANGE || range == EPPI_RANGE)) { /* Misconfigured PPIs are usually not fatal */ pr_warn("GIC: PPI INTID%ld is secure or misconfigured\n", irq); ret = 0; } return ret; } static int gic_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu) { if (get_intid_range(d) == SGI_RANGE) return -EINVAL; if (vcpu) irqd_set_forwarded_to_vcpu(d); else irqd_clr_forwarded_to_vcpu(d); return 0; } static u64 gic_cpu_to_affinity(int cpu) { u64 mpidr = cpu_logical_map(cpu); u64 aff; /* ASR8601 needs to have its affinities shifted down... */ if (unlikely(gic_data.flags & FLAGS_WORKAROUND_ASR_ERRATUM_8601001)) mpidr = (MPIDR_AFFINITY_LEVEL(mpidr, 1) | (MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8)); aff = ((u64)MPIDR_AFFINITY_LEVEL(mpidr, 3) << 32 | MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16 | MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8 | MPIDR_AFFINITY_LEVEL(mpidr, 0)); return aff; } static void gic_deactivate_unhandled(u32 irqnr) { if (static_branch_likely(&supports_deactivate_key)) { if (irqnr < 8192) gic_write_dir(irqnr); } else { write_gicreg(irqnr, ICC_EOIR1_EL1); isb(); } } /* * Follow a read of the IAR with any HW maintenance that needs to happen prior * to invoking the relevant IRQ handler. We must do two things: * * (1) Ensure instruction ordering between a read of IAR and subsequent * instructions in the IRQ handler using an ISB. * * It is possible for the IAR to report an IRQ which was signalled *after* * the CPU took an IRQ exception as multiple interrupts can race to be * recognized by the GIC, earlier interrupts could be withdrawn, and/or * later interrupts could be prioritized by the GIC. * * For devices which are tightly coupled to the CPU, such as PMUs, a * context synchronization event is necessary to ensure that system * register state is not stale, as these may have been indirectly written * *after* exception entry. * * (2) Execute an interrupt priority drop when EOI mode 1 is in use. */ static inline void gic_complete_ack(u32 irqnr) { if (static_branch_likely(&supports_deactivate_key)) write_gicreg(irqnr, ICC_EOIR1_EL1); isb(); } static bool gic_rpr_is_nmi_prio(void) { if (!gic_supports_nmi()) return false; return unlikely(gic_read_rpr() == GICV3_PRIO_NMI); } static bool gic_irqnr_is_special(u32 irqnr) { return irqnr >= 1020 && irqnr <= 1023; } static void __gic_handle_irq(u32 irqnr, struct pt_regs *regs) { if (gic_irqnr_is_special(irqnr)) return; gic_complete_ack(irqnr); if (generic_handle_domain_irq(gic_data.domain, irqnr)) { WARN_ONCE(true, "Unexpected interrupt (irqnr %u)\n", irqnr); gic_deactivate_unhandled(irqnr); } } static void __gic_handle_nmi(u32 irqnr, struct pt_regs *regs) { if (gic_irqnr_is_special(irqnr)) return; gic_complete_ack(irqnr); if (generic_handle_domain_nmi(gic_data.domain, irqnr)) { WARN_ONCE(true, "Unexpected pseudo-NMI (irqnr %u)\n", irqnr); gic_deactivate_unhandled(irqnr); } } /* * An exception has been taken from a context with IRQs enabled, and this could * be an IRQ or an NMI. * * The entry code called us with DAIF.IF set to keep NMIs masked. We must clear * DAIF.IF (and update ICC_PMR_EL1 to mask regular IRQs) prior to returning, * after handling any NMI but before handling any IRQ. * * The entry code has performed IRQ entry, and if an NMI is detected we must * perform NMI entry/exit around invoking the handler. */ static void __gic_handle_irq_from_irqson(struct pt_regs *regs) { bool is_nmi; u32 irqnr; irqnr = gic_read_iar(); is_nmi = gic_rpr_is_nmi_prio(); if (is_nmi) { nmi_enter(); __gic_handle_nmi(irqnr, regs); nmi_exit(); } if (gic_prio_masking_enabled()) { gic_pmr_mask_irqs(); gic_arch_enable_irqs(); } if (!is_nmi) __gic_handle_irq(irqnr, regs); } /* * An exception has been taken from a context with IRQs disabled, which can only * be an NMI. * * The entry code called us with DAIF.IF set to keep NMIs masked. We must leave * DAIF.IF (and ICC_PMR_EL1) unchanged. * * The entry code has performed NMI entry. */ static void __gic_handle_irq_from_irqsoff(struct pt_regs *regs) { u64 pmr; u32 irqnr; /* * We were in a context with IRQs disabled. However, the * entry code has set PMR to a value that allows any * interrupt to be acknowledged, and not just NMIs. This can * lead to surprising effects if the NMI has been retired in * the meantime, and that there is an IRQ pending. The IRQ * would then be taken in NMI context, something that nobody * wants to debug twice. * * Until we sort this, drop PMR again to a level that will * actually only allow NMIs before reading IAR, and then * restore it to what it was. */ pmr = gic_read_pmr(); gic_pmr_mask_irqs(); isb(); irqnr = gic_read_iar(); gic_write_pmr(pmr); __gic_handle_nmi(irqnr, regs); } static void __exception_irq_entry gic_handle_irq(struct pt_regs *regs) { if (unlikely(gic_supports_nmi() && !interrupts_enabled(regs))) __gic_handle_irq_from_irqsoff(regs); else __gic_handle_irq_from_irqson(regs); } static void __init gic_dist_init(void) { unsigned int i; u64 affinity; void __iomem *base = gic_data.dist_base; u32 val; /* Disable the distributor */ writel_relaxed(0, base + GICD_CTLR); gic_dist_wait_for_rwp(); /* * Configure SPIs as non-secure Group-1. This will only matter * if the GIC only has a single security state. This will not * do the right thing if the kernel is running in secure mode, * but that's not the intended use case anyway. */ for (i = 32; i < GIC_LINE_NR; i += 32) writel_relaxed(~0, base + GICD_IGROUPR + i / 8); /* Extended SPI range, not handled by the GICv2/GICv3 common code */ for (i = 0; i < GIC_ESPI_NR; i += 32) { writel_relaxed(~0U, base + GICD_ICENABLERnE + i / 8); writel_relaxed(~0U, base + GICD_ICACTIVERnE + i / 8); } for (i = 0; i < GIC_ESPI_NR; i += 32) writel_relaxed(~0U, base + GICD_IGROUPRnE + i / 8); for (i = 0; i < GIC_ESPI_NR; i += 16) writel_relaxed(0, base + GICD_ICFGRnE + i / 4); for (i = 0; i < GIC_ESPI_NR; i += 4) writel_relaxed(REPEAT_BYTE_U32(dist_prio_irq), base + GICD_IPRIORITYRnE + i); /* Now do the common stuff */ gic_dist_config(base, GIC_LINE_NR, dist_prio_irq); val = GICD_CTLR_ARE_NS | GICD_CTLR_ENABLE_G1A | GICD_CTLR_ENABLE_G1; if (gic_data.rdists.gicd_typer2 & GICD_TYPER2_nASSGIcap) { pr_info("Enabling SGIs without active state\n"); val |= GICD_CTLR_nASSGIreq; } /* Enable distributor with ARE, Group1, and wait for it to drain */ writel_relaxed(val, base + GICD_CTLR); gic_dist_wait_for_rwp(); /* * Set all global interrupts to the boot CPU only. ARE must be * enabled. */ affinity = gic_cpu_to_affinity(smp_processor_id()); for (i = 32; i < GIC_LINE_NR; i++) gic_write_irouter(affinity, base + GICD_IROUTER + i * 8); for (i = 0; i < GIC_ESPI_NR; i++) gic_write_irouter(affinity, base + GICD_IROUTERnE + i * 8); } static int gic_iterate_rdists(int (*fn)(struct redist_region *, void __iomem *)) { int ret = -ENODEV; int i; for (i = 0; i < gic_data.nr_redist_regions; i++) { void __iomem *ptr = gic_data.redist_regions[i].redist_base; u64 typer; u32 reg; reg = readl_relaxed(ptr + GICR_PIDR2) & GIC_PIDR2_ARCH_MASK; if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4) { /* We're in trouble... */ pr_warn("No redistributor present @%p\n", ptr); break; } do { typer = gic_read_typer(ptr + GICR_TYPER); ret = fn(gic_data.redist_regions + i, ptr); if (!ret) return 0; if (gic_data.redist_regions[i].single_redist) break; if (gic_data.redist_stride) { ptr += gic_data.redist_stride; } else { ptr += SZ_64K * 2; /* Skip RD_base + SGI_base */ if (typer & GICR_TYPER_VLPIS) ptr += SZ_64K * 2; /* Skip VLPI_base + reserved page */ } } while (!(typer & GICR_TYPER_LAST)); } return ret ? -ENODEV : 0; } static int __gic_populate_rdist(struct redist_region *region, void __iomem *ptr) { unsigned long mpidr; u64 typer; u32 aff; /* * Convert affinity to a 32bit value that can be matched to * GICR_TYPER bits [63:32]. */ mpidr = gic_cpu_to_affinity(smp_processor_id()); aff = (MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24 | MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16 | MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8 | MPIDR_AFFINITY_LEVEL(mpidr, 0)); typer = gic_read_typer(ptr + GICR_TYPER); if ((typer >> 32) == aff) { u64 offset = ptr - region->redist_base; raw_spin_lock_init(&gic_data_rdist()->rd_lock); gic_data_rdist_rd_base() = ptr; gic_data_rdist()->phys_base = region->phys_base + offset; pr_info("CPU%d: found redistributor %lx region %d:%pa\n", smp_processor_id(), mpidr, (int)(region - gic_data.redist_regions), &gic_data_rdist()->phys_base); return 0; } /* Try next one */ return 1; } static int gic_populate_rdist(void) { if (gic_iterate_rdists(__gic_populate_rdist) == 0) return 0; /* We couldn't even deal with ourselves... */ WARN(true, "CPU%d: mpidr %lx has no re-distributor!\n", smp_processor_id(), (unsigned long)cpu_logical_map(smp_processor_id())); return -ENODEV; } static int __gic_update_rdist_properties(struct redist_region *region, void __iomem *ptr) { u64 typer = gic_read_typer(ptr + GICR_TYPER); u32 ctlr = readl_relaxed(ptr + GICR_CTLR); /* Boot-time cleanup */ if ((typer & GICR_TYPER_VLPIS) && (typer & GICR_TYPER_RVPEID)) { u64 val; /* Deactivate any present vPE */ val = gicr_read_vpendbaser(ptr + SZ_128K + GICR_VPENDBASER); if (val & GICR_VPENDBASER_Valid) gicr_write_vpendbaser(GICR_VPENDBASER_PendingLast, ptr + SZ_128K + GICR_VPENDBASER); /* Mark the VPE table as invalid */ val = gicr_read_vpropbaser(ptr + SZ_128K + GICR_VPROPBASER); val &= ~GICR_VPROPBASER_4_1_VALID; gicr_write_vpropbaser(val, ptr + SZ_128K + GICR_VPROPBASER); } gic_data.rdists.has_vlpis &= !!(typer & GICR_TYPER_VLPIS); /* * TYPER.RVPEID implies some form of DirectLPI, no matter what the * doc says... :-/ And CTLR.IR implies another subset of DirectLPI * that the ITS driver can make use of for LPIs (and not VLPIs). * * These are 3 different ways to express the same thing, depending * on the revision of the architecture and its relaxations over * time. Just group them under the 'direct_lpi' banner. */ gic_data.rdists.has_rvpeid &= !!(typer & GICR_TYPER_RVPEID); gic_data.rdists.has_direct_lpi &= (!!(typer & GICR_TYPER_DirectLPIS) | !!(ctlr & GICR_CTLR_IR) | gic_data.rdists.has_rvpeid); gic_data.rdists.has_vpend_valid_dirty &= !!(typer & GICR_TYPER_DIRTY); /* Detect non-sensical configurations */ if (WARN_ON_ONCE(gic_data.rdists.has_rvpeid && !gic_data.rdists.has_vlpis)) { gic_data.rdists.has_direct_lpi = false; gic_data.rdists.has_vlpis = false; gic_data.rdists.has_rvpeid = false; } gic_data.ppi_nr = min(GICR_TYPER_NR_PPIS(typer), gic_data.ppi_nr); return 1; } static void gic_update_rdist_properties(void) { gic_data.ppi_nr = UINT_MAX; gic_iterate_rdists(__gic_update_rdist_properties); if (WARN_ON(gic_data.ppi_nr == UINT_MAX)) gic_data.ppi_nr = 0; pr_info("GICv3 features: %d PPIs%s%s\n", gic_data.ppi_nr, gic_data.has_rss ? ", RSS" : "", gic_data.rdists.has_direct_lpi ? ", DirectLPI" : ""); if (gic_data.rdists.has_vlpis) pr_info("GICv4 features: %s%s%s\n", gic_data.rdists.has_direct_lpi ? "DirectLPI " : "", gic_data.rdists.has_rvpeid ? "RVPEID " : "", gic_data.rdists.has_vpend_valid_dirty ? "Valid+Dirty " : ""); } static void gic_cpu_sys_reg_enable(void) { /* * Need to check that the SRE bit has actually been set. If * not, it means that SRE is disabled at EL2. We're going to * die painfully, and there is nothing we can do about it. * * Kindly inform the luser. */ if (!gic_enable_sre()) pr_err("GIC: unable to set SRE (disabled at EL2), panic ahead\n"); } static void gic_cpu_sys_reg_init(void) { int i, cpu = smp_processor_id(); u64 mpidr = gic_cpu_to_affinity(cpu); u64 need_rss = MPIDR_RS(mpidr); bool group0; u32 pribits; pribits = gic_get_pribits(); group0 = gic_has_group0(); /* Set priority mask register */ if (!gic_prio_masking_enabled()) { write_gicreg(DEFAULT_PMR_VALUE, ICC_PMR_EL1); } else if (gic_supports_nmi()) { /* * Check that all CPUs use the same priority space. * * If there's a mismatch with the boot CPU, the system is * likely to die as interrupt masking will not work properly on * all CPUs. */ WARN_ON(group0 != cpus_have_group0); WARN_ON(gic_dist_security_disabled() != cpus_have_security_disabled); } /* * Some firmwares hand over to the kernel with the BPR changed from * its reset value (and with a value large enough to prevent * any pre-emptive interrupts from working at all). Writing a zero * to BPR restores is reset value. */ gic_write_bpr1(0); if (static_branch_likely(&supports_deactivate_key)) { /* EOI drops priority only (mode 1) */ gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop); } else { /* EOI deactivates interrupt too (mode 0) */ gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop_dir); } /* Always whack Group0 before Group1 */ if (group0) { switch(pribits) { case 8: case 7: write_gicreg(0, ICC_AP0R3_EL1); write_gicreg(0, ICC_AP0R2_EL1); fallthrough; case 6: write_gicreg(0, ICC_AP0R1_EL1); fallthrough; case 5: case 4: write_gicreg(0, ICC_AP0R0_EL1); } isb(); } switch(pribits) { case 8: case 7: write_gicreg(0, ICC_AP1R3_EL1); write_gicreg(0, ICC_AP1R2_EL1); fallthrough; case 6: write_gicreg(0, ICC_AP1R1_EL1); fallthrough; case 5: case 4: write_gicreg(0, ICC_AP1R0_EL1); } isb(); /* ... and let's hit the road... */ gic_write_grpen1(1); /* Keep the RSS capability status in per_cpu variable */ per_cpu(has_rss, cpu) = !!(gic_read_ctlr() & ICC_CTLR_EL1_RSS); /* Check all the CPUs have capable of sending SGIs to other CPUs */ for_each_online_cpu(i) { bool have_rss = per_cpu(has_rss, i) && per_cpu(has_rss, cpu); need_rss |= MPIDR_RS(gic_cpu_to_affinity(i)); if (need_rss && (!have_rss)) pr_crit("CPU%d (%lx) can't SGI CPU%d (%lx), no RSS\n", cpu, (unsigned long)mpidr, i, (unsigned long)gic_cpu_to_affinity(i)); } /** * GIC spec says, when ICC_CTLR_EL1.RSS==1 and GICD_TYPER.RSS==0, * writing ICC_ASGI1R_EL1 register with RS != 0 is a CONSTRAINED * UNPREDICTABLE choice of : * - The write is ignored. * - The RS field is treated as 0. */ if (need_rss && (!gic_data.has_rss)) pr_crit_once("RSS is required but GICD doesn't support it\n"); } static bool gicv3_nolpi; static int __init gicv3_nolpi_cfg(char *buf) { return kstrtobool(buf, &gicv3_nolpi); } early_param("irqchip.gicv3_nolpi", gicv3_nolpi_cfg); static int gic_dist_supports_lpis(void) { return (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && !!(readl_relaxed(gic_data.dist_base + GICD_TYPER) & GICD_TYPER_LPIS) && !gicv3_nolpi); } static void gic_cpu_init(void) { void __iomem *rbase; int i; /* Register ourselves with the rest of the world */ if (gic_populate_rdist()) return; gic_enable_redist(true); WARN((gic_data.ppi_nr > 16 || GIC_ESPI_NR != 0) && !(gic_read_ctlr() & ICC_CTLR_EL1_ExtRange), "Distributor has extended ranges, but CPU%d doesn't\n", smp_processor_id()); rbase = gic_data_rdist_sgi_base(); /* Configure SGIs/PPIs as non-secure Group-1 */ for (i = 0; i < gic_data.ppi_nr + SGI_NR; i += 32) writel_relaxed(~0, rbase + GICR_IGROUPR0 + i / 8); gic_cpu_config(rbase, gic_data.ppi_nr + SGI_NR, dist_prio_irq); gic_redist_wait_for_rwp(); /* initialise system registers */ gic_cpu_sys_reg_init(); } #ifdef CONFIG_SMP #define MPIDR_TO_SGI_RS(mpidr) (MPIDR_RS(mpidr) << ICC_SGI1R_RS_SHIFT) #define MPIDR_TO_SGI_CLUSTER_ID(mpidr) ((mpidr) & ~0xFUL) /* * gic_starting_cpu() is called after the last point where cpuhp is allowed * to fail. So pre check for problems earlier. */ static int gic_check_rdist(unsigned int cpu) { if (cpumask_test_cpu(cpu, &broken_rdists)) return -EINVAL; return 0; } static int gic_starting_cpu(unsigned int cpu) { gic_cpu_sys_reg_enable(); gic_cpu_init(); if (gic_dist_supports_lpis()) its_cpu_init(); return 0; } static u16 gic_compute_target_list(int *base_cpu, const struct cpumask *mask, unsigned long cluster_id) { int next_cpu, cpu = *base_cpu; unsigned long mpidr; u16 tlist = 0; mpidr = gic_cpu_to_affinity(cpu); while (cpu < nr_cpu_ids) { tlist |= 1 << (mpidr & 0xf); next_cpu = cpumask_next(cpu, mask); if (next_cpu >= nr_cpu_ids) goto out; cpu = next_cpu; mpidr = gic_cpu_to_affinity(cpu); if (cluster_id != MPIDR_TO_SGI_CLUSTER_ID(mpidr)) { cpu--; goto out; } } out: *base_cpu = cpu; return tlist; } #define MPIDR_TO_SGI_AFFINITY(cluster_id, level) \ (MPIDR_AFFINITY_LEVEL(cluster_id, level) \ << ICC_SGI1R_AFFINITY_## level ##_SHIFT) static void gic_send_sgi(u64 cluster_id, u16 tlist, unsigned int irq) { u64 val; val = (MPIDR_TO_SGI_AFFINITY(cluster_id, 3) | MPIDR_TO_SGI_AFFINITY(cluster_id, 2) | irq << ICC_SGI1R_SGI_ID_SHIFT | MPIDR_TO_SGI_AFFINITY(cluster_id, 1) | MPIDR_TO_SGI_RS(cluster_id) | tlist << ICC_SGI1R_TARGET_LIST_SHIFT); pr_devel("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val); gic_write_sgi1r(val); } static void gic_ipi_send_mask(struct irq_data *d, const struct cpumask *mask) { int cpu; if (WARN_ON(d->hwirq >= 16)) return; /* * Ensure that stores to Normal memory are visible to the * other CPUs before issuing the IPI. */ dsb(ishst); for_each_cpu(cpu, mask) { u64 cluster_id = MPIDR_TO_SGI_CLUSTER_ID(gic_cpu_to_affinity(cpu)); u16 tlist; tlist = gic_compute_target_list(&cpu, mask, cluster_id); gic_send_sgi(cluster_id, tlist, d->hwirq); } /* Force the above writes to ICC_SGI1R_EL1 to be executed */ isb(); } static void __init gic_smp_init(void) { struct irq_fwspec sgi_fwspec = { .fwnode = gic_data.fwnode, .param_count = 1, }; int base_sgi; cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN, "irqchip/arm/gicv3:checkrdist", gic_check_rdist, NULL); cpuhp_setup_state_nocalls(CPUHP_AP_IRQ_GIC_STARTING, "irqchip/arm/gicv3:starting", gic_starting_cpu, NULL); /* Register all 8 non-secure SGIs */ base_sgi = irq_domain_alloc_irqs(gic_data.domain, 8, NUMA_NO_NODE, &sgi_fwspec); if (WARN_ON(base_sgi <= 0)) return; set_smp_ipi_range(base_sgi, 8); } static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, bool force) { unsigned int cpu; u32 offset, index; void __iomem *reg; int enabled; u64 val; if (force) cpu = cpumask_first(mask_val); else cpu = cpumask_any_and(mask_val, cpu_online_mask); if (cpu >= nr_cpu_ids) return -EINVAL; if (gic_irq_in_rdist(d)) return -EINVAL; /* If interrupt was enabled, disable it first */ enabled = gic_peek_irq(d, GICD_ISENABLER); if (enabled) gic_mask_irq(d); offset = convert_offset_index(d, GICD_IROUTER, &index); reg = gic_dist_base(d) + offset + (index * 8); val = gic_cpu_to_affinity(cpu); gic_write_irouter(val, reg); /* * If the interrupt was enabled, enabled it again. Otherwise, * just wait for the distributor to have digested our changes. */ if (enabled) gic_unmask_irq(d); irq_data_update_effective_affinity(d, cpumask_of(cpu)); return IRQ_SET_MASK_OK_DONE; } #else #define gic_set_affinity NULL #define gic_ipi_send_mask NULL #define gic_smp_init() do { } while(0) #endif static int gic_retrigger(struct irq_data *data) { return !gic_irq_set_irqchip_state(data, IRQCHIP_STATE_PENDING, true); } #ifdef CONFIG_CPU_PM static int gic_cpu_pm_notifier(struct notifier_block *self, unsigned long cmd, void *v) { if (cmd == CPU_PM_EXIT || cmd == CPU_PM_ENTER_FAILED) { if (gic_dist_security_disabled()) gic_enable_redist(true); gic_cpu_sys_reg_enable(); gic_cpu_sys_reg_init(); } else if (cmd == CPU_PM_ENTER && gic_dist_security_disabled()) { gic_write_grpen1(0); gic_enable_redist(false); } return NOTIFY_OK; } static struct notifier_block gic_cpu_pm_notifier_block = { .notifier_call = gic_cpu_pm_notifier, }; static void gic_cpu_pm_init(void) { cpu_pm_register_notifier(&gic_cpu_pm_notifier_block); } #else static inline void gic_cpu_pm_init(void) { } #endif /* CONFIG_CPU_PM */ static struct irq_chip gic_chip = { .name = "GICv3", .irq_mask = gic_mask_irq, .irq_unmask = gic_unmask_irq, .irq_eoi = gic_eoi_irq, .irq_set_type = gic_set_type, .irq_set_affinity = gic_set_affinity, .irq_retrigger = gic_retrigger, .irq_get_irqchip_state = gic_irq_get_irqchip_state, .irq_set_irqchip_state = gic_irq_set_irqchip_state, .irq_nmi_setup = gic_irq_nmi_setup, .irq_nmi_teardown = gic_irq_nmi_teardown, .ipi_send_mask = gic_ipi_send_mask, .flags = IRQCHIP_SET_TYPE_MASKED | IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MASK_ON_SUSPEND, }; static struct irq_chip gic_eoimode1_chip = { .name = "GICv3", .irq_mask = gic_eoimode1_mask_irq, .irq_unmask = gic_unmask_irq, .irq_eoi = gic_eoimode1_eoi_irq, .irq_set_type = gic_set_type, .irq_set_affinity = gic_set_affinity, .irq_retrigger = gic_retrigger, .irq_get_irqchip_state = gic_irq_get_irqchip_state, .irq_set_irqchip_state = gic_irq_set_irqchip_state, .irq_set_vcpu_affinity = gic_irq_set_vcpu_affinity, .irq_nmi_setup = gic_irq_nmi_setup, .irq_nmi_teardown = gic_irq_nmi_teardown, .ipi_send_mask = gic_ipi_send_mask, .flags = IRQCHIP_SET_TYPE_MASKED | IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MASK_ON_SUSPEND, }; static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) { struct irq_chip *chip = &gic_chip; struct irq_data *irqd = irq_desc_get_irq_data(irq_to_desc(irq)); if (static_branch_likely(&supports_deactivate_key)) chip = &gic_eoimode1_chip; switch (__get_intid_range(hw)) { case SGI_RANGE: case PPI_RANGE: case EPPI_RANGE: irq_set_percpu_devid(irq); irq_domain_set_info(d, irq, hw, chip, d->host_data, handle_percpu_devid_irq, NULL, NULL); break; case SPI_RANGE: case ESPI_RANGE: irq_domain_set_info(d, irq, hw, chip, d->host_data, handle_fasteoi_irq, NULL, NULL); irq_set_probe(irq); irqd_set_single_target(irqd); break; case LPI_RANGE: if (!gic_dist_supports_lpis()) return -EPERM; irq_domain_set_info(d, irq, hw, chip, d->host_data, handle_fasteoi_irq, NULL, NULL); break; default: return -EPERM; } /* Prevents SW retriggers which mess up the ACK/EOI ordering */ irqd_set_handle_enforce_irqctx(irqd); return 0; } static int gic_irq_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *hwirq, unsigned int *type) { if (fwspec->param_count == 1 && fwspec->param[0] < 16) { *hwirq = fwspec->param[0]; *type = IRQ_TYPE_EDGE_RISING; return 0; } if (is_of_node(fwspec->fwnode)) { if (fwspec->param_count < 3) return -EINVAL; switch (fwspec->param[0]) { case 0: /* SPI */ *hwirq = fwspec->param[1] + 32; break; case 1: /* PPI */ *hwirq = fwspec->param[1] + 16; break; case 2: /* ESPI */ *hwirq = fwspec->param[1] + ESPI_BASE_INTID; break; case 3: /* EPPI */ *hwirq = fwspec->param[1] + EPPI_BASE_INTID; break; case GIC_IRQ_TYPE_LPI: /* LPI */ *hwirq = fwspec->param[1]; break; case GIC_IRQ_TYPE_PARTITION: *hwirq = fwspec->param[1]; if (fwspec->param[1] >= 16) *hwirq += EPPI_BASE_INTID - 16; else *hwirq += 16; break; default: return -EINVAL; } *type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; /* * Make it clear that broken DTs are... broken. * Partitioned PPIs are an unfortunate exception. */ WARN_ON(*type == IRQ_TYPE_NONE && fwspec->param[0] != GIC_IRQ_TYPE_PARTITION); return 0; } if (is_fwnode_irqchip(fwspec->fwnode)) { if(fwspec->param_count != 2) return -EINVAL; if (fwspec->param[0] < 16) { pr_err(FW_BUG "Illegal GSI%d translation request\n", fwspec->param[0]); return -EINVAL; } *hwirq = fwspec->param[0]; *type = fwspec->param[1]; WARN_ON(*type == IRQ_TYPE_NONE); return 0; } return -EINVAL; } static int gic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { int i, ret; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; struct irq_fwspec *fwspec = arg; ret = gic_irq_domain_translate(domain, fwspec, &hwirq, &type); if (ret) return ret; for (i = 0; i < nr_irqs; i++) { ret = gic_irq_domain_map(domain, virq + i, hwirq + i); if (ret) return ret; } return 0; } static void gic_irq_domain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { int i; for (i = 0; i < nr_irqs; i++) { struct irq_data *d = irq_domain_get_irq_data(domain, virq + i); irq_set_handler(virq + i, NULL); irq_domain_reset_irq_data(d); } } static bool fwspec_is_partitioned_ppi(struct irq_fwspec *fwspec, irq_hw_number_t hwirq) { enum gic_intid_range range; if (!gic_data.ppi_descs) return false; if (!is_of_node(fwspec->fwnode)) return false; if (fwspec->param_count < 4 || !fwspec->param[3]) return false; range = __get_intid_range(hwirq); if (range != PPI_RANGE && range != EPPI_RANGE) return false; return true; } static int gic_irq_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { unsigned int type, ret, ppi_idx; irq_hw_number_t hwirq; /* Not for us */ if (fwspec->fwnode != d->fwnode) return 0; /* Handle pure domain searches */ if (!fwspec->param_count) return d->bus_token == bus_token; /* If this is not DT, then we have a single domain */ if (!is_of_node(fwspec->fwnode)) return 1; ret = gic_irq_domain_translate(d, fwspec, &hwirq, &type); if (WARN_ON_ONCE(ret)) return 0; if (!fwspec_is_partitioned_ppi(fwspec, hwirq)) return d == gic_data.domain; /* * If this is a PPI and we have a 4th (non-null) parameter, * then we need to match the partition domain. */ ppi_idx = __gic_get_ppi_index(hwirq); return d == partition_get_domain(gic_data.ppi_descs[ppi_idx]); } static const struct irq_domain_ops gic_irq_domain_ops = { .translate = gic_irq_domain_translate, .alloc = gic_irq_domain_alloc, .free = gic_irq_domain_free, .select = gic_irq_domain_select, }; static int partition_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *hwirq, unsigned int *type) { unsigned long ppi_intid; struct device_node *np; unsigned int ppi_idx; int ret; if (!gic_data.ppi_descs) return -ENOMEM; np = of_find_node_by_phandle(fwspec->param[3]); if (WARN_ON(!np)) return -EINVAL; ret = gic_irq_domain_translate(d, fwspec, &ppi_intid, type); if (WARN_ON_ONCE(ret)) return 0; ppi_idx = __gic_get_ppi_index(ppi_intid); ret = partition_translate_id(gic_data.ppi_descs[ppi_idx], of_fwnode_handle(np)); if (ret < 0) return ret; *hwirq = ret; *type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; return 0; } static const struct irq_domain_ops partition_domain_ops = { .translate = partition_domain_translate, .select = gic_irq_domain_select, }; static bool gic_enable_quirk_msm8996(void *data) { struct gic_chip_data *d = data; d->flags |= FLAGS_WORKAROUND_GICR_WAKER_MSM8996; return true; } static bool gic_enable_quirk_cavium_38539(void *data) { struct gic_chip_data *d = data; d->flags |= FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539; return true; } static bool gic_enable_quirk_hip06_07(void *data) { struct gic_chip_data *d = data; /* * HIP06 GICD_IIDR clashes with GIC-600 product number (despite * not being an actual ARM implementation). The saving grace is * that GIC-600 doesn't have ESPI, so nothing to do in that case. * HIP07 doesn't even have a proper IIDR, and still pretends to * have ESPI. In both cases, put them right. */ if (d->rdists.gicd_typer & GICD_TYPER_ESPI) { /* Zero both ESPI and the RES0 field next to it... */ d->rdists.gicd_typer &= ~GENMASK(9, 8); return true; } return false; } #define T241_CHIPN_MASK GENMASK_ULL(45, 44) #define T241_CHIP_GICDA_OFFSET 0x1580000 #define SMCCC_SOC_ID_T241 0x036b0241 static bool gic_enable_quirk_nvidia_t241(void *data) { s32 soc_id = arm_smccc_get_soc_id_version(); unsigned long chip_bmask = 0; phys_addr_t phys; u32 i; /* Check JEP106 code for NVIDIA T241 chip (036b:0241) */ if ((soc_id < 0) || (soc_id != SMCCC_SOC_ID_T241)) return false; /* Find the chips based on GICR regions PHYS addr */ for (i = 0; i < gic_data.nr_redist_regions; i++) { chip_bmask |= BIT(FIELD_GET(T241_CHIPN_MASK, (u64)gic_data.redist_regions[i].phys_base)); } if (hweight32(chip_bmask) < 3) return false; /* Setup GICD alias regions */ for (i = 0; i < ARRAY_SIZE(t241_dist_base_alias); i++) { if (chip_bmask & BIT(i)) { phys = gic_data.dist_phys_base + T241_CHIP_GICDA_OFFSET; phys |= FIELD_PREP(T241_CHIPN_MASK, i); t241_dist_base_alias[i] = ioremap(phys, SZ_64K); WARN_ON_ONCE(!t241_dist_base_alias[i]); } } static_branch_enable(&gic_nvidia_t241_erratum); return true; } static bool gic_enable_quirk_asr8601(void *data) { struct gic_chip_data *d = data; d->flags |= FLAGS_WORKAROUND_ASR_ERRATUM_8601001; return true; } static bool gic_enable_quirk_arm64_2941627(void *data) { static_branch_enable(&gic_arm64_2941627_erratum); return true; } static bool gic_enable_quirk_rk3399(void *data) { struct gic_chip_data *d = data; if (of_machine_is_compatible("rockchip,rk3399")) { d->flags |= FLAGS_WORKAROUND_INSECURE; return true; } return false; } static bool rd_set_non_coherent(void *data) { struct gic_chip_data *d = data; d->rdists.flags |= RDIST_FLAGS_FORCE_NON_SHAREABLE; return true; } static const struct gic_quirk gic_quirks[] = { { .desc = "GICv3: Qualcomm MSM8996 broken firmware", .compatible = "qcom,msm8996-gic-v3", .init = gic_enable_quirk_msm8996, }, { .desc = "GICv3: ASR erratum 8601001", .compatible = "asr,asr8601-gic-v3", .init = gic_enable_quirk_asr8601, }, { .desc = "GICv3: HIP06 erratum 161010803", .iidr = 0x0204043b, .mask = 0xffffffff, .init = gic_enable_quirk_hip06_07, }, { .desc = "GICv3: HIP07 erratum 161010803", .iidr = 0x00000000, .mask = 0xffffffff, .init = gic_enable_quirk_hip06_07, }, { /* * Reserved register accesses generate a Synchronous * External Abort. This erratum applies to: * - ThunderX: CN88xx * - OCTEON TX: CN83xx, CN81xx * - OCTEON TX2: CN93xx, CN96xx, CN98xx, CNF95xx* */ .desc = "GICv3: Cavium erratum 38539", .iidr = 0xa000034c, .mask = 0xe8f00fff, .init = gic_enable_quirk_cavium_38539, }, { .desc = "GICv3: NVIDIA erratum T241-FABRIC-4", .iidr = 0x0402043b, .mask = 0xffffffff, .init = gic_enable_quirk_nvidia_t241, }, { /* * GIC-700: 2941627 workaround - IP variant [0,1] * */ .desc = "GICv3: ARM64 erratum 2941627", .iidr = 0x0400043b, .mask = 0xff0e0fff, .init = gic_enable_quirk_arm64_2941627, }, { /* * GIC-700: 2941627 workaround - IP variant [2] */ .desc = "GICv3: ARM64 erratum 2941627", .iidr = 0x0402043b, .mask = 0xff0f0fff, .init = gic_enable_quirk_arm64_2941627, }, { .desc = "GICv3: non-coherent attribute", .property = "dma-noncoherent", .init = rd_set_non_coherent, }, { .desc = "GICv3: Insecure RK3399 integration", .iidr = 0x0000043b, .mask = 0xff000fff, .init = gic_enable_quirk_rk3399, }, { } }; static void gic_enable_nmi_support(void) { int i; if (!gic_prio_masking_enabled() || nmi_support_forbidden) return; rdist_nmi_refs = kcalloc(gic_data.ppi_nr + SGI_NR, sizeof(*rdist_nmi_refs), GFP_KERNEL); if (!rdist_nmi_refs) return; for (i = 0; i < gic_data.ppi_nr + SGI_NR; i++) refcount_set(&rdist_nmi_refs[i], 0); pr_info("Pseudo-NMIs enabled using %s ICC_PMR_EL1 synchronisation\n", gic_has_relaxed_pmr_sync() ? "relaxed" : "forced"); static_branch_enable(&supports_pseudo_nmis); if (static_branch_likely(&supports_deactivate_key)) gic_eoimode1_chip.flags |= IRQCHIP_SUPPORTS_NMI; else gic_chip.flags |= IRQCHIP_SUPPORTS_NMI; } static int __init gic_init_bases(phys_addr_t dist_phys_base, void __iomem *dist_base, struct redist_region *rdist_regs, u32 nr_redist_regions, u64 redist_stride, struct fwnode_handle *handle) { u32 typer; int err; if (!is_hyp_mode_available()) static_branch_disable(&supports_deactivate_key); if (static_branch_likely(&supports_deactivate_key)) pr_info("GIC: Using split EOI/Deactivate mode\n"); gic_data.fwnode = handle; gic_data.dist_phys_base = dist_phys_base; gic_data.dist_base = dist_base; gic_data.redist_regions = rdist_regs; gic_data.nr_redist_regions = nr_redist_regions; gic_data.redist_stride = redist_stride; /* * Find out how many interrupts are supported. */ typer = readl_relaxed(gic_data.dist_base + GICD_TYPER); gic_data.rdists.gicd_typer = typer; gic_enable_quirks(readl_relaxed(gic_data.dist_base + GICD_IIDR), gic_quirks, &gic_data); pr_info("%d SPIs implemented\n", GIC_LINE_NR - 32); pr_info("%d Extended SPIs implemented\n", GIC_ESPI_NR); /* * ThunderX1 explodes on reading GICD_TYPER2, in violation of the * architecture spec (which says that reserved registers are RES0). */ if (!(gic_data.flags & FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539)) gic_data.rdists.gicd_typer2 = readl_relaxed(gic_data.dist_base + GICD_TYPER2); gic_data.domain = irq_domain_create_tree(handle, &gic_irq_domain_ops, &gic_data); gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist)); if (!static_branch_unlikely(&gic_nvidia_t241_erratum)) { /* Disable GICv4.x features for the erratum T241-FABRIC-4 */ gic_data.rdists.has_rvpeid = true; gic_data.rdists.has_vlpis = true; gic_data.rdists.has_direct_lpi = true; gic_data.rdists.has_vpend_valid_dirty = true; } if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) { err = -ENOMEM; goto out_free; } irq_domain_update_bus_token(gic_data.domain, DOMAIN_BUS_WIRED); gic_data.has_rss = !!(typer & GICD_TYPER_RSS); if (typer & GICD_TYPER_MBIS) { err = mbi_init(handle, gic_data.domain); if (err) pr_err("Failed to initialize MBIs\n"); } set_handle_irq(gic_handle_irq); gic_update_rdist_properties(); gic_cpu_sys_reg_enable(); gic_prio_init(); gic_dist_init(); gic_cpu_init(); gic_enable_nmi_support(); gic_smp_init(); gic_cpu_pm_init(); if (gic_dist_supports_lpis()) { its_init(handle, &gic_data.rdists, gic_data.domain, dist_prio_irq); its_cpu_init(); its_lpi_memreserve_init(); } else { if (IS_ENABLED(CONFIG_ARM_GIC_V2M)) gicv2m_init(handle, gic_data.domain); } return 0; out_free: if (gic_data.domain) irq_domain_remove(gic_data.domain); free_percpu(gic_data.rdists.rdist); return err; } static int __init gic_validate_dist_version(void __iomem *dist_base) { u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4) return -ENODEV; return 0; } /* Create all possible partitions at boot time */ static void __init gic_populate_ppi_partitions(struct device_node *gic_node) { struct device_node *parts_node, *child_part; int part_idx = 0, i; int nr_parts; struct partition_affinity *parts; parts_node = of_get_child_by_name(gic_node, "ppi-partitions"); if (!parts_node) return; gic_data.ppi_descs = kcalloc(gic_data.ppi_nr, sizeof(*gic_data.ppi_descs), GFP_KERNEL); if (!gic_data.ppi_descs) goto out_put_node; nr_parts = of_get_child_count(parts_node); if (!nr_parts) goto out_put_node; parts = kcalloc(nr_parts, sizeof(*parts), GFP_KERNEL); if (WARN_ON(!parts)) goto out_put_node; for_each_child_of_node(parts_node, child_part) { struct partition_affinity *part; int n; part = &parts[part_idx]; part->partition_id = of_fwnode_handle(child_part); pr_info("GIC: PPI partition %pOFn[%d] { ", child_part, part_idx); n = of_property_count_elems_of_size(child_part, "affinity", sizeof(u32)); WARN_ON(n <= 0); for (i = 0; i < n; i++) { int err, cpu; u32 cpu_phandle; struct device_node *cpu_node; err = of_property_read_u32_index(child_part, "affinity", i, &cpu_phandle); if (WARN_ON(err)) continue; cpu_node = of_find_node_by_phandle(cpu_phandle); if (WARN_ON(!cpu_node)) continue; cpu = of_cpu_node_to_id(cpu_node); if (WARN_ON(cpu < 0)) { of_node_put(cpu_node); continue; } pr_cont("%pOF[%d] ", cpu_node, cpu); cpumask_set_cpu(cpu, &part->mask); of_node_put(cpu_node); } pr_cont("}\n"); part_idx++; } for (i = 0; i < gic_data.ppi_nr; i++) { unsigned int irq; struct partition_desc *desc; struct irq_fwspec ppi_fwspec = { .fwnode = gic_data.fwnode, .param_count = 3, .param = { [0] = GIC_IRQ_TYPE_PARTITION, [1] = i, [2] = IRQ_TYPE_NONE, }, }; irq = irq_create_fwspec_mapping(&ppi_fwspec); if (WARN_ON(!irq)) continue; desc = partition_create_desc(gic_data.fwnode, parts, nr_parts, irq, &partition_domain_ops); if (WARN_ON(!desc)) continue; gic_data.ppi_descs[i] = desc; } out_put_node: of_node_put(parts_node); } static void __init gic_of_setup_kvm_info(struct device_node *node, u32 nr_redist_regions) { int ret; struct resource r; gic_v3_kvm_info.type = GIC_V3; gic_v3_kvm_info.maint_irq = irq_of_parse_and_map(node, 0); if (!gic_v3_kvm_info.maint_irq) return; /* Also skip GICD, GICC, GICH */ ret = of_address_to_resource(node, nr_redist_regions + 3, &r); if (!ret) gic_v3_kvm_info.vcpu = r; gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis; gic_v3_kvm_info.has_v4_1 = gic_data.rdists.has_rvpeid; vgic_set_kvm_info(&gic_v3_kvm_info); } static void gic_request_region(resource_size_t base, resource_size_t size, const char *name) { if (!request_mem_region(base, size, name)) pr_warn_once(FW_BUG "%s region %pa has overlapping address\n", name, &base); } static void __iomem *gic_of_iomap(struct device_node *node, int idx, const char *name, struct resource *res) { void __iomem *base; int ret; ret = of_address_to_resource(node, idx, res); if (ret) return IOMEM_ERR_PTR(ret); gic_request_region(res->start, resource_size(res), name); base = of_iomap(node, idx); return base ?: IOMEM_ERR_PTR(-ENOMEM); } static int __init gic_of_init(struct device_node *node, struct device_node *parent) { phys_addr_t dist_phys_base; void __iomem *dist_base; struct redist_region *rdist_regs; struct resource res; u64 redist_stride; u32 nr_redist_regions; int err, i; dist_base = gic_of_iomap(node, 0, "GICD", &res); if (IS_ERR(dist_base)) { pr_err("%pOF: unable to map gic dist registers\n", node); return PTR_ERR(dist_base); } dist_phys_base = res.start; err = gic_validate_dist_version(dist_base); if (err) { pr_err("%pOF: no distributor detected, giving up\n", node); goto out_unmap_dist; } if (of_property_read_u32(node, "#redistributor-regions", &nr_redist_regions)) nr_redist_regions = 1; rdist_regs = kcalloc(nr_redist_regions, sizeof(*rdist_regs), GFP_KERNEL); if (!rdist_regs) { err = -ENOMEM; goto out_unmap_dist; } for (i = 0; i < nr_redist_regions; i++) { rdist_regs[i].redist_base = gic_of_iomap(node, 1 + i, "GICR", &res); if (IS_ERR(rdist_regs[i].redist_base)) { pr_err("%pOF: couldn't map region %d\n", node, i); err = -ENODEV; goto out_unmap_rdist; } rdist_regs[i].phys_base = res.start; } if (of_property_read_u64(node, "redistributor-stride", &redist_stride)) redist_stride = 0; gic_enable_of_quirks(node, gic_quirks, &gic_data); err = gic_init_bases(dist_phys_base, dist_base, rdist_regs, nr_redist_regions, redist_stride, &node->fwnode); if (err) goto out_unmap_rdist; gic_populate_ppi_partitions(node); if (static_branch_likely(&supports_deactivate_key)) gic_of_setup_kvm_info(node, nr_redist_regions); return 0; out_unmap_rdist: for (i = 0; i < nr_redist_regions; i++) if (rdist_regs[i].redist_base && !IS_ERR(rdist_regs[i].redist_base)) iounmap(rdist_regs[i].redist_base); kfree(rdist_regs); out_unmap_dist: iounmap(dist_base); return err; } IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init); #ifdef CONFIG_ACPI static struct { void __iomem *dist_base; struct redist_region *redist_regs; u32 nr_redist_regions; bool single_redist; int enabled_rdists; u32 maint_irq; int maint_irq_mode; phys_addr_t vcpu_base; } acpi_data __initdata; static void __init gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base) { static int count = 0; acpi_data.redist_regs[count].phys_base = phys_base; acpi_data.redist_regs[count].redist_base = redist_base; acpi_data.redist_regs[count].single_redist = acpi_data.single_redist; count++; } static int __init gic_acpi_parse_madt_redist(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_redistributor *redist = (struct acpi_madt_generic_redistributor *)header; void __iomem *redist_base; redist_base = ioremap(redist->base_address, redist->length); if (!redist_base) { pr_err("Couldn't map GICR region @%llx\n", redist->base_address); return -ENOMEM; } if (acpi_get_madt_revision() >= 7 && (redist->flags & ACPI_MADT_GICR_NON_COHERENT)) gic_data.rdists.flags |= RDIST_FLAGS_FORCE_NON_SHAREABLE; gic_request_region(redist->base_address, redist->length, "GICR"); gic_acpi_register_redist(redist->base_address, redist_base); return 0; } static int __init gic_acpi_parse_madt_gicc(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *gicc = (struct acpi_madt_generic_interrupt *)header; u32 reg = readl_relaxed(acpi_data.dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2; void __iomem *redist_base; /* Neither enabled or online capable means it doesn't exist, skip it */ if (!(gicc->flags & (ACPI_MADT_ENABLED | ACPI_MADT_GICC_ONLINE_CAPABLE))) return 0; /* * Capable but disabled CPUs can be brought online later. What about * the redistributor? ACPI doesn't want to say! * Virtual hotplug systems can use the MADT's "always-on" GICR entries. * Otherwise, prevent such CPUs from being brought online. */ if (!(gicc->flags & ACPI_MADT_ENABLED)) { int cpu = get_cpu_for_acpi_id(gicc->uid); pr_warn("CPU %u's redistributor is inaccessible: this CPU can't be brought online\n", cpu); if (cpu >= 0) cpumask_set_cpu(cpu, &broken_rdists); return 0; } redist_base = ioremap(gicc->gicr_base_address, size); if (!redist_base) return -ENOMEM; gic_request_region(gicc->gicr_base_address, size, "GICR"); if (acpi_get_madt_revision() >= 7 && (gicc->flags & ACPI_MADT_GICC_NON_COHERENT)) gic_data.rdists.flags |= RDIST_FLAGS_FORCE_NON_SHAREABLE; gic_acpi_register_redist(gicc->gicr_base_address, redist_base); return 0; } static int __init gic_acpi_collect_gicr_base(void) { acpi_tbl_entry_handler redist_parser; enum acpi_madt_type type; if (acpi_data.single_redist) { type = ACPI_MADT_TYPE_GENERIC_INTERRUPT; redist_parser = gic_acpi_parse_madt_gicc; } else { type = ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR; redist_parser = gic_acpi_parse_madt_redist; } /* Collect redistributor base addresses in GICR entries */ if (acpi_table_parse_madt(type, redist_parser, 0) > 0) return 0; pr_info("No valid GICR entries exist\n"); return -ENODEV; } static int __init gic_acpi_match_gicr(union acpi_subtable_headers *header, const unsigned long end) { /* Subtable presence means that redist exists, that's it */ return 0; } static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *gicc = (struct acpi_madt_generic_interrupt *)header; /* * If GICC is enabled and has valid gicr base address, then it means * GICR base is presented via GICC. The redistributor is only known to * be accessible if the GICC is marked as enabled. If this bit is not * set, we'd need to add the redistributor at runtime, which isn't * supported. */ if (gicc->flags & ACPI_MADT_ENABLED && gicc->gicr_base_address) acpi_data.enabled_rdists++; return 0; } static int __init gic_acpi_count_gicr_regions(void) { int count; /* * Count how many redistributor regions we have. It is not allowed * to mix redistributor description, GICR and GICC subtables have to be * mutually exclusive. */ count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, gic_acpi_match_gicr, 0); if (count > 0) { acpi_data.single_redist = false; return count; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, gic_acpi_match_gicc, 0); if (count > 0) { acpi_data.single_redist = true; count = acpi_data.enabled_rdists; } return count; } static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, struct acpi_probe_entry *ape) { struct acpi_madt_generic_distributor *dist; int count; dist = (struct acpi_madt_generic_distributor *)header; if (dist->version != ape->driver_data) return false; /* We need to do that exercise anyway, the sooner the better */ count = gic_acpi_count_gicr_regions(); if (count <= 0) return false; acpi_data.nr_redist_regions = count; return true; } static int __init gic_acpi_parse_virt_madt_gicc(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_interrupt *gicc = (struct acpi_madt_generic_interrupt *)header; int maint_irq_mode; static int first_madt = true; if (!(gicc->flags & (ACPI_MADT_ENABLED | ACPI_MADT_GICC_ONLINE_CAPABLE))) return 0; maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ? ACPI_EDGE_SENSITIVE : ACPI_LEVEL_SENSITIVE; if (first_madt) { first_madt = false; acpi_data.maint_irq = gicc->vgic_interrupt; acpi_data.maint_irq_mode = maint_irq_mode; acpi_data.vcpu_base = gicc->gicv_base_address; return 0; } /* * The maintenance interrupt and GICV should be the same for every CPU */ if ((acpi_data.maint_irq != gicc->vgic_interrupt) || (acpi_data.maint_irq_mode != maint_irq_mode) || (acpi_data.vcpu_base != gicc->gicv_base_address)) return -EINVAL; return 0; } static bool __init gic_acpi_collect_virt_info(void) { int count; count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, gic_acpi_parse_virt_madt_gicc, 0); return (count > 0); } #define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K) #define ACPI_GICV2_VCTRL_MEM_SIZE (SZ_4K) #define ACPI_GICV2_VCPU_MEM_SIZE (SZ_8K) static void __init gic_acpi_setup_kvm_info(void) { int irq; if (!gic_acpi_collect_virt_info()) { pr_warn("Unable to get hardware information used for virtualization\n"); return; } gic_v3_kvm_info.type = GIC_V3; irq = acpi_register_gsi(NULL, acpi_data.maint_irq, acpi_data.maint_irq_mode, ACPI_ACTIVE_HIGH); if (irq <= 0) return; gic_v3_kvm_info.maint_irq = irq; if (acpi_data.vcpu_base) { struct resource *vcpu = &gic_v3_kvm_info.vcpu; vcpu->flags = IORESOURCE_MEM; vcpu->start = acpi_data.vcpu_base; vcpu->end = vcpu->start + ACPI_GICV2_VCPU_MEM_SIZE - 1; } gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis; gic_v3_kvm_info.has_v4_1 = gic_data.rdists.has_rvpeid; vgic_set_kvm_info(&gic_v3_kvm_info); } static struct fwnode_handle *gsi_domain_handle; static struct fwnode_handle *gic_v3_get_gsi_domain_id(u32 gsi) { return gsi_domain_handle; } static int __init gic_acpi_init(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_generic_distributor *dist; size_t size; int i, err; /* Get distributor base address */ dist = (struct acpi_madt_generic_distributor *)header; acpi_data.dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE); if (!acpi_data.dist_base) { pr_err("Unable to map GICD registers\n"); return -ENOMEM; } gic_request_region(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE, "GICD"); err = gic_validate_dist_version(acpi_data.dist_base); if (err) { pr_err("No distributor detected at @%p, giving up\n", acpi_data.dist_base); goto out_dist_unmap; } size = sizeof(*acpi_data.redist_regs) * acpi_data.nr_redist_regions; acpi_data.redist_regs = kzalloc(size, GFP_KERNEL); if (!acpi_data.redist_regs) { err = -ENOMEM; goto out_dist_unmap; } err = gic_acpi_collect_gicr_base(); if (err) goto out_redist_unmap; gsi_domain_handle = irq_domain_alloc_fwnode(&dist->base_address); if (!gsi_domain_handle) { err = -ENOMEM; goto out_redist_unmap; } err = gic_init_bases(dist->base_address, acpi_data.dist_base, acpi_data.redist_regs, acpi_data.nr_redist_regions, 0, gsi_domain_handle); if (err) goto out_fwhandle_free; acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, gic_v3_get_gsi_domain_id); if (static_branch_likely(&supports_deactivate_key)) gic_acpi_setup_kvm_info(); return 0; out_fwhandle_free: irq_domain_free_fwnode(gsi_domain_handle); out_redist_unmap: for (i = 0; i < acpi_data.nr_redist_regions; i++) if (acpi_data.redist_regs[i].redist_base) iounmap(acpi_data.redist_regs[i].redist_base); kfree(acpi_data.redist_regs); out_dist_unmap: iounmap(acpi_data.dist_base); return err; } IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V3, gic_acpi_init); IRQCHIP_ACPI_DECLARE(gic_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V4, gic_acpi_init); IRQCHIP_ACPI_DECLARE(gic_v3_or_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_NONE, gic_acpi_init); #endif
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 /* SPDX-License-Identifier: GPL-2.0 */ /* * A security context is a set of security attributes * associated with each subject and object controlled * by the security policy. Security contexts are * externally represented as variable-length strings * that can be interpreted by a user or application * with an understanding of the security policy. * Internally, the security server uses a simple * structure. This structure is private to the * security server and can be changed without affecting * clients of the security server. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ #ifndef _SS_CONTEXT_H_ #define _SS_CONTEXT_H_ #include "ebitmap.h" #include "mls_types.h" #include "security.h" /* * A security context consists of an authenticated user * identity, a role, a type and a MLS range. */ struct context { u32 user; u32 role; u32 type; u32 len; /* length of string in bytes */ struct mls_range range; char *str; /* string representation if context cannot be mapped. */ }; static inline void mls_context_init(struct context *c) { memset(&c->range, 0, sizeof(c->range)); } static inline int mls_context_cpy(struct context *dst, const struct context *src) { int rc; dst->range.level[0].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } /* * Sets both levels in the MLS range of 'dst' to the low level of 'src'. */ static inline int mls_context_cpy_low(struct context *dst, const struct context *src) { int rc; dst->range.level[0].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[0].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } /* * Sets both levels in the MLS range of 'dst' to the high level of 'src'. */ static inline int mls_context_cpy_high(struct context *dst, const struct context *src) { int rc; dst->range.level[0].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[1].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } static inline int mls_context_glblub(struct context *dst, const struct context *c1, const struct context *c2) { struct mls_range *dr = &dst->range; const struct mls_range *r1 = &c1->range, *r2 = &c2->range; int rc = 0; if (r1->level[1].sens < r2->level[0].sens || r2->level[1].sens < r1->level[0].sens) /* These ranges have no common sensitivities */ return -EINVAL; /* Take the greatest of the low */ dr->level[0].sens = max(r1->level[0].sens, r2->level[0].sens); /* Take the least of the high */ dr->level[1].sens = min(r1->level[1].sens, r2->level[1].sens); rc = ebitmap_and(&dr->level[0].cat, &r1->level[0].cat, &r2->level[0].cat); if (rc) goto out; rc = ebitmap_and(&dr->level[1].cat, &r1->level[1].cat, &r2->level[1].cat); if (rc) goto out; out: return rc; } static inline bool mls_context_equal(const struct context *c1, const struct context *c2) { return ((c1->range.level[0].sens == c2->range.level[0].sens) && ebitmap_equal(&c1->range.level[0].cat, &c2->range.level[0].cat) && (c1->range.level[1].sens == c2->range.level[1].sens) && ebitmap_equal(&c1->range.level[1].cat, &c2->range.level[1].cat)); } static inline void mls_context_destroy(struct context *c) { ebitmap_destroy(&c->range.level[0].cat); ebitmap_destroy(&c->range.level[1].cat); mls_context_init(c); } static inline void context_init(struct context *c) { memset(c, 0, sizeof(*c)); } static inline int context_cpy(struct context *dst, const struct context *src) { int rc; dst->user = src->user; dst->role = src->role; dst->type = src->type; if (src->str) { dst->str = kstrdup(src->str, GFP_ATOMIC); if (!dst->str) return -ENOMEM; dst->len = src->len; } else { dst->str = NULL; dst->len = 0; } rc = mls_context_cpy(dst, src); if (rc) { kfree(dst->str); dst->str = NULL; dst->len = 0; return rc; } return 0; } static inline void context_destroy(struct context *c) { c->user = c->role = c->type = 0; kfree(c->str); c->str = NULL; c->len = 0; mls_context_destroy(c); } static inline bool context_equal(const struct context *c1, const struct context *c2) { if (c1->len && c2->len) return (c1->len == c2->len && !strcmp(c1->str, c2->str)); if (c1->len || c2->len) return 0; return ((c1->user == c2->user) && (c1->role == c2->role) && (c1->type == c2->type) && mls_context_equal(c1, c2)); } u32 context_compute_hash(const struct context *c); #endif /* _SS_CONTEXT_H_ */
161 210 211 211 210 216 217 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #include <linux/dcache.h> #include <linux/fs.h> #include <linux/gfp.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/srcu.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" /* * Clear all of the marks on an inode when it is being evicted from core */ void __fsnotify_inode_delete(struct inode *inode) { fsnotify_clear_marks_by_inode(inode); } EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); void __fsnotify_vfsmount_delete(struct vfsmount *mnt) { fsnotify_clear_marks_by_mount(mnt); } void __fsnotify_mntns_delete(struct mnt_namespace *mntns) { fsnotify_clear_marks_by_mntns(mntns); } /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ static void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *iput_inode = NULL; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { spin_unlock(&inode->i_lock); continue; } /* * If i_count is zero, the inode cannot have any watches and * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. * However, we should have been called /after/ evict_inodes * removed all zero refcount inodes, in any case. Test to * be sure. */ if (!atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); /* for each watch, send FS_UNMOUNT and then remove it */ fsnotify_inode(inode, FS_UNMOUNT); fsnotify_inode_delete(inode); iput_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); } void fsnotify_sb_delete(struct super_block *sb) { struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); /* Were any marks ever added to any object on this sb? */ if (!sbinfo) return; fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ wait_var_event(fsnotify_sb_watched_objects(sb), !atomic_long_read(fsnotify_sb_watched_objects(sb))); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT)); WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT)); } void fsnotify_sb_free(struct super_block *sb) { kfree(sb->s_fsnotify_info); } /* * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. */ void fsnotify_set_children_dentry_flags(struct inode *inode) { struct dentry *alias; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { struct dentry *child; /* run all of the children of the original inode and fix their * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); hlist_for_each_entry(child, &alias->d_children, d_sib) { if (!child->d_inode) continue; spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); } /* * Lazily clear false positive PARENT_WATCHED flag for child whose parent had * stopped watching children. */ static void fsnotify_clear_child_dentry_flag(struct inode *pinode, struct dentry *dentry) { spin_lock(&dentry->d_lock); /* * d_lock is a sufficient barrier to prevent observing a non-watched * parent state from before the fsnotify_set_children_dentry_flags() * or fsnotify_update_flags() call that had set PARENT_WATCHED. */ if (!fsnotify_inode_watches_children(pinode)) dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&dentry->d_lock); } /* Are inode/sb/mount interested in parent and name info with this event? */ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = 0; /* We only send parent/name to inode/sb/mount for events on non-dir */ if (mask & FS_ISDIR) return false; /* * All events that are possible on child can also may be reported with * parent/name info to inode/sb/mount. Otherwise, a watching parent * could result in events reported with unexpected name info to sb/mount. */ BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT); /* Did either inode/sb/mount subscribe for events with parent/name? */ marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask( READ_ONCE(inode->i_sb->s_fsnotify_mask)); marks_mask |= fsnotify_parent_needed_mask(mnt_mask); /* Did they subscribe for this event with parent/name info? */ return mask & marks_mask; } /* Are there any inode/mount/sb objects that watch for these events? */ static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask | READ_ONCE(inode->i_sb->s_fsnotify_mask); return mask & marks_mask & ALL_FSNOTIFY_EVENTS; } /* Report pre-content event with optional range info */ int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count) { struct file_range range; /* Report page aligned range only when pos is known */ if (!ppos) return fsnotify_path(path, FS_PRE_ACCESS); range.path = path; range.pos = PAGE_ALIGN_DOWN(*ppos); range.count = PAGE_ALIGN(*ppos + count) - range.pos; return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range, FSNOTIFY_EVENT_FILE_RANGE); } /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with * parent and name info. * * Notify only the child without name info if parent is not watching and * inode/sb/mount are not interested in events with parent and name info. */ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { const struct path *path = fsnotify_data_path(data, data_type); __u32 mnt_mask = path ? READ_ONCE(real_mount(path->mnt)->mnt_fsnotify_mask) : 0; struct inode *inode = d_inode(dentry); struct dentry *parent; bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; bool parent_needed, parent_interested; __u32 p_mask; struct inode *p_inode = NULL; struct name_snapshot name; struct qstr *file_name = NULL; int ret = 0; /* Optimize the likely case of nobody watching this path */ if (likely(!parent_watched && !fsnotify_object_watched(inode, mnt_mask, mask))) return 0; parent = NULL; parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask); if (!parent_watched && !parent_needed) goto notify; /* Does parent inode care about events on children? */ parent = dget_parent(dentry); p_inode = parent->d_inode; p_mask = fsnotify_inode_watches_children(p_inode); if (unlikely(parent_watched && !p_mask)) fsnotify_clear_child_dentry_flag(p_inode, dentry); /* * Include parent/name in notification either if some notification * groups require parent info or the parent is interested in this event. */ parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS; if (parent_needed || parent_interested) { /* When notifying parent, child should be passed as data */ WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type)); /* Notify both parent and child with child name info */ take_dentry_name_snapshot(&name, dentry); file_name = &name.name; if (parent_interested) mask |= FS_EVENT_ON_CHILD; } notify: ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0); if (file_name) release_dentry_name_snapshot(&name); dput(parent); return ret; } EXPORT_SYMBOL_GPL(__fsnotify_parent); static int fsnotify_handle_inode_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct inode *inode = fsnotify_data_inode(data, data_type); const struct fsnotify_ops *ops = group->ops; if (WARN_ON_ONCE(!ops->handle_inode_event)) return 0; if (WARN_ON_ONCE(!inode && !dir)) return 0; if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; /* Check interest of this mark in case event was sent with two marks */ if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)) return 0; return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); } static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info); int ret; if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; /* * For FS_RENAME, 'dir' is old dir and 'data' is new dentry. * The only ->handle_inode_event() backend that supports FS_RENAME is * dnotify, where it means file was renamed within same parent. */ if (mask & FS_RENAME) { struct dentry *moved = fsnotify_data_dentry(data, data_type); if (dir != moved->d_parent->d_inode) return 0; } if (parent_mark) { ret = fsnotify_handle_inode_event(group, parent_mark, mask, data, data_type, dir, name, 0); if (ret) return ret; } if (!inode_mark) return 0; /* * Some events can be sent on both parent dir and child marks (e.g. * FS_ATTRIB). If both parent dir and child are watching, report the * event once to parent dir with name (if interested) and once to child * without name (if interested). * * In any case regardless whether the parent is watching or not, the * child watcher is expecting an event without the FS_EVENT_ON_CHILD * flag. The file name is expected if and only if this is a directory * event. */ mask &= ~FS_EVENT_ON_CHILD; if (!(mask & ALL_FSNOTIFY_DIRENT_EVENTS)) { dir = NULL; name = NULL; } return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, dir, name, cookie); } static int send_to_group(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; __u32 marks_ignore_mask = 0; bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; if (!iter_info->report_mask) return 0; /* clear ignored on inode modification */ if (mask & FS_MODIFY) { fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mark->ignore_mask = 0; } } /* Are any of the group marks interested in this event? */ fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; marks_ignore_mask |= fsnotify_effective_ignore_mask(mark, is_dir, type); } pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { return group->ops->handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } return fsnotify_handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp) { struct fsnotify_mark_connector *conn; struct hlist_node *node = NULL; conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (conn) node = srcu_dereference(conn->list.first, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) { struct hlist_node *node = NULL; if (mark) node = srcu_dereference(mark->obj_list.next, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } /* * iter_info is a multi head priority queue of marks. * Pick a subset of marks from queue heads, all with the same group * and set the report_mask to a subset of the selected marks. * Returns false if there are no more groups to iterate. */ static bool fsnotify_iter_select_report_types( struct fsnotify_iter_info *iter_info) { struct fsnotify_group *max_prio_group = NULL; struct fsnotify_mark *mark; int type; /* Choose max prio group among groups of all queue heads */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && fsnotify_compare_groups(max_prio_group, mark->group) > 0) max_prio_group = mark->group; } if (!max_prio_group) return false; /* Set the report mask for marks from same group as max prio group */ iter_info->current_group = max_prio_group; iter_info->report_mask = 0; fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) { /* * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode * is watching children and interested in this event, * which is an event possible on child. * But is *this mark* watching children? */ if (type == FSNOTIFY_ITER_TYPE_PARENT && !(mark->mask & FS_EVENT_ON_CHILD) && !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); } } return true; } /* * Pop from iter_info multi head queue, the marks that belong to the group of * current iteration step. */ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *mark; int type; /* * We cannot use fsnotify_foreach_iter_mark_type() here because we * may need to advance a mark of type X that belongs to current_group * but was not selected for reporting. */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) iter_info->marks[type] = fsnotify_next_mark(iter_info->marks[type]); } } /* * fsnotify - This is the main call to fsnotify. * * The VFS calls into hook specific functions in linux/fsnotify.h. * Those functions then in turn call here. Here will call out to all of the * registered fsnotify_group. Those groups can then use the notification event * in whatever means they feel necessary. * * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @inode: optional inode associated with event - * If @dir and @inode are both non-NULL, event may be * reported to both. * @cookie: inotify rename cookie */ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, struct inode *inode, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL; struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; struct dentry *moved; int inode2_type; int ret = 0; __u32 test_mask, marks_mask = 0; if (path) mnt = real_mount(path->mnt); if (!inode) { /* Dirent event - report on TYPE_INODE to dir */ inode = dir; /* For FS_RENAME, inode is old_dir and inode2 is new_dir */ if (mask & FS_RENAME) { moved = fsnotify_data_dentry(data, data_type); inode2 = moved->d_parent->d_inode; inode2_type = FSNOTIFY_ITER_TYPE_INODE2; } } else if (mask & FS_EVENT_ON_CHILD) { /* * Event on child - report on TYPE_PARENT to dir if it is * watching children and on TYPE_INODE to child. */ inode2 = dir; inode2_type = FSNOTIFY_ITER_TYPE_PARENT; } /* * Optimization: srcu_read_lock() has a memory barrier which can * be expensive. It protects walking the *_fsnotify_marks lists. * However, if we do not walk the lists, we do not have to do * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!inode2 || !inode2->i_fsnotify_marks) && (!mnt_data || !mnt_data->ns->n_fsnotify_marks)) return 0; if (sb) marks_mask |= READ_ONCE(sb->s_fsnotify_mask); if (mnt) marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask); if (inode) marks_mask |= READ_ONCE(inode->i_fsnotify_mask); if (inode2) marks_mask |= READ_ONCE(inode2->i_fsnotify_mask); if (mnt_data) marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask); /* * If this is a modify event we may need to clear some ignore masks. * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. */ test_mask = (mask & ALL_FSNOTIFY_EVENTS); if (!(test_mask & marks_mask)) return 0; iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); if (sbinfo) { iter_info.marks[FSNOTIFY_ITER_TYPE_SB] = fsnotify_first_mark(&sbinfo->sb_marks); } if (mnt) { iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); } if (inode) { iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] = fsnotify_first_mark(&inode->i_fsnotify_marks); } if (inode2) { iter_info.marks[inode2_type] = fsnotify_first_mark(&inode2->i_fsnotify_marks); } if (mnt_data) { iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] = fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks); } /* * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark * ignore masks are properly reflected for mount/sb mark notifications. * That's why this traversal is so complicated... */ while (fsnotify_iter_select_report_types(&iter_info)) { ret = send_to_group(mask, data, data_type, dir, file_name, cookie, &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; fsnotify_iter_next(&iter_info); } ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx); return ret; } EXPORT_SYMBOL_GPL(fsnotify); #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * At open time we check fsnotify_sb_has_priority_watchers() and set the * FMODE_NONOTIFY_ mode bits accordignly. * Later, fsnotify permission hooks do not check if there are permission event * watches, but that there were permission event watches at open time. */ void file_set_fsnotify_mode_from_watchers(struct file *file) { struct dentry *dentry = file->f_path.dentry, *parent; struct super_block *sb = dentry->d_sb; __u32 mnt_mask, p_mask; /* Is it a file opened by fanotify? */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) return; /* * Permission events is a super set of pre-content events, so if there * are no permission event watchers, there are also no pre-content event * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit. */ if (likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT))) { file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return; } /* * If there are permission event watchers but no pre-content event * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. */ if ((!d_is_dir(dentry) && !d_is_reg(dentry)) || likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT))) { file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); return; } /* * OK, there are some pre-content watchers. Check if anybody is * watching for pre-content events on *this* file. */ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask, FSNOTIFY_PRE_CONTENT_EVENTS))) { /* Enable pre-content events */ file_set_fsnotify_mode(file, 0); return; } /* Is parent watching for pre-content events on this file? */ if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { parent = dget_parent(dentry); p_mask = fsnotify_inode_watches_children(d_inode(parent)); dput(parent); if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) { /* Enable pre-content events */ file_set_fsnotify_mode(file, 0); return; } } /* Nobody watching for pre-content events from this file */ file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); } #endif void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) { struct fsnotify_mnt data = { .ns = ns, .mnt_id = real_mount(mnt)->mnt_id_unique, }; if (WARN_ON_ONCE(!ns)) return; /* * This is an optimization as well as making sure fsnotify_init() has * been called. */ if (!ns->n_fsnotify_marks) return; fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0); } static __init int fsnotify_init(void) { int ret; BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) panic("initializing fsnotify_mark_srcu"); fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, SLAB_PANIC); return 0; } core_initcall(fsnotify_init);
7 5 2 2 1 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The Internet Protocol (IP) module. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * * Fixes: * Alan Cox : Commented a couple of minor bits of surplus code * Alan Cox : Undefining IP_FORWARD doesn't include the code * (just stops a compiler warning). * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes * are junked rather than corrupting things. * Alan Cox : Frames to bad broadcast subnets are dumped * We used to process them non broadcast and * boy could that cause havoc. * Alan Cox : ip_forward sets the free flag on the * new frame it queues. Still crap because * it copies the frame but at least it * doesn't eat memory too. * Alan Cox : Generic queue code and memory fixes. * Fred Van Kempen : IP fragment support (borrowed from NET2E) * Gerhard Koerting: Forward fragmented frames correctly. * Gerhard Koerting: Fixes to my fix of the above 8-). * Gerhard Koerting: IP interface addressing fix. * Linus Torvalds : More robustness checks * Alan Cox : Even more checks: Still not as robust as it ought to be * Alan Cox : Save IP header pointer for later * Alan Cox : ip option setting * Alan Cox : Use ip_tos/ip_ttl settings * Alan Cox : Fragmentation bogosity removed * (Thanks to Mark.Bush@prg.ox.ac.uk) * Dmitry Gorodchanin : Send of a raw packet crash fix. * Alan Cox : Silly ip bug when an overlength * fragment turns up. Now frees the * queue. * Linus Torvalds/ : Memory leakage on fragmentation * Alan Cox : handling. * Gerhard Koerting: Forwarding uses IP priority hints * Teemu Rantanen : Fragment problems. * Alan Cox : General cleanup, comments and reformat * Alan Cox : SNMP statistics * Alan Cox : BSD address rule semantics. Also see * UDP as there is a nasty checksum issue * if you do things the wrong way. * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file * Alan Cox : IP options adjust sk->priority. * Pedro Roque : Fix mtu/length error in ip_forward. * Alan Cox : Avoid ip_chk_addr when possible. * Richard Underwood : IP multicasting. * Alan Cox : Cleaned up multicast handlers. * Alan Cox : RAW sockets demultiplex in the BSD style. * Gunther Mayer : Fix the SNMP reporting typo * Alan Cox : Always in group 224.0.0.1 * Pauline Middelink : Fast ip_checksum update when forwarding * Masquerading support. * Alan Cox : Multicast loopback error for 224.0.0.1 * Alan Cox : IP_MULTICAST_LOOP option. * Alan Cox : Use notifiers. * Bjorn Ekwall : Removed ip_csum (from slhc.c too) * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) * Stefan Becker : Send out ICMP HOST REDIRECT * Arnt Gulbrandsen : ip_build_xmit * Alan Cox : Per socket routing cache * Alan Cox : Fixed routing cache, added header cache. * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. * Alan Cox : Incoming IP option handling. * Alan Cox : Set saddr on raw output frames as per BSD. * Alan Cox : Stopped broadcast source route explosions. * Alan Cox : Can disable source routing * Takeshi Sone : Masquerading didn't work. * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. * Alan Cox : Memory leaks, tramples, misc debugging. * Alan Cox : Fixed multicast (by popular demand 8)) * Alan Cox : Fixed forwarding (by even more popular demand 8)) * Alan Cox : Fixed SNMP statistics [I think] * Gerhard Koerting : IP fragmentation forwarding fix * Alan Cox : Device lock against page fault. * Alan Cox : IP_HDRINCL facility. * Werner Almesberger : Zero fragment bug * Alan Cox : RAW IP frame length bug * Alan Cox : Outgoing firewall on build_xmit * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel * Alan Cox : Multicast routing hooks * Jos Vos : Do accounting *before* call_in_firewall * Willy Konynenberg : Transparent proxying support * * To Fix: * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient * and could be made very efficient with the addition of some virtual memory hacks to permit * the allocation of a buffer that can then be 'grown' by twiddling page tables. * Output fragmentation wants updating along with the buffer management to use a single * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause * fragmentation anyway. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/indirect_call_wrapper.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> #include <net/icmp.h> #include <net/raw.h> #include <net/checksum.h> #include <net/inet_ecn.h> #include <linux/netfilter_ipv4.h> #include <net/xfrm.h> #include <linux/mroute.h> #include <linux/netlink.h> #include <net/dst_metadata.h> /* * Process Router Attention IP option (RFC 2113) */ bool ip_call_ra_chain(struct sk_buff *skb) { struct ip_ra_chain *ra; u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; struct net *net = dev_net(dev); for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report * the packet if it came from that interface. */ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) raw_rcv(last, skb2); } last = sk; } } if (last) { raw_rcv(last, skb); return true; } return false; } INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *)); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { const struct net_protocol *ipprot; int raw, ret; resubmit: raw = raw_local_deliver(skb, protocol); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot) { if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return; } nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); if (ret < 0) { protocol = -ret; goto resubmit; } __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO); } else { __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } } static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_delivery_time(skb); __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; } /* * Deliver IP Packets to the higher protocol layers. */ int ip_local_deliver(struct sk_buff *skb) { /* * Reassemble IP fragments. */ struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } EXPORT_SYMBOL(ip_local_deliver); static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt; const struct iphdr *iph; /* It looks as overkill, because not all IP options require packet mangling. But it is the easiest for now, especially taking into account that combination of IP options and running sniffer is extremely rare condition. --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); opt = &(IPCB(skb)->opt); opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); goto drop; } if (unlikely(opt->srr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev) { if (!IN_DEV_SOURCE_ROUTE(in_dev)) { if (IN_DEV_LOG_MARTIANS(in_dev)) net_info_ratelimited("source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); goto drop; } } if (ip_options_rcv_srr(skb, dev)) goto drop; } return false; drop: return true; } static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, const struct sk_buff *hint) { return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && ip_hdr(hint)->tos == iph->tos; } int tcp_v4_early_demux(struct sk_buff *skb); int udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); int err, drop_reason; struct rtable *rt; if (ip_can_use_hint(skb, iph, hint)) { drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev, hint); if (unlikely(drop_reason)) goto drop_error; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_TCP: if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { tcp_v4_early_demux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; case IPPROTO_UDP: if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { err = udp_v4_early_demux(skb); if (unlikely(err)) goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; } } /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); if (unlikely(drop_reason)) goto drop_error; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; } #ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; st[(idx>>16)&0xFF].i_bytes += skb->len; } #endif if (iph->ihl > 5 && ip_rcv_options(skb, dev)) goto drop; rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * * When a host sends a datagram to a link-layer broadcast * address, the IP destination address MUST be a legal IP * broadcast or IP multicast address. * * A host SHOULD silently discard a datagram that is received * via a link-layer broadcast (see Section 2.4) but does not * specify an IP multicast or broadcast destination address. * * This doesn't explicitly say L2 *broadcast*, but broadcast is * in a way a form of multicast and the most common use case for * this is 802.11 protecting against cross-station spoofing (the * so-called "hole-196" attack) so do it for both. */ if (in_dev && IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) { drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST; goto drop; } } return NET_RX_SUCCESS; drop: kfree_skb_reason(skb, drop_reason); return NET_RX_DROP; drop_error: if (drop_reason == SKB_DROP_REASON_IP_RPFILTER) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); goto drop; } static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) return NET_RX_SUCCESS; ret = ip_rcv_finish_core(net, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; } /* * Main IP Receive routine. */ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; int drop_reason; u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) { dev_core_stats_rx_otherhost_dropped_inc(skb->dev); drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; } __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto out; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; iph = ip_hdr(skb); /* * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. * * Is the datagram acceptable? * * 1. Length at least the size of an ip header * 2. Version of 4 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] * 4. Doesn't have a bogus length */ if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); __IP_ADD_STATS(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; len = iph_totlen(skb, iph); if (skb->len < len) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ if (!skb_sk_is_prefetched(skb)) skb_orphan(skb); return skb; csum_error: drop_reason = SKB_DROP_REASON_IP_CSUM; __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED) drop_reason = SKB_DROP_REASON_IP_INHDR; __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb_reason(skb, drop_reason); out: return NULL; } /* * IP receive entry point */ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net(dev); skb = ip_rcv_core(skb, net); if (skb == NULL) return NET_RX_DROP; return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish); } static void ip_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); dst_input(skb); } } static struct sk_buff *ip_extract_route_hint(const struct net *net, struct sk_buff *skb, int rt_type) { if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST || IPCB(skb)->flags & IPSKB_MULTIPATH) return NULL; return skb; } static void ip_list_rcv_finish(struct net *net, struct list_head *head) { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct dst_entry *dst; skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) continue; if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { hint = ip_extract_route_hint(net, skb, dst_rtable(dst)->rt_type); /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dst = dst; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ ip_sublist_rcv_finish(&sublist); } static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, struct net *net) { NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip_rcv_finish); ip_list_rcv_finish(net, head); } /* Receive a list of IP packets */ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev) { struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); skb_list_del_init(skb); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; if (curr_dev != dev || curr_net != net) { /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dev = dev; curr_net = net; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); }
319 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_BITOPS_BUILTIN_FLS_H_ #define _ASM_GENERIC_BITOPS_BUILTIN_FLS_H_ /** * fls - find last (most-significant) bit set * @x: the word to search * * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ static __always_inline int fls(unsigned int x) { return x ? sizeof(x) * 8 - __builtin_clz(x) : 0; } #endif
109 109 109 52 52 86 86 86 103 103 51 51 51 52 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 // SPDX-License-Identifier: GPL-2.0 /* * Implementation of the diskquota system for the LINUX operating system. QUOTA * is implemented using the BSD system call interface as the means of * communication with the user level. This file contains the generic routines * called by the different filesystems on allocation of an inode or block. * These routines take care of the administration needed to have a consistent * diskquota tracking system. The ideas of both user and group quotas are based * on the Melbourne quota system as used on BSD derived systems. The internal * implementation is based on one of the several variants of the LINUX * inode-subsystem with added complexity of the diskquota system. * * Author: Marco van Wieringen <mvw@planets.elm.net> * * Fixes: Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96 * * Revised list management to avoid races * -- Bill Hawes, <whawes@star.net>, 9/98 * * Fixed races in dquot_transfer(), dqget() and dquot_alloc_...(). * As the consequence the locking was moved from dquot_decr_...(), * dquot_incr_...() to calling functions. * invalidate_dquots() now writes modified dquots. * Serialized quota_off() and quota_on() for mount point. * Fixed a few bugs in grow_dquots(). * Fixed deadlock in write_dquot() - we no longer account quotas on * quota files * remove_dquot_ref() moved to inode.c - it now traverses through inodes * add_dquot_ref() restarts after blocking * Added check for bogus uid and fixed check for group in quotactl. * Jan Kara, <jack@suse.cz>, sponsored by SuSE CR, 10-11/99 * * Used struct list_head instead of own list struct * Invalidation of referenced dquots is no longer possible * Improved free_dquots list management * Quota and i_blocks are now updated in one place to avoid races * Warnings are now delayed so we won't block in critical section * Write updated not to require dquot lock * Jan Kara, <jack@suse.cz>, 9/2000 * * Added dynamic quota structure allocation * Jan Kara <jack@suse.cz> 12/2000 * * Rewritten quota interface. Implemented new quota format and * formats registering. * Jan Kara, <jack@suse.cz>, 2001,2002 * * New SMP locking. * Jan Kara, <jack@suse.cz>, 10/2002 * * Added journalled quota support, fix lock inversion problems * Jan Kara, <jack@suse.cz>, 2003,2004 * * (C) Copyright 1994 - 1997 Marco van Wieringen */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/mm.h> #include <linux/time.h> #include <linux/types.h> #include <linux/string.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/tty.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/init.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/kmod.h> #include <linux/namei.h> #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/blkdev.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> /* * There are five quota SMP locks: * * dq_list_lock protects all lists with quotas and quota formats. * * dquot->dq_dqb_lock protects data from dq_dqb * * inode->i_lock protects inode->i_blocks, i_bytes and also guards * consistency of dquot->dq_dqb with inode->i_blocks, i_bytes so that * dquot_transfer() can stabilize amount it transfers * * dq_data_lock protects mem_dqinfo structures and modifications of dquot * pointers in the inode * * dq_state_lock protects modifications of quota state (on quotaon and * quotaoff) and readers who care about latest values take it as well. * * The spinlock ordering is hence: * dq_data_lock > dq_list_lock > i_lock > dquot->dq_dqb_lock, * dq_list_lock > dq_state_lock * * Note that some things (eg. sb pointer, type, id) doesn't change during * the life of the dquot structure and so needn't to be protected by a lock * * Operation accessing dquots via inode pointers are protected by dquot_srcu. * Operation of reading pointer needs srcu_read_lock(&dquot_srcu), and * synchronize_srcu(&dquot_srcu) is called after clearing pointers from * inode and before dropping dquot references to avoid use of dquots after * they are freed. dq_data_lock is used to serialize the pointer setting and * clearing operations. * Special care needs to be taken about S_NOQUOTA inode flag (marking that * inode is a quota file). Functions adding pointers from inode to dquots have * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they * have to do all pointer modifications before dropping dq_data_lock. This makes * sure they cannot race with quotaon which first sets S_NOQUOTA flag and * then drops all pointers to dquots from an inode. * * Each dquot has its dq_lock mutex. Dquot is locked when it is being read to * memory (or space for it is being allocated) on the first dqget(), when it is * being written out, and when it is being released on the last dqput(). The * allocation and release operations are serialized by the dq_lock and by * checking the use count in dquot_release(). * * Lock ordering (including related VFS locks) is the following: * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_sem */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); DEFINE_STATIC_SRCU(dquot_srcu); static DECLARE_WAIT_QUEUE_HEAD(dquot_ref_wq); void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...) { if (printk_ratelimit()) { va_list args; struct va_format vaf; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_ERR "Quota error (device %s): %s: %pV\n", sb->s_id, func, &vaf); va_end(args); } } EXPORT_SYMBOL(__quota_error); #if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING) static char *quotatypes[] = INITQFNAMES; #endif static struct quota_format_type *quota_formats; /* List of registered formats */ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; /* SLAB cache for dquot structures */ static struct kmem_cache *dquot_cachep; void register_quota_format(struct quota_format_type *fmt) { spin_lock(&dq_list_lock); fmt->qf_next = quota_formats; quota_formats = fmt; spin_unlock(&dq_list_lock); } EXPORT_SYMBOL(register_quota_format); void unregister_quota_format(struct quota_format_type *fmt) { struct quota_format_type **actqf; spin_lock(&dq_list_lock); for (actqf = &quota_formats; *actqf && *actqf != fmt; actqf = &(*actqf)->qf_next) ; if (*actqf) *actqf = (*actqf)->qf_next; spin_unlock(&dq_list_lock); } EXPORT_SYMBOL(unregister_quota_format); static struct quota_format_type *find_quota_format(int id) { struct quota_format_type *actqf; spin_lock(&dq_list_lock); for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next) ; if (!actqf || !try_module_get(actqf->qf_owner)) { int qm; spin_unlock(&dq_list_lock); for (qm = 0; module_names[qm].qm_fmt_id && module_names[qm].qm_fmt_id != id; qm++) ; if (!module_names[qm].qm_fmt_id || request_module(module_names[qm].qm_mod_name)) return NULL; spin_lock(&dq_list_lock); for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next) ; if (actqf && !try_module_get(actqf->qf_owner)) actqf = NULL; } spin_unlock(&dq_list_lock); return actqf; } static void put_quota_format(struct quota_format_type *fmt) { module_put(fmt->qf_owner); } /* * Dquot List Management: * The quota code uses five lists for dquot management: the inuse_list, * releasing_dquots, free_dquots, dqi_dirty_list, and dquot_hash[] array. * A single dquot structure may be on some of those lists, depending on * its current state. * * All dquots are placed to the end of inuse_list when first created, and this * list is used for invalidate operation, which must look at every dquot. * * When the last reference of a dquot is dropped, the dquot is added to * releasing_dquots. We'll then queue work item which will call * synchronize_srcu() and after that perform the final cleanup of all the * dquots on the list. Each cleaned up dquot is moved to free_dquots list. * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot * struct. * * Unused and cleaned up dquots are in the free_dquots list and this list is * searched whenever we need an available dquot. Dquots are removed from the * list as soon as they are used again and dqstats.free_dquots gives the number * of dquots on the list. When dquot is invalidated it's completely released * from memory. * * Dirty dquots are added to the dqi_dirty_list of quota_info when mark * dirtied, and this list is searched when writing dirty dquots back to * quota file. Note that some filesystems do dirty dquot tracking on their * own (e.g. in a journal) and thus don't use dqi_dirty_list. * * Dquots with a specific identity (device, type and id) are placed on * one of the dquot_hash[] hash chains. The provides an efficient search * mechanism to locate a specific dquot. */ static LIST_HEAD(inuse_list); static LIST_HEAD(free_dquots); static LIST_HEAD(releasing_dquots); static unsigned int dq_hash_bits, dq_hash_mask; static struct hlist_head *dquot_hash; struct dqstats dqstats; EXPORT_SYMBOL(dqstats); static qsize_t inode_get_rsv_space(struct inode *inode); static qsize_t __inode_get_rsv_space(struct inode *inode); static int __dquot_initialize(struct inode *inode, int type); static void quota_release_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(quota_release_work, quota_release_workfn); static inline unsigned int hashfn(const struct super_block *sb, struct kqid qid) { unsigned int id = from_kqid(&init_user_ns, qid); int type = qid.type; unsigned long tmp; tmp = (((unsigned long)sb>>L1_CACHE_SHIFT) ^ id) * (MAXQUOTAS - type); return (tmp + (tmp >> dq_hash_bits)) & dq_hash_mask; } /* * Following list functions expect dq_list_lock to be held */ static inline void insert_dquot_hash(struct dquot *dquot) { struct hlist_head *head; head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id); hlist_add_head(&dquot->dq_hash, head); } static inline void remove_dquot_hash(struct dquot *dquot) { hlist_del_init(&dquot->dq_hash); } static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, struct kqid qid) { struct dquot *dquot; hlist_for_each_entry(dquot, dquot_hash+hashent, dq_hash) if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid)) return dquot; return NULL; } /* Add a dquot to the tail of the free list */ static inline void put_dquot_last(struct dquot *dquot) { list_add_tail(&dquot->dq_free, &free_dquots); dqstats_inc(DQST_FREE_DQUOTS); } static inline void put_releasing_dquots(struct dquot *dquot) { list_add_tail(&dquot->dq_free, &releasing_dquots); set_bit(DQ_RELEASING_B, &dquot->dq_flags); } static inline void remove_free_dquot(struct dquot *dquot) { if (list_empty(&dquot->dq_free)) return; list_del_init(&dquot->dq_free); if (!test_bit(DQ_RELEASING_B, &dquot->dq_flags)) dqstats_dec(DQST_FREE_DQUOTS); else clear_bit(DQ_RELEASING_B, &dquot->dq_flags); } static inline void put_inuse(struct dquot *dquot) { /* We add to the back of inuse list so we don't have to restart * when traversing this list and we block */ list_add_tail(&dquot->dq_inuse, &inuse_list); dqstats_inc(DQST_ALLOC_DQUOTS); } static inline void remove_inuse(struct dquot *dquot) { dqstats_dec(DQST_ALLOC_DQUOTS); list_del(&dquot->dq_inuse); } /* * End of list functions needing dq_list_lock */ static void wait_on_dquot(struct dquot *dquot) { mutex_lock(&dquot->dq_lock); mutex_unlock(&dquot->dq_lock); } static inline int dquot_active(struct dquot *dquot) { return test_bit(DQ_ACTIVE_B, &dquot->dq_flags); } static inline int dquot_dirty(struct dquot *dquot) { return test_bit(DQ_MOD_B, &dquot->dq_flags); } static inline int mark_dquot_dirty(struct dquot *dquot) { return dquot->dq_sb->dq_op->mark_dirty(dquot); } /* Mark dquot dirty in atomic manner, and return it's old dirty flag state */ int dquot_mark_dquot_dirty(struct dquot *dquot) { int ret = 1; if (!dquot_active(dquot)) return 0; if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags); /* If quota is dirty already, we don't have to acquire dq_list_lock */ if (dquot_dirty(dquot)) return 1; spin_lock(&dq_list_lock); if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) { list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> info[dquot->dq_id.type].dqi_dirty_list); ret = 0; } spin_unlock(&dq_list_lock); return ret; } EXPORT_SYMBOL(dquot_mark_dquot_dirty); /* Dirtify all the dquots - this can block when journalling */ static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots) { int ret, err, cnt; struct dquot *dquot; ret = err = 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (dquot) /* Even in case of error we have to continue */ ret = mark_dquot_dirty(dquot); if (!err && ret < 0) err = ret; } return err; } static inline void dqput_all(struct dquot **dquot) { unsigned int cnt; for (cnt = 0; cnt < MAXQUOTAS; cnt++) dqput(dquot[cnt]); } static inline int clear_dquot_dirty(struct dquot *dquot) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) return test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags); spin_lock(&dq_list_lock); if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); return 0; } list_del_init(&dquot->dq_dirty); spin_unlock(&dq_list_lock); return 1; } void mark_info_dirty(struct super_block *sb, int type) { spin_lock(&dq_data_lock); sb_dqopt(sb)->info[type].dqi_flags |= DQF_INFO_DIRTY; spin_unlock(&dq_data_lock); } EXPORT_SYMBOL(mark_info_dirty); /* * Read dquot from disk and alloc space for it */ int dquot_acquire(struct dquot *dquot) { int ret = 0, ret2 = 0; unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); memalloc = memalloc_nofs_save(); if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); if (ret < 0) goto out_iolock; } /* Make sure flags update is visible after dquot has been filled */ smp_mb__before_atomic(); set_bit(DQ_READ_B, &dquot->dq_flags); /* Instantiate dquot if needed */ if (!dquot_active(dquot) && !dquot->dq_off) { ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); /* Write the info if needed */ if (info_dirty(&dqopt->info[dquot->dq_id.type])) { ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info( dquot->dq_sb, dquot->dq_id.type); } if (ret < 0) goto out_iolock; if (ret2 < 0) { ret = ret2; goto out_iolock; } } /* * Make sure flags update is visible after on-disk struct has been * allocated. Paired with smp_rmb() in dqget(). */ smp_mb__before_atomic(); set_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_iolock: memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } EXPORT_SYMBOL(dquot_acquire); /* * Write dquot to disk */ int dquot_commit(struct dquot *dquot) { int ret = 0; unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); memalloc = memalloc_nofs_save(); if (!clear_dquot_dirty(dquot)) goto out_lock; /* Inactive dquot can be only if there was error during read/init * => we have better not writing it */ if (dquot_active(dquot)) ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); else ret = -EIO; out_lock: memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } EXPORT_SYMBOL(dquot_commit); /* * Release dquot */ int dquot_release(struct dquot *dquot) { int ret = 0, ret2 = 0; unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); memalloc = memalloc_nofs_save(); /* Check whether we are not racing with some other dqget() */ if (dquot_is_busy(dquot)) goto out_dqlock; if (dqopt->ops[dquot->dq_id.type]->release_dqblk) { ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot); /* Write the info */ if (info_dirty(&dqopt->info[dquot->dq_id.type])) { ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info( dquot->dq_sb, dquot->dq_id.type); } if (ret >= 0) ret = ret2; } clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_dqlock: memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } EXPORT_SYMBOL(dquot_release); void dquot_destroy(struct dquot *dquot) { kmem_cache_free(dquot_cachep, dquot); } EXPORT_SYMBOL(dquot_destroy); static inline void do_destroy_dquot(struct dquot *dquot) { dquot->dq_sb->dq_op->destroy_dquot(dquot); } /* Invalidate all dquots on the list. Note that this function is called after * quota is disabled and pointers from inodes removed so there cannot be new * quota users. There can still be some users of quotas due to inodes being * just deleted or pruned by prune_icache() (those are not attached to any * list) or parallel quotactl call. We have to wait for such users. */ static void invalidate_dquots(struct super_block *sb, int type) { struct dquot *dquot, *tmp; restart: flush_delayed_work(&quota_release_work); spin_lock(&dq_list_lock); list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) { if (dquot->dq_sb != sb) continue; if (dquot->dq_id.type != type) continue; /* Wait for dquot users */ if (atomic_read(&dquot->dq_count)) { atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); /* * Once dqput() wakes us up, we know it's time to free * the dquot. * IMPORTANT: we rely on the fact that there is always * at most one process waiting for dquot to free. * Otherwise dq_count would be > 1 and we would never * wake up. */ wait_event(dquot_ref_wq, atomic_read(&dquot->dq_count) == 1); dqput(dquot); /* At this moment dquot() need not exist (it could be * reclaimed by prune_dqcache(). Hence we must * restart. */ goto restart; } /* * The last user already dropped its reference but dquot didn't * get fully cleaned up yet. Restart the scan which flushes the * work cleaning up released dquots. */ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); goto restart; } /* * Quota now has no users and it has been written on last * dqput() */ remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); } spin_unlock(&dq_list_lock); } /* Call callback for every active dquot on given filesystem */ int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), unsigned long priv) { struct dquot *dquot, *old_dquot = NULL; int ret = 0; WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); spin_lock(&dq_list_lock); list_for_each_entry(dquot, &inuse_list, dq_inuse) { if (!dquot_active(dquot)) continue; if (dquot->dq_sb != sb) continue; /* Now we have active dquot so we can just increase use count */ atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); dqput(old_dquot); old_dquot = dquot; /* * ->release_dquot() can be racing with us. Our reference * protects us from new calls to it so just wait for any * outstanding call and recheck the DQ_ACTIVE_B after that. */ wait_on_dquot(dquot); if (dquot_active(dquot)) { ret = fn(dquot, priv); if (ret < 0) goto out; } spin_lock(&dq_list_lock); /* We are safe to continue now because our dquot could not * be moved out of the inuse list while we hold the reference */ } spin_unlock(&dq_list_lock); out: dqput(old_dquot); return ret; } EXPORT_SYMBOL(dquot_scan_active); static inline int dquot_write_dquot(struct dquot *dquot) { int ret = dquot->dq_sb->dq_op->write_dquot(dquot); if (ret < 0) { quota_error(dquot->dq_sb, "Can't write quota structure " "(error %d). Quota may get out of sync!", ret); /* Clear dirty bit anyway to avoid infinite loop. */ clear_dquot_dirty(dquot); } return ret; } /* Write all dquot structures to quota files */ int dquot_writeback_dquots(struct super_block *sb, int type) { struct list_head dirty; struct dquot *dquot; struct quota_info *dqopt = sb_dqopt(sb); int cnt; int err, ret = 0; WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); flush_delayed_work(&quota_release_work); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; spin_lock(&dq_list_lock); /* Move list away to avoid livelock. */ list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty); while (!list_empty(&dirty)) { dquot = list_first_entry(&dirty, struct dquot, dq_dirty); WARN_ON(!dquot_active(dquot)); /* If the dquot is releasing we should not touch it */ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); flush_delayed_work(&quota_release_work); spin_lock(&dq_list_lock); continue; } /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ dqgrab(dquot); spin_unlock(&dq_list_lock); err = dquot_write_dquot(dquot); if (err && !ret) ret = err; dqput(dquot); spin_lock(&dq_list_lock); } spin_unlock(&dq_list_lock); } for (cnt = 0; cnt < MAXQUOTAS; cnt++) if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) && info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); dqstats_inc(DQST_SYNCS); return ret; } EXPORT_SYMBOL(dquot_writeback_dquots); /* Write all dquot structures to disk and make them visible from userspace */ int dquot_quota_sync(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); int cnt; int ret; ret = dquot_writeback_dquots(sb, type); if (ret) return ret; if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) return 0; /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... */ if (sb->s_op->sync_fs) { ret = sb->s_op->sync_fs(sb, 1); if (ret) return ret; } ret = sync_blockdev(sb->s_bdev); if (ret) return ret; /* * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); } return 0; } EXPORT_SYMBOL(dquot_quota_sync); static unsigned long dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct dquot *dquot; unsigned long freed = 0; spin_lock(&dq_list_lock); while (!list_empty(&free_dquots) && sc->nr_to_scan) { dquot = list_first_entry(&free_dquots, struct dquot, dq_free); remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); sc->nr_to_scan--; freed++; } spin_unlock(&dq_list_lock); return freed; } static unsigned long dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return vfs_pressure_ratio( percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])); } /* * Safely release dquot and put reference to dquot. */ static void quota_release_workfn(struct work_struct *work) { struct dquot *dquot; struct list_head rls_head; spin_lock(&dq_list_lock); /* Exchange the list head to avoid livelock. */ list_replace_init(&releasing_dquots, &rls_head); spin_unlock(&dq_list_lock); synchronize_srcu(&dquot_srcu); restart: spin_lock(&dq_list_lock); while (!list_empty(&rls_head)) { dquot = list_first_entry(&rls_head, struct dquot, dq_free); WARN_ON_ONCE(atomic_read(&dquot->dq_count)); /* * Note that DQ_RELEASING_B protects us from racing with * invalidate_dquots() calls so we are safe to work with the * dquot even after we drop dq_list_lock. */ if (dquot_dirty(dquot)) { spin_unlock(&dq_list_lock); /* Commit dquot before releasing */ dquot_write_dquot(dquot); goto restart; } if (dquot_active(dquot)) { spin_unlock(&dq_list_lock); dquot->dq_sb->dq_op->release_dquot(dquot); goto restart; } /* Dquot is inactive and clean, now move it to free list */ remove_free_dquot(dquot); put_dquot_last(dquot); } spin_unlock(&dq_list_lock); } /* * Put reference to dquot */ void dqput(struct dquot *dquot) { if (!dquot) return; #ifdef CONFIG_QUOTA_DEBUG if (!atomic_read(&dquot->dq_count)) { quota_error(dquot->dq_sb, "trying to free free dquot of %s %d", quotatypes[dquot->dq_id.type], from_kqid(&init_user_ns, dquot->dq_id)); BUG(); } #endif dqstats_inc(DQST_DROPS); spin_lock(&dq_list_lock); if (atomic_read(&dquot->dq_count) > 1) { /* We have more than one user... nothing to do */ atomic_dec(&dquot->dq_count); /* Releasing dquot during quotaoff phase? */ if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) && atomic_read(&dquot->dq_count) == 1) wake_up(&dquot_ref_wq); spin_unlock(&dq_list_lock); return; } /* Need to release dquot? */ WARN_ON_ONCE(!list_empty(&dquot->dq_free)); put_releasing_dquots(dquot); atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); queue_delayed_work(system_unbound_wq, &quota_release_work, 1); } EXPORT_SYMBOL(dqput); struct dquot *dquot_alloc(struct super_block *sb, int type) { return kmem_cache_zalloc(dquot_cachep, GFP_NOFS); } EXPORT_SYMBOL(dquot_alloc); static struct dquot *get_empty_dquot(struct super_block *sb, int type) { struct dquot *dquot; dquot = sb->dq_op->alloc_dquot(sb, type); if(!dquot) return NULL; mutex_init(&dquot->dq_lock); INIT_LIST_HEAD(&dquot->dq_free); INIT_LIST_HEAD(&dquot->dq_inuse); INIT_HLIST_NODE(&dquot->dq_hash); INIT_LIST_HEAD(&dquot->dq_dirty); dquot->dq_sb = sb; dquot->dq_id = make_kqid_invalid(type); atomic_set(&dquot->dq_count, 1); spin_lock_init(&dquot->dq_dqb_lock); return dquot; } /* * Get reference to dquot * * Locking is slightly tricky here. We are guarded from parallel quotaoff() * destroying our dquot by: * a) checking for quota flags under dq_list_lock and * b) getting a reference to dquot before we release dq_list_lock */ struct dquot *dqget(struct super_block *sb, struct kqid qid) { unsigned int hashent = hashfn(sb, qid); struct dquot *dquot, *empty = NULL; if (!qid_has_mapping(sb->s_user_ns, qid)) return ERR_PTR(-EINVAL); if (!sb_has_quota_active(sb, qid.type)) return ERR_PTR(-ESRCH); we_slept: spin_lock(&dq_list_lock); spin_lock(&dq_state_lock); if (!sb_has_quota_active(sb, qid.type)) { spin_unlock(&dq_state_lock); spin_unlock(&dq_list_lock); dquot = ERR_PTR(-ESRCH); goto out; } spin_unlock(&dq_state_lock); dquot = find_dquot(hashent, sb, qid); if (!dquot) { if (!empty) { spin_unlock(&dq_list_lock); empty = get_empty_dquot(sb, qid.type); if (!empty) schedule(); /* Try to wait for a moment... */ goto we_slept; } dquot = empty; empty = NULL; dquot->dq_id = qid; /* all dquots go on the inuse_list */ put_inuse(dquot); /* hash it first so it can be found */ insert_dquot_hash(dquot); spin_unlock(&dq_list_lock); dqstats_inc(DQST_LOOKUPS); } else { if (!atomic_read(&dquot->dq_count)) remove_free_dquot(dquot); atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); dqstats_inc(DQST_CACHE_HITS); dqstats_inc(DQST_LOOKUPS); } /* Wait for dq_lock - after this we know that either dquot_release() is * already finished or it will be canceled due to dq_count > 0 test */ wait_on_dquot(dquot); /* Read the dquot / allocate space in quota file */ if (!dquot_active(dquot)) { int err; err = sb->dq_op->acquire_dquot(dquot); if (err < 0) { dqput(dquot); dquot = ERR_PTR(err); goto out; } } /* * Make sure following reads see filled structure - paired with * smp_mb__before_atomic() in dquot_acquire(). */ smp_rmb(); /* Has somebody invalidated entry under us? */ WARN_ON_ONCE(hlist_unhashed(&dquot->dq_hash)); out: if (empty) do_destroy_dquot(empty); return dquot; } EXPORT_SYMBOL(dqget); static inline struct dquot __rcu **i_dquot(struct inode *inode) { return inode->i_sb->s_op->get_dquots(inode); } static int dqinit_needed(struct inode *inode, int type) { struct dquot __rcu * const *dquots; int cnt; if (IS_NOQUOTA(inode)) return 0; dquots = i_dquot(inode); if (type != -1) return !dquots[type]; for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (!dquots[cnt]) return 1; return 0; } /* This routine is guarded by s_umount semaphore */ static int add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif int err = 0; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || !atomic_read(&inode->i_writecount) || !dqinit_needed(inode, type)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif iput(old_inode); err = __dquot_initialize(inode, type); if (err) { iput(inode); goto out; } /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the * s_inode_list_lock. We cannot iput the inode now as we can be * holding the last reference and we cannot iput it under * s_inode_list_lock. So we keep the reference and iput it * later. */ old_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(old_inode); out: #ifdef CONFIG_QUOTA_DEBUG if (reserved) { quota_error(sb, "Writes happened before quota was turned on " "thus quota information is probably inconsistent. " "Please run quotacheck(8)"); } #endif return err; } static void remove_dquot_ref(struct super_block *sb, int type) { struct inode *inode; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already * have quota pointer initialized. Luckily, we need to touch * only quota pointers and these have separate locking * (dq_data_lock). */ spin_lock(&dq_data_lock); if (!IS_NOQUOTA(inode)) { struct dquot __rcu **dquots = i_dquot(inode); struct dquot *dquot = srcu_dereference_check( dquots[type], &dquot_srcu, lockdep_is_held(&dq_data_lock)); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif rcu_assign_pointer(dquots[type], NULL); if (dquot) dqput(dquot); } spin_unlock(&dq_data_lock); } spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" " was disabled thus quota information is probably " "inconsistent. Please run quotacheck(8).\n", sb->s_id); } #endif } /* Gather all references from inodes and drop them */ static void drop_dquot_ref(struct super_block *sb, int type) { if (sb->dq_op) remove_dquot_ref(sb, type); } static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { if (dquot->dq_dqb.dqb_rsvspace >= number) dquot->dq_dqb.dqb_rsvspace -= number; else { WARN_ON_ONCE(1); dquot->dq_dqb.dqb_rsvspace = 0; } if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time64_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || dquot->dq_dqb.dqb_curinodes >= number) dquot->dq_dqb.dqb_curinodes -= number; else dquot->dq_dqb.dqb_curinodes = 0; if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) dquot->dq_dqb.dqb_itime = (time64_t) 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); } static void dquot_decr_space(struct dquot *dquot, qsize_t number) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || dquot->dq_dqb.dqb_curspace >= number) dquot->dq_dqb.dqb_curspace -= number; else dquot->dq_dqb.dqb_curspace = 0; if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time64_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } struct dquot_warn { struct super_block *w_sb; struct kqid w_dq_id; short w_type; }; static int warning_issued(struct dquot *dquot, const int warntype) { int flag = (warntype == QUOTA_NL_BHARDWARN || warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B : ((warntype == QUOTA_NL_IHARDWARN || warntype == QUOTA_NL_ISOFTLONGWARN) ? DQ_INODES_B : 0); if (!flag) return 0; return test_and_set_bit(flag, &dquot->dq_flags); } #ifdef CONFIG_PRINT_QUOTA_WARNING static int flag_print_warnings = 1; static int need_print_warning(struct dquot_warn *warn) { if (!flag_print_warnings) return 0; switch (warn->w_dq_id.type) { case USRQUOTA: return uid_eq(current_fsuid(), warn->w_dq_id.uid); case GRPQUOTA: return in_group_p(warn->w_dq_id.gid); case PRJQUOTA: return 1; } return 0; } /* Print warning to user which exceeded quota */ static void print_warning(struct dquot_warn *warn) { char *msg = NULL; struct tty_struct *tty; int warntype = warn->w_type; if (warntype == QUOTA_NL_IHARDBELOW || warntype == QUOTA_NL_ISOFTBELOW || warntype == QUOTA_NL_BHARDBELOW || warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(warn)) return; tty = get_current_tty(); if (!tty) return; tty_write_message(tty, warn->w_sb->s_id); if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) tty_write_message(tty, ": warning, "); else tty_write_message(tty, ": write failed, "); tty_write_message(tty, quotatypes[warn->w_dq_id.type]); switch (warntype) { case QUOTA_NL_IHARDWARN: msg = " file limit reached.\r\n"; break; case QUOTA_NL_ISOFTLONGWARN: msg = " file quota exceeded too long.\r\n"; break; case QUOTA_NL_ISOFTWARN: msg = " file quota exceeded.\r\n"; break; case QUOTA_NL_BHARDWARN: msg = " block limit reached.\r\n"; break; case QUOTA_NL_BSOFTLONGWARN: msg = " block quota exceeded too long.\r\n"; break; case QUOTA_NL_BSOFTWARN: msg = " block quota exceeded.\r\n"; break; } tty_write_message(tty, msg); tty_kref_put(tty); } #endif static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot, int warntype) { if (warning_issued(dquot, warntype)) return; warn->w_type = warntype; warn->w_sb = dquot->dq_sb; warn->w_dq_id = dquot->dq_id; } /* * Write warnings to the console and send warning messages over netlink. * * Note that this function can call into tty and networking code. */ static void flush_warnings(struct dquot_warn *warn) { int i; for (i = 0; i < MAXQUOTAS; i++) { if (warn[i].w_type == QUOTA_NL_NOWARN) continue; #ifdef CONFIG_PRINT_QUOTA_WARNING print_warning(&warn[i]); #endif quota_send_warning(warn[i].w_dq_id, warn[i].w_sb->s_dev, warn[i].w_type); } } static int ignore_hardlimit(struct dquot *dquot) { struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; return capable(CAP_SYS_RESOURCE) && (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || !(info->dqi_flags & DQF_ROOT_SQUASH)); } static int dquot_add_inodes(struct dquot *dquot, qsize_t inodes, struct dquot_warn *warn) { qsize_t newinodes; int ret = 0; spin_lock(&dquot->dq_dqb_lock); newinodes = dquot->dq_dqb.dqb_curinodes + inodes; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) goto add; if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN); ret = -EDQUOT; goto out; } if (dquot->dq_dqb.dqb_isoftlimit && newinodes > dquot->dq_dqb.dqb_isoftlimit && dquot->dq_dqb.dqb_itime && ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN); ret = -EDQUOT; goto out; } if (dquot->dq_dqb.dqb_isoftlimit && newinodes > dquot->dq_dqb.dqb_isoftlimit && dquot->dq_dqb.dqb_itime == 0) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN); dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace; } add: dquot->dq_dqb.dqb_curinodes = newinodes; out: spin_unlock(&dquot->dq_dqb_lock); return ret; } static int dquot_add_space(struct dquot *dquot, qsize_t space, qsize_t rsv_space, unsigned int flags, struct dquot_warn *warn) { qsize_t tspace; struct super_block *sb = dquot->dq_sb; int ret = 0; spin_lock(&dquot->dq_dqb_lock); if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) goto finish; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + space + rsv_space; if (dquot->dq_dqb.dqb_bhardlimit && tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); ret = -EDQUOT; goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime && ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); ret = -EDQUOT; goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime == 0) { if (flags & DQUOT_SPACE_WARN) { prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() + sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace; } else { /* * We don't allow preallocation to exceed softlimit so exceeding will * be always printed */ ret = -EDQUOT; goto finish; } } finish: /* * We have to be careful and go through warning generation & grace time * setting even if DQUOT_SPACE_NOFAIL is set. That's why we check it * only here... */ if (flags & DQUOT_SPACE_NOFAIL) ret = 0; if (!ret) { dquot->dq_dqb.dqb_rsvspace += rsv_space; dquot->dq_dqb.dqb_curspace += space; } spin_unlock(&dquot->dq_dqb_lock); return ret; } static int info_idq_free(struct dquot *dquot, qsize_t inodes) { qsize_t newinodes; if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type)) return QUOTA_NL_NOWARN; newinodes = dquot->dq_dqb.dqb_curinodes - inodes; if (newinodes <= dquot->dq_dqb.dqb_isoftlimit) return QUOTA_NL_ISOFTBELOW; if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit && newinodes < dquot->dq_dqb.dqb_ihardlimit) return QUOTA_NL_IHARDBELOW; return QUOTA_NL_NOWARN; } static int info_bdq_free(struct dquot *dquot, qsize_t space) { qsize_t tspace; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace; if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || tspace <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_NOWARN; if (tspace - space <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_BSOFTBELOW; if (tspace >= dquot->dq_dqb.dqb_bhardlimit && tspace - space < dquot->dq_dqb.dqb_bhardlimit) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } static int inode_quota_active(const struct inode *inode) { struct super_block *sb = inode->i_sb; if (IS_NOQUOTA(inode)) return 0; return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); } /* * Initialize quota pointers in inode * * It is better to call this function outside of any transaction as it * might need a lot of space in journal for dquot structure allocation. */ static int __dquot_initialize(struct inode *inode, int type) { int cnt, init_needed = 0; struct dquot __rcu **dquots; struct dquot *got[MAXQUOTAS] = {}; struct super_block *sb = inode->i_sb; qsize_t rsv; int ret = 0; if (!inode_quota_active(inode)) return 0; dquots = i_dquot(inode); /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { struct kqid qid; kprojid_t projid; int rc; struct dquot *dquot; if (type != -1 && cnt != type) continue; /* * The i_dquot should have been initialized in most cases, * we check it without locking here to avoid unnecessary * dqget()/dqput() calls. */ if (dquots[cnt]) continue; if (!sb_has_quota_active(sb, cnt)) continue; init_needed = 1; switch (cnt) { case USRQUOTA: qid = make_kqid_uid(inode->i_uid); break; case GRPQUOTA: qid = make_kqid_gid(inode->i_gid); break; case PRJQUOTA: rc = inode->i_sb->dq_op->get_projid(inode, &projid); if (rc) continue; qid = make_kqid_projid(projid); break; } dquot = dqget(sb, qid); if (IS_ERR(dquot)) { /* We raced with somebody turning quotas off... */ if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); goto out_put; } dquot = NULL; } got[cnt] = dquot; } /* All required i_dquot has been initialized */ if (!init_needed) return 0; spin_lock(&dq_data_lock); if (IS_NOQUOTA(inode)) goto out_lock; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; /* Avoid races with quotaoff() */ if (!sb_has_quota_active(sb, cnt)) continue; /* We could race with quotaon or dqget() could have failed */ if (!got[cnt]) continue; if (!dquots[cnt]) { rcu_assign_pointer(dquots[cnt], got[cnt]); got[cnt] = NULL; /* * Make quota reservation system happy if someone * did a write before quota was turned on */ rsv = inode_get_rsv_space(inode); if (unlikely(rsv)) { struct dquot *dquot = srcu_dereference_check( dquots[cnt], &dquot_srcu, lockdep_is_held(&dq_data_lock)); spin_lock(&inode->i_lock); /* Get reservation again under proper lock */ rsv = __inode_get_rsv_space(inode); spin_lock(&dquot->dq_dqb_lock); dquot->dq_dqb.dqb_rsvspace += rsv; spin_unlock(&dquot->dq_dqb_lock); spin_unlock(&inode->i_lock); } } } out_lock: spin_unlock(&dq_data_lock); out_put: /* Drop unused references */ dqput_all(got); return ret; } int dquot_initialize(struct inode *inode) { return __dquot_initialize(inode, -1); } EXPORT_SYMBOL(dquot_initialize); bool dquot_initialize_needed(struct inode *inode) { struct dquot __rcu **dquots; int i; if (!inode_quota_active(inode)) return false; dquots = i_dquot(inode); for (i = 0; i < MAXQUOTAS; i++) if (!dquots[i] && sb_has_quota_active(inode->i_sb, i)) return true; return false; } EXPORT_SYMBOL(dquot_initialize_needed); /* * Release all quotas referenced by inode. * * This function only be called on inode free or converting * a file to quota file, no other users for the i_dquot in * both cases, so we needn't call synchronize_srcu() after * clearing i_dquot. */ static void __dquot_drop(struct inode *inode) { int cnt; struct dquot __rcu **dquots = i_dquot(inode); struct dquot *put[MAXQUOTAS]; spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { put[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu, lockdep_is_held(&dq_data_lock)); rcu_assign_pointer(dquots[cnt], NULL); } spin_unlock(&dq_data_lock); dqput_all(put); } void dquot_drop(struct inode *inode) { struct dquot __rcu * const *dquots; int cnt; if (IS_NOQUOTA(inode)) return; /* * Test before calling to rule out calls from proc and such * where we are not allowed to block. Note that this is * actually reliable test even without the lock - the caller * must assure that nobody can come after the DQUOT_DROP and * add quota pointers back anyway. */ dquots = i_dquot(inode); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (dquots[cnt]) break; } if (cnt < MAXQUOTAS) __dquot_drop(inode); } EXPORT_SYMBOL(dquot_drop); /* * inode_reserved_space is managed internally by quota, and protected by * i_lock similar to i_blocks+i_bytes. */ static qsize_t *inode_reserved_space(struct inode * inode) { /* Filesystem must explicitly define it's own method in order to use * quota reservation interface */ BUG_ON(!inode->i_sb->dq_op->get_reserved_space); return inode->i_sb->dq_op->get_reserved_space(inode); } static qsize_t __inode_get_rsv_space(struct inode *inode) { if (!inode->i_sb->dq_op->get_reserved_space) return 0; return *inode_reserved_space(inode); } static qsize_t inode_get_rsv_space(struct inode *inode) { qsize_t ret; if (!inode->i_sb->dq_op->get_reserved_space) return 0; spin_lock(&inode->i_lock); ret = __inode_get_rsv_space(inode); spin_unlock(&inode->i_lock); return ret; } /* * This functions updates i_blocks+i_bytes fields and quota information * (together with appropriate checks). * * NOTE: We absolutely rely on the fact that caller dirties the inode * (usually helpers in quotaops.h care about this) and holds a handle for * the current transaction so that dquot write and inode write go into the * same transaction. */ /* * This operation can block, but only after everything is updated */ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; int reserve = flags & DQUOT_SPACE_RESERVE; struct dquot __rcu **dquots; struct dquot *dquot; if (!inode_quota_active(inode)) { if (reserve) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; spin_unlock(&inode->i_lock); } else { inode_add_bytes(inode, number); } goto out; } for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; if (reserve) { ret = dquot_add_space(dquot, 0, number, flags, &warn[cnt]); } else { ret = dquot_add_space(dquot, number, 0, flags, &warn[cnt]); } if (ret) { /* Back out changes we already did */ for (cnt--; cnt >= 0; cnt--) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; spin_lock(&dquot->dq_dqb_lock); if (reserve) dquot_free_reserved_space(dquot, number); else dquot_decr_space(dquot, number); spin_unlock(&dquot->dq_dqb_lock); } spin_unlock(&inode->i_lock); goto out_flush_warn; } } if (reserve) *inode_reserved_space(inode) += number; else __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); if (reserve) goto out_flush_warn; ret = mark_all_dquot_dirty(dquots); out_flush_warn: srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); out: return ret; } EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated */ int dquot_alloc_inode(struct inode *inode) { int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; struct dquot __rcu * const *dquots; struct dquot *dquot; if (!inode_quota_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; ret = dquot_add_inodes(dquot, 1, &warn[cnt]); if (ret) { for (cnt--; cnt >= 0; cnt--) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; /* Back out changes we already did */ spin_lock(&dquot->dq_dqb_lock); dquot_decr_inodes(dquot, 1); spin_unlock(&dquot->dq_dqb_lock); } goto warn_put_all; } } warn_put_all: spin_unlock(&inode->i_lock); if (ret == 0) ret = mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); return ret; } EXPORT_SYMBOL(dquot_alloc_inode); /* * Convert in-memory reserved quotas to real consumed quotas */ void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { struct dquot __rcu **dquots; struct dquot *dquot; int cnt, index; if (!inode_quota_active(inode)) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (dquot) { spin_lock(&dquot->dq_dqb_lock); if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number)) number = dquot->dq_dqb.dqb_rsvspace; dquot->dq_dqb.dqb_curspace += number; dquot->dq_dqb.dqb_rsvspace -= number; spin_unlock(&dquot->dq_dqb_lock); } } /* Update inode bytes */ *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); } EXPORT_SYMBOL(dquot_claim_space_nodirty); /* * Convert allocated space back to in-memory reserved quotas */ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { struct dquot __rcu **dquots; struct dquot *dquot; int cnt, index; if (!inode_quota_active(inode)) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (dquot) { spin_lock(&dquot->dq_dqb_lock); if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number)) number = dquot->dq_dqb.dqb_curspace; dquot->dq_dqb.dqb_rsvspace += number; dquot->dq_dqb.dqb_curspace -= number; spin_unlock(&dquot->dq_dqb_lock); } } /* Update inode bytes */ *inode_reserved_space(inode) += number; __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); } EXPORT_SYMBOL(dquot_reclaim_space_nodirty); /* * This operation can block, but only after everything is updated */ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; struct dquot __rcu **dquots; struct dquot *dquot; int reserve = flags & DQUOT_SPACE_RESERVE, index; if (!inode_quota_active(inode)) { if (reserve) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; spin_unlock(&inode->i_lock); } else { inode_sub_bytes(inode, number); } return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; spin_lock(&dquot->dq_dqb_lock); wtype = info_bdq_free(dquot, number); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquot, wtype); if (reserve) dquot_free_reserved_space(dquot, number); else dquot_decr_space(dquot, number); spin_unlock(&dquot->dq_dqb_lock); } if (reserve) *inode_reserved_space(inode) -= number; else __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); if (reserve) goto out_unlock; mark_all_dquot_dirty(dquots); out_unlock: srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); } EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated */ void dquot_free_inode(struct inode *inode) { unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; struct dquot __rcu * const *dquots; struct dquot *dquot; int index; if (!inode_quota_active(inode)) return; dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; spin_lock(&dquot->dq_dqb_lock); wtype = info_idq_free(dquot, 1); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquot, wtype); dquot_decr_inodes(dquot, 1); spin_unlock(&dquot->dq_dqb_lock); } spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); } EXPORT_SYMBOL(dquot_free_inode); /* * Transfer the number of inode and blocks from one diskquota to an other. * On success, dquot references in transfer_to are consumed and references * to original dquots that need to be released are placed there. On failure, * references are kept untouched. * * This operation can block, but only after everything is updated * A transaction must be started when entering this function. * * We are holding reference on transfer_from & transfer_to, no need to * protect them by srcu_read_lock(). */ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { qsize_t cur_space; qsize_t rsv_space = 0; qsize_t inode_usage = 1; struct dquot __rcu **dquots; struct dquot *transfer_from[MAXQUOTAS] = {}; int cnt, index, ret = 0, err; char is_valid[MAXQUOTAS] = {}; struct dquot_warn warn_to[MAXQUOTAS]; struct dquot_warn warn_from_inodes[MAXQUOTAS]; struct dquot_warn warn_from_space[MAXQUOTAS]; if (IS_NOQUOTA(inode)) return 0; if (inode->i_sb->dq_op->get_inode_usage) { ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage); if (ret) return ret; } /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { warn_to[cnt].w_type = QUOTA_NL_NOWARN; warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; } spin_lock(&dq_data_lock); spin_lock(&inode->i_lock); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); return 0; } cur_space = __inode_get_bytes(inode); rsv_space = __inode_get_rsv_space(inode); dquots = i_dquot(inode); /* * Build the transfer_from list, check limits, and update usage in * the target structures. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { /* * Skip changes for same uid or gid or for turned off quota-type. */ if (!transfer_to[cnt]) continue; /* Avoid races with quotaoff() */ if (!sb_has_quota_active(inode->i_sb, cnt)) continue; is_valid[cnt] = 1; transfer_from[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu, lockdep_is_held(&dq_data_lock)); ret = dquot_add_inodes(transfer_to[cnt], inode_usage, &warn_to[cnt]); if (ret) goto over_quota; ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, DQUOT_SPACE_WARN, &warn_to[cnt]); if (ret) { spin_lock(&transfer_to[cnt]->dq_dqb_lock); dquot_decr_inodes(transfer_to[cnt], inode_usage); spin_unlock(&transfer_to[cnt]->dq_dqb_lock); goto over_quota; } } /* Decrease usage for source structures and update quota pointers */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!is_valid[cnt]) continue; /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { int wtype; spin_lock(&transfer_from[cnt]->dq_dqb_lock); wtype = info_idq_free(transfer_from[cnt], inode_usage); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_inodes[cnt], transfer_from[cnt], wtype); wtype = info_bdq_free(transfer_from[cnt], cur_space + rsv_space); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_space[cnt], transfer_from[cnt], wtype); dquot_decr_inodes(transfer_from[cnt], inode_usage); dquot_decr_space(transfer_from[cnt], cur_space); dquot_free_reserved_space(transfer_from[cnt], rsv_space); spin_unlock(&transfer_from[cnt]->dq_dqb_lock); } rcu_assign_pointer(dquots[cnt], transfer_to[cnt]); } spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); /* * These arrays are local and we hold dquot references so we don't need * the srcu protection but still take dquot_srcu to avoid warning in * mark_all_dquot_dirty(). */ index = srcu_read_lock(&dquot_srcu); err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_from); if (err < 0) ret = err; err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_to); if (err < 0) ret = err; srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn_to); flush_warnings(warn_from_inodes); flush_warnings(warn_from_space); /* Pass back references to put */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (is_valid[cnt]) transfer_to[cnt] = transfer_from[cnt]; return ret; over_quota: /* Back out changes we already did */ for (cnt--; cnt >= 0; cnt--) { if (!is_valid[cnt]) continue; spin_lock(&transfer_to[cnt]->dq_dqb_lock); dquot_decr_inodes(transfer_to[cnt], inode_usage); dquot_decr_space(transfer_to[cnt], cur_space); dquot_free_reserved_space(transfer_to[cnt], rsv_space); spin_unlock(&transfer_to[cnt]->dq_dqb_lock); } spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); flush_warnings(warn_to); return ret; } EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; struct dquot *dquot; struct super_block *sb = inode->i_sb; int ret; if (!inode_quota_active(inode)) return 0; if (i_uid_needs_update(idmap, iattr, inode)) { kuid_t kuid = from_vfsuid(idmap, i_user_ns(inode), iattr->ia_vfsuid); dquot = dqget(sb, make_kqid_uid(kuid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); goto out_put; } dquot = NULL; } transfer_to[USRQUOTA] = dquot; } if (i_gid_needs_update(idmap, iattr, inode)) { kgid_t kgid = from_vfsgid(idmap, i_user_ns(inode), iattr->ia_vfsgid); dquot = dqget(sb, make_kqid_gid(kgid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); goto out_put; } dquot = NULL; } transfer_to[GRPQUOTA] = dquot; } ret = __dquot_transfer(inode, transfer_to); out_put: dqput_all(transfer_to); return ret; } EXPORT_SYMBOL(dquot_transfer); /* * Write info of quota file to disk */ int dquot_commit_info(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); return dqopt->ops[type]->write_file_info(sb, type); } EXPORT_SYMBOL(dquot_commit_info); int dquot_get_next_id(struct super_block *sb, struct kqid *qid) { struct quota_info *dqopt = sb_dqopt(sb); if (!sb_has_quota_active(sb, qid->type)) return -ESRCH; if (!dqopt->ops[qid->type]->get_next_id) return -ENOSYS; return dqopt->ops[qid->type]->get_next_id(sb, qid); } EXPORT_SYMBOL(dquot_get_next_id); /* * Definitions of diskquota operations. */ const struct dquot_operations dquot_operations = { .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, .mark_dirty = dquot_mark_dquot_dirty, .write_info = dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_next_id = dquot_get_next_id, }; EXPORT_SYMBOL(dquot_operations); /* * Generic helper for ->open on filesystems supporting disk quotas. */ int dquot_file_open(struct inode *inode, struct file *file) { int error; error = generic_file_open(inode, file); if (!error && (file->f_mode & FMODE_WRITE)) error = dquot_initialize(inode); return error; } EXPORT_SYMBOL(dquot_file_open); static void vfs_cleanup_quota_inode(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); struct inode *inode = dqopt->files[type]; if (!inode) return; if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { inode_lock(inode); inode->i_flags &= ~S_NOQUOTA; inode_unlock(inode); } dqopt->files[type] = NULL; iput(inode); } /* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ int dquot_disable(struct super_block *sb, int type, unsigned int flags) { int cnt; struct quota_info *dqopt = sb_dqopt(sb); rwsem_assert_held_write(&sb->s_umount); /* Cannot turn off usage accounting without turning off limits, or * suspend quotas and simultaneously turn quotas off. */ if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED)) || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED | DQUOT_USAGE_ENABLED))) return -EINVAL; /* * Skip everything if there's nothing to do. We have to do this because * sometimes we are called when fill_super() failed and calling * sync_fs() in such cases does no good. */ if (!sb_any_quota_loaded(sb)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_loaded(sb, cnt)) continue; if (flags & DQUOT_SUSPENDED) { spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_SUSPENDED, cnt); spin_unlock(&dq_state_lock); } else { spin_lock(&dq_state_lock); dqopt->flags &= ~dquot_state_flag(flags, cnt); /* Turning off suspended quotas? */ if (!sb_has_quota_loaded(sb, cnt) && sb_has_quota_suspended(sb, cnt)) { dqopt->flags &= ~dquot_state_flag( DQUOT_SUSPENDED, cnt); spin_unlock(&dq_state_lock); vfs_cleanup_quota_inode(sb, cnt); continue; } spin_unlock(&dq_state_lock); } /* We still have to keep quota loaded? */ if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED)) continue; /* Note: these are blocking operations */ drop_dquot_ref(sb, cnt); invalidate_dquots(sb, cnt); /* * Now all dquots should be invalidated, all writes done so we * should be only users of the info. No locks needed. */ if (info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); if (dqopt->ops[cnt]->free_file_info) dqopt->ops[cnt]->free_file_info(sb, cnt); put_quota_format(dqopt->info[cnt].dqi_format); dqopt->info[cnt].dqi_flags = 0; dqopt->info[cnt].dqi_igrace = 0; dqopt->info[cnt].dqi_bgrace = 0; dqopt->ops[cnt] = NULL; } /* Skip syncing and setting flags if quota files are hidden */ if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) goto put_inodes; /* Sync the superblock so that buffers with quota data are written to * disk (and so userspace sees correct data afterwards). */ if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, 1); sync_blockdev(sb->s_bdev); /* Now the quota files are just ordinary files and we can set the * inode flags back. Moreover we discard the pagecache so that * userspace sees the writes we did bypassing the pagecache. We * must also discard the blockdev buffers so that we see the * changes done by userspace on the next quotaon() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) { inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); } if (sb->s_bdev) invalidate_bdev(sb->s_bdev); put_inodes: /* We are done when suspending quotas */ if (flags & DQUOT_SUSPENDED) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (!sb_has_quota_loaded(sb, cnt)) vfs_cleanup_quota_inode(sb, cnt); return 0; } EXPORT_SYMBOL(dquot_disable); int dquot_quota_off(struct super_block *sb, int type) { return dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); } EXPORT_SYMBOL(dquot_quota_off); /* * Turn quotas on on a device */ static int vfs_setup_quota_inode(struct inode *inode, int type) { struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); if (is_bad_inode(inode)) return -EUCLEAN; if (!S_ISREG(inode->i_mode)) return -EACCES; if (IS_RDONLY(inode)) return -EROFS; if (sb_has_quota_loaded(sb, type)) return -EBUSY; /* * Quota files should never be encrypted. They should be thought of as * filesystem metadata, not user data. New-style internal quota files * cannot be encrypted by users anyway, but old-style external quota * files could potentially be incorrectly created in an encrypted * directory, hence this explicit check. Some reasons why encrypted * quota files don't work include: (1) some filesystems that support * encryption don't handle it in their quota_read and quota_write, and * (2) cleaning up encrypted quota files at unmount would need special * consideration, as quota files are cleaned up later than user files. */ if (IS_ENCRYPTED(inode)) return -EINVAL; dqopt->files[type] = igrab(inode); if (!dqopt->files[type]) return -EIO; if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ inode_lock(inode); inode->i_flags |= S_NOQUOTA; inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more * references can be added */ __dquot_drop(inode); } return 0; } int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags) { struct quota_format_type *fmt; struct quota_info *dqopt = sb_dqopt(sb); int error; lockdep_assert_held_write(&sb->s_umount); /* Just unsuspend quotas? */ if (WARN_ON_ONCE(flags & DQUOT_SUSPENDED)) return -EINVAL; fmt = find_quota_format(format_id); if (!fmt) return -ESRCH; if (!sb->dq_op || !sb->s_qcop || (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; goto out_fmt; } /* Filesystems outside of init_user_ns not yet supported */ if (sb->s_user_ns != &init_user_ns) { error = -EINVAL; goto out_fmt; } /* Usage always has to be set... */ if (!(flags & DQUOT_USAGE_ENABLED)) { error = -EINVAL; goto out_fmt; } if (sb_has_quota_loaded(sb, type)) { error = -EBUSY; goto out_fmt; } if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { /* As we bypass the pagecache we must now flush all the * dirty data and invalidate caches so that kernel sees * changes from userspace. It is not enough to just flush * the quota file since if blocksize < pagesize, invalidation * of the cache could fail because of other unrelated dirty * data */ sync_filesystem(sb); invalidate_bdev(sb->s_bdev); } error = -EINVAL; if (!fmt->qf_ops->check_quota_file(sb, type)) goto out_fmt; dqopt->ops[type] = fmt->qf_ops; dqopt->info[type].dqi_format = fmt; dqopt->info[type].dqi_fmt_id = format_id; INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); error = dqopt->ops[type]->read_file_info(sb, type); if (error < 0) goto out_fmt; if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) { spin_lock(&dq_data_lock); dqopt->info[type].dqi_flags |= DQF_SYS_FILE; spin_unlock(&dq_data_lock); } spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); error = add_dquot_ref(sb, type); if (error) dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; out_fmt: put_quota_format(fmt); return error; } EXPORT_SYMBOL(dquot_load_quota_sb); /* * More powerful function for turning on quotas on given quota inode allowing * setting of individual quota flags */ int dquot_load_quota_inode(struct inode *inode, int type, int format_id, unsigned int flags) { int err; err = vfs_setup_quota_inode(inode, type); if (err < 0) return err; err = dquot_load_quota_sb(inode->i_sb, type, format_id, flags); if (err < 0) vfs_cleanup_quota_inode(inode->i_sb, type); return err; } EXPORT_SYMBOL(dquot_load_quota_inode); /* Reenable quotas on remount RW */ int dquot_resume(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); int ret = 0, cnt; unsigned int flags; rwsem_assert_held_write(&sb->s_umount); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_suspended(sb, cnt)) continue; spin_lock(&dq_state_lock); flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, cnt); dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt); spin_unlock(&dq_state_lock); flags = dquot_generic_flag(flags, cnt); ret = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id, flags); if (ret < 0) vfs_cleanup_quota_inode(sb, cnt); } return ret; } EXPORT_SYMBOL(dquot_resume); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int error = security_quota_on(path->dentry); if (error) return error; /* Quota file not on the same filesystem? */ if (path->dentry->d_sb != sb) error = -EXDEV; else error = dquot_load_quota_inode(d_inode(path->dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; } EXPORT_SYMBOL(dquot_quota_on); /* * This function is used when filesystem needs to initialize quotas * during mount time. */ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type) { struct dentry *dentry; int error; dentry = lookup_noperm_positive_unlocked(&QSTR(qf_name), sb->s_root); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_quota_on(dentry); if (!error) error = dquot_load_quota_inode(d_inode(dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); dput(dentry); return error; } EXPORT_SYMBOL(dquot_quota_on_mount); static int dquot_quota_enable(struct super_block *sb, unsigned int flags) { int ret; int type; struct quota_info *dqopt = sb_dqopt(sb); if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) return -ENOSYS; /* Accounting cannot be turned on while fs is mounted */ flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT); if (!flags) return -EINVAL; for (type = 0; type < MAXQUOTAS; type++) { if (!(flags & qtype_enforce_flag(type))) continue; /* Can't enforce without accounting */ if (!sb_has_quota_usage_enabled(sb, type)) { ret = -EINVAL; goto out_err; } if (sb_has_quota_limits_enabled(sb, type)) { /* compatible with XFS */ ret = -EEXIST; goto out_err; } spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type); spin_unlock(&dq_state_lock); } return 0; out_err: /* Backout enforcement enablement we already did */ for (type--; type >= 0; type--) { if (flags & qtype_enforce_flag(type)) dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); } return ret; } static int dquot_quota_disable(struct super_block *sb, unsigned int flags) { int ret; int type; struct quota_info *dqopt = sb_dqopt(sb); if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) return -ENOSYS; /* * We don't support turning off accounting via quotactl. In principle * quota infrastructure can do this but filesystems don't expect * userspace to be able to do it. */ if (flags & (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT)) return -EOPNOTSUPP; /* Filter out limits not enabled */ for (type = 0; type < MAXQUOTAS; type++) if (!sb_has_quota_limits_enabled(sb, type)) flags &= ~qtype_enforce_flag(type); /* Nothing left? */ if (!flags) return -EEXIST; for (type = 0; type < MAXQUOTAS; type++) { if (flags & qtype_enforce_flag(type)) { ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); if (ret < 0) goto out_err; } } return 0; out_err: /* Backout enforcement disabling we already did */ for (type--; type >= 0; type--) { if (flags & qtype_enforce_flag(type)) { spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type); spin_unlock(&dq_state_lock); } } return ret; } /* Generic routine for getting common part of quota structure */ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; memset(di, 0, sizeof(*di)); spin_lock(&dquot->dq_dqb_lock); di->d_spc_hardlimit = dm->dqb_bhardlimit; di->d_spc_softlimit = dm->dqb_bsoftlimit; di->d_ino_hardlimit = dm->dqb_ihardlimit; di->d_ino_softlimit = dm->dqb_isoftlimit; di->d_space = dm->dqb_curspace + dm->dqb_rsvspace; di->d_ino_count = dm->dqb_curinodes; di->d_spc_timer = dm->dqb_btime; di->d_ino_timer = dm->dqb_itime; spin_unlock(&dquot->dq_dqb_lock); } int dquot_get_dqblk(struct super_block *sb, struct kqid qid, struct qc_dqblk *di) { struct dquot *dquot; dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); do_get_dqblk(dquot, di); dqput(dquot); return 0; } EXPORT_SYMBOL(dquot_get_dqblk); int dquot_get_next_dqblk(struct super_block *sb, struct kqid *qid, struct qc_dqblk *di) { struct dquot *dquot; int err; if (!sb->dq_op->get_next_id) return -ENOSYS; err = sb->dq_op->get_next_id(sb, qid); if (err < 0) return err; dquot = dqget(sb, *qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); do_get_dqblk(dquot, di); dqput(dquot); return 0; } EXPORT_SYMBOL(dquot_get_next_dqblk); #define VFS_QC_MASK \ (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \ QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \ QC_SPC_TIMER | QC_INO_TIMER) /* Generic routine for setting common part of quota structure */ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; int check_blim = 0, check_ilim = 0; struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; int ret; if (di->d_fieldmask & ~VFS_QC_MASK) return -EINVAL; if (((di->d_fieldmask & QC_SPC_SOFT) && di->d_spc_softlimit > dqi->dqi_max_spc_limit) || ((di->d_fieldmask & QC_SPC_HARD) && di->d_spc_hardlimit > dqi->dqi_max_spc_limit) || ((di->d_fieldmask & QC_INO_SOFT) && (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) || ((di->d_fieldmask & QC_INO_HARD) && (di->d_ino_hardlimit > dqi->dqi_max_ino_limit))) return -ERANGE; spin_lock(&dquot->dq_dqb_lock); if (di->d_fieldmask & QC_SPACE) { dm->dqb_curspace = di->d_space - dm->dqb_rsvspace; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_SPC_SOFT) dm->dqb_bsoftlimit = di->d_spc_softlimit; if (di->d_fieldmask & QC_SPC_HARD) dm->dqb_bhardlimit = di->d_spc_hardlimit; if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) { check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_INO_COUNT) { dm->dqb_curinodes = di->d_ino_count; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_INO_SOFT) dm->dqb_isoftlimit = di->d_ino_softlimit; if (di->d_fieldmask & QC_INO_HARD) dm->dqb_ihardlimit = di->d_ino_hardlimit; if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) { check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_SPC_TIMER) { dm->dqb_btime = di->d_spc_timer; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_INO_TIMER) { dm->dqb_itime = di->d_ino_timer; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); } if (check_blim) { if (!dm->dqb_bsoftlimit || dm->dqb_curspace + dm->dqb_rsvspace <= dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_SPC_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_btime = ktime_get_real_seconds() + dqi->dqi_bgrace; } if (check_ilim) { if (!dm->dqb_isoftlimit || dm->dqb_curinodes <= dm->dqb_isoftlimit) { dm->dqb_itime = 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_INO_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_itime = ktime_get_real_seconds() + dqi->dqi_igrace; } if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit) clear_bit(DQ_FAKE_B, &dquot->dq_flags); else set_bit(DQ_FAKE_B, &dquot->dq_flags); spin_unlock(&dquot->dq_dqb_lock); ret = mark_dquot_dirty(dquot); if (ret < 0) return ret; return 0; } int dquot_set_dqblk(struct super_block *sb, struct kqid qid, struct qc_dqblk *di) { struct dquot *dquot; int rc; dquot = dqget(sb, qid); if (IS_ERR(dquot)) { rc = PTR_ERR(dquot); goto out; } rc = do_set_dqblk(dquot, di); dqput(dquot); out: return rc; } EXPORT_SYMBOL(dquot_set_dqblk); /* Generic routine for getting common part of quota file information */ int dquot_get_state(struct super_block *sb, struct qc_state *state) { struct mem_dqinfo *mi; struct qc_type_state *tstate; struct quota_info *dqopt = sb_dqopt(sb); int type; memset(state, 0, sizeof(*state)); for (type = 0; type < MAXQUOTAS; type++) { if (!sb_has_quota_active(sb, type)) continue; tstate = state->s_state + type; mi = sb_dqopt(sb)->info + type; tstate->flags = QCI_ACCT_ENABLED; spin_lock(&dq_data_lock); if (mi->dqi_flags & DQF_SYS_FILE) tstate->flags |= QCI_SYSFILE; if (mi->dqi_flags & DQF_ROOT_SQUASH) tstate->flags |= QCI_ROOT_SQUASH; if (sb_has_quota_limits_enabled(sb, type)) tstate->flags |= QCI_LIMITS_ENFORCED; tstate->spc_timelimit = mi->dqi_bgrace; tstate->ino_timelimit = mi->dqi_igrace; if (dqopt->files[type]) { tstate->ino = dqopt->files[type]->i_ino; tstate->blocks = dqopt->files[type]->i_blocks; } tstate->nextents = 1; /* We don't know... */ spin_unlock(&dq_data_lock); } return 0; } EXPORT_SYMBOL(dquot_get_state); /* Generic routine for setting common part of quota file information */ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) { struct mem_dqinfo *mi; if ((ii->i_fieldmask & QC_WARNS_MASK) || (ii->i_fieldmask & QC_RT_SPC_TIMER)) return -EINVAL; if (!sb_has_quota_active(sb, type)) return -ESRCH; mi = sb_dqopt(sb)->info + type; if (ii->i_fieldmask & QC_FLAGS) { if ((ii->i_flags & QCI_ROOT_SQUASH && mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) return -EINVAL; } spin_lock(&dq_data_lock); if (ii->i_fieldmask & QC_SPC_TIMER) mi->dqi_bgrace = ii->i_spc_timelimit; if (ii->i_fieldmask & QC_INO_TIMER) mi->dqi_igrace = ii->i_ino_timelimit; if (ii->i_fieldmask & QC_FLAGS) { if (ii->i_flags & QCI_ROOT_SQUASH) mi->dqi_flags |= DQF_ROOT_SQUASH; else mi->dqi_flags &= ~DQF_ROOT_SQUASH; } spin_unlock(&dq_data_lock); mark_info_dirty(sb, type); /* Force write to disk */ return sb->dq_op->write_info(sb, type); } EXPORT_SYMBOL(dquot_set_dqinfo); const struct quotactl_ops dquot_quotactl_sysfile_ops = { .quota_enable = dquot_quota_enable, .quota_disable = dquot_quota_disable, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .get_nextdqblk = dquot_get_next_dqblk, .set_dqblk = dquot_set_dqblk }; EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); static int do_proc_dqstats(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int type = (unsigned long *)table->data - dqstats.stat; s64 value = percpu_counter_sum(&dqstats.counter[type]); /* Filter negative values for non-monotonic counters */ if (value < 0 && (type == DQST_ALLOC_DQUOTS || type == DQST_FREE_DQUOTS)) value = 0; /* Update global table */ dqstats.stat[type] = value; return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table fs_dqstats_table[] = { { .procname = "lookups", .data = &dqstats.stat[DQST_LOOKUPS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "drops", .data = &dqstats.stat[DQST_DROPS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "reads", .data = &dqstats.stat[DQST_READS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "writes", .data = &dqstats.stat[DQST_WRITES], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "cache_hits", .data = &dqstats.stat[DQST_CACHE_HITS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "allocated_dquots", .data = &dqstats.stat[DQST_ALLOC_DQUOTS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "free_dquots", .data = &dqstats.stat[DQST_FREE_DQUOTS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "syncs", .data = &dqstats.stat[DQST_SYNCS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, #ifdef CONFIG_PRINT_QUOTA_WARNING { .procname = "warnings", .data = &flag_print_warnings, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif }; static int __init dquot_init(void) { int i, ret; unsigned long nr_hash, order; struct shrinker *dqcache_shrinker; printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); register_sysctl_init("fs/quota", fs_dqstats_table); dquot_cachep = kmem_cache_create("dquot", sizeof(struct dquot), sizeof(unsigned long) * 4, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_PANIC), NULL); order = 0; dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order); if (!dquot_hash) panic("Cannot create dquot hash table"); ret = percpu_counter_init_many(dqstats.counter, 0, GFP_KERNEL, _DQST_DQSTAT_LAST); if (ret) panic("Cannot create dquot stat counters"); /* Find power-of-two hlist_heads which can fit into allocation */ nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); dq_hash_bits = ilog2(nr_hash); nr_hash = 1UL << dq_hash_bits; dq_hash_mask = nr_hash - 1; for (i = 0; i < nr_hash; i++) INIT_HLIST_HEAD(dquot_hash + i); pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); dqcache_shrinker = shrinker_alloc(0, "dquota-cache"); if (!dqcache_shrinker) panic("Cannot allocate dquot shrinker"); dqcache_shrinker->count_objects = dqcache_shrink_count; dqcache_shrinker->scan_objects = dqcache_shrink_scan; shrinker_register(dqcache_shrinker); return 0; } fs_initcall(dquot_init);
293 295 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MM_PERCPU_INTERNAL_H #define _MM_PERCPU_INTERNAL_H #include <linux/types.h> #include <linux/percpu.h> #include <linux/memcontrol.h> /* * pcpu_block_md is the metadata block struct. * Each chunk's bitmap is split into a number of full blocks. * All units are in terms of bits. * * The scan hint is the largest known contiguous area before the contig hint. * It is not necessarily the actual largest contig hint though. There is an * invariant that the scan_hint_start > contig_hint_start iff * scan_hint == contig_hint. This is necessary because when scanning forward, * we don't know if a new contig hint would be better than the current one. */ struct pcpu_block_md { int scan_hint; /* scan hint for block */ int scan_hint_start; /* block relative starting position of the scan hint */ int contig_hint; /* contig hint for block */ int contig_hint_start; /* block relative starting position of the contig hint */ int left_free; /* size of free space along the left side of the block */ int right_free; /* size of free space along the right side of the block */ int first_free; /* block position of first free */ int nr_bits; /* total bits responsible for */ }; struct pcpuobj_ext { #ifdef CONFIG_MEMCG struct obj_cgroup *cgroup; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING union codetag_ref tag; #endif }; #if defined(CONFIG_MEMCG) || defined(CONFIG_MEM_ALLOC_PROFILING) #define NEED_PCPUOBJ_EXT #endif struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ size_t max_alloc_size; /* largest allocation size */ #endif struct list_head list; /* linked to pcpu_slot lists */ int free_bytes; /* free bytes in the chunk */ struct pcpu_block_md chunk_md; unsigned long *bound_map; /* boundary map */ /* * base_addr is the base address of this chunk. * To reduce false sharing, current layout is optimized to make sure * base_addr locate in the different cacheline with free_bytes and * chunk_md. */ void *base_addr ____cacheline_aligned_in_smp; unsigned long *alloc_map; /* allocation map */ struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ bool immutable; /* no [de]population allowed */ bool isolated; /* isolated from active chunk slots */ int start_offset; /* the overlap with the previous region to have a page aligned base_addr */ int end_offset; /* additional area required to have the region end page aligned */ #ifdef NEED_PCPUOBJ_EXT struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ #endif int nr_pages; /* # of pages served by this chunk */ int nr_populated; /* # of populated pages */ int nr_empty_pop_pages; /* # of empty populated pages */ unsigned long populated[]; /* populated bitmap */ }; static inline bool need_pcpuobj_ext(void) { if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING)) return true; if (!mem_cgroup_kmem_disabled()) return true; return false; } extern spinlock_t pcpu_lock; extern struct list_head *pcpu_chunk_lists; extern int pcpu_nr_slots; extern int pcpu_sidelined_slot; extern int pcpu_to_depopulate_slot; extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; /** * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bitmap blocks used. */ static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk) { return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE; } /** * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap * @pages: number of physical pages * * This conversion is from physical pages to the number of bits * required in the bitmap. */ static inline int pcpu_nr_pages_to_map_bits(int pages) { return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; } /** * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bits in the bitmap. */ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) { return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } /** * pcpu_obj_full_size - helper to calculate size of each accounted object * @size: size of area to allocate in bytes * * For each accounted object there is an extra space which is used to store * obj_cgroup membership if kmemcg is not disabled. Charge it too. */ static inline size_t pcpu_obj_full_size(size_t size) { size_t extra_size = 0; #ifdef CONFIG_MEMCG if (!mem_cgroup_kmem_disabled()) extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); #endif return size * num_possible_cpus() + extra_size; } #ifdef CONFIG_PERCPU_STATS #include <linux/spinlock.h> struct percpu_stats { u64 nr_alloc; /* lifetime # of allocations */ u64 nr_dealloc; /* lifetime # of deallocations */ u64 nr_cur_alloc; /* current # of allocations */ u64 nr_max_alloc; /* max # of live allocations */ u32 nr_chunks; /* current # of live chunks */ u32 nr_max_chunks; /* max # of live chunks */ size_t min_alloc_size; /* min allocation size */ size_t max_alloc_size; /* max allocation size */ }; extern struct percpu_stats pcpu_stats; extern struct pcpu_alloc_info pcpu_stats_ai; /* * For debug purposes. We don't care about the flexible array. */ static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info)); /* initialize min_alloc_size to unit_size */ pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size; } /* * pcpu_stats_area_alloc - increment area allocation stats * @chunk: the location of the area being allocated * @size: size of area to allocate in bytes * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_alloc++; pcpu_stats.nr_cur_alloc++; pcpu_stats.nr_max_alloc = max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc); pcpu_stats.min_alloc_size = min(pcpu_stats.min_alloc_size, size); pcpu_stats.max_alloc_size = max(pcpu_stats.max_alloc_size, size); chunk->nr_alloc++; chunk->max_alloc_size = max(chunk->max_alloc_size, size); } /* * pcpu_stats_area_dealloc - decrement allocation stats * @chunk: the location of the area being deallocated * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_dealloc++; pcpu_stats.nr_cur_alloc--; chunk->nr_alloc--; } /* * pcpu_stats_chunk_alloc - increment chunk stats */ static inline void pcpu_stats_chunk_alloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks++; pcpu_stats.nr_max_chunks = max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks); spin_unlock_irqrestore(&pcpu_lock, flags); } /* * pcpu_stats_chunk_dealloc - decrement chunk stats */ static inline void pcpu_stats_chunk_dealloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks--; spin_unlock_irqrestore(&pcpu_lock, flags); } #else static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { } static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { } static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { } static inline void pcpu_stats_chunk_alloc(void) { } static inline void pcpu_stats_chunk_dealloc(void) { } #endif /* !CONFIG_PERCPU_STATS */ #endif
51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/pagevec.h * * In many places it is efficient to batch an operation up against multiple * folios. A folio_batch is a container which is used for that. */ #ifndef _LINUX_PAGEVEC_H #define _LINUX_PAGEVEC_H #include <linux/types.h> /* 31 pointers + header align the folio_batch structure to a power of two */ #define PAGEVEC_SIZE 31 struct folio; /** * struct folio_batch - A collection of folios. * * The folio_batch is used to amortise the cost of retrieving and * operating on a set of folios. The order of folios in the batch may be * significant (eg delete_from_page_cache_batch()). Some users of the * folio_batch store "exceptional" entries in it which can be removed * by calling folio_batch_remove_exceptionals(). */ struct folio_batch { unsigned char nr; unsigned char i; bool percpu_pvec_drained; struct folio *folios[PAGEVEC_SIZE]; }; /** * folio_batch_init() - Initialise a batch of folios * @fbatch: The folio batch. * * A freshly initialised folio_batch contains zero folios. */ static inline void folio_batch_init(struct folio_batch *fbatch) { fbatch->nr = 0; fbatch->i = 0; fbatch->percpu_pvec_drained = false; } static inline void folio_batch_reinit(struct folio_batch *fbatch) { fbatch->nr = 0; fbatch->i = 0; } static inline unsigned int folio_batch_count(struct folio_batch *fbatch) { return fbatch->nr; } static inline unsigned int folio_batch_space(struct folio_batch *fbatch) { return PAGEVEC_SIZE - fbatch->nr; } /** * folio_batch_add() - Add a folio to a batch. * @fbatch: The folio batch. * @folio: The folio to add. * * The folio is added to the end of the batch. * The batch must have previously been initialised using folio_batch_init(). * * Return: The number of slots still available. */ static inline unsigned folio_batch_add(struct folio_batch *fbatch, struct folio *folio) { fbatch->folios[fbatch->nr++] = folio; return folio_batch_space(fbatch); } /** * folio_batch_next - Return the next folio to process. * @fbatch: The folio batch being processed. * * Use this function to implement a queue of folios. * * Return: The next folio in the queue, or NULL if the queue is empty. */ static inline struct folio *folio_batch_next(struct folio_batch *fbatch) { if (fbatch->i == fbatch->nr) return NULL; return fbatch->folios[fbatch->i++]; } void __folio_batch_release(struct folio_batch *pvec); static inline void folio_batch_release(struct folio_batch *fbatch) { if (folio_batch_count(fbatch)) __folio_batch_release(fbatch); } void folio_batch_remove_exceptionals(struct folio_batch *fbatch); #endif /* _LINUX_PAGEVEC_H */
1040 1039 65 1039 16 16 11 16 16 16 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 // SPDX-License-Identifier: GPL-2.0 /* * Fast batching percpu counters. */ #include <linux/percpu_counter.h> #include <linux/mutex.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/debugobjects.h> #ifdef CONFIG_HOTPLUG_CPU static LIST_HEAD(percpu_counters); static DEFINE_SPINLOCK(percpu_counters_lock); #endif #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER static const struct debug_obj_descr percpu_counter_debug_descr; static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state) { struct percpu_counter *fbc = addr; switch (state) { case ODEBUG_STATE_ACTIVE: percpu_counter_destroy(fbc); debug_object_free(fbc, &percpu_counter_debug_descr); return true; default: return false; } } static const struct debug_obj_descr percpu_counter_debug_descr = { .name = "percpu_counter", .fixup_free = percpu_counter_fixup_free, }; static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { debug_object_init(fbc, &percpu_counter_debug_descr); debug_object_activate(fbc, &percpu_counter_debug_descr); } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { debug_object_deactivate(fbc, &percpu_counter_debug_descr); debug_object_free(fbc, &percpu_counter_debug_descr); } #else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { } #endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_set); /* * Add to a counter while respecting batch size. * * There are 2 implementations, both dealing with the following problem: * * The decision slow path/fast path and the actual update must be atomic. * Otherwise a call in process context could check the current values and * decide that the fast path can be used. If now an interrupt occurs before * the this_cpu_add(), and the interrupt updates this_cpu(*fbc->counters), * then the this_cpu_add() that is executed after the interrupt has completed * can produce values larger than "batch" or even overflows. */ #ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * Safety against interrupts is achieved in 2 ways: * 1. the fast path uses local cmpxchg (note: no lock prefix) * 2. the slow path operates with interrupts disabled */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; unsigned long flags; count = this_cpu_read(*fbc->counters); do { if (unlikely(abs(count + amount) >= batch)) { raw_spin_lock_irqsave(&fbc->lock, flags); /* * Note: by now we might have migrated to another CPU * or the value might have changed. */ count = __this_cpu_read(*fbc->counters); fbc->count += count + amount; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); return; } } while (!this_cpu_try_cmpxchg(*fbc->counters, &count, count + amount)); } #else /* * local_irq_save() is used to make the function irq safe: * - The slow path would be ok as protected by an irq-safe spinlock. * - this_cpu_add would be ok as it is irq-safe by definition. */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; unsigned long flags; local_irq_save(flags); count = __this_cpu_read(*fbc->counters) + amount; if (abs(count) >= batch) { raw_spin_lock(&fbc->lock); fbc->count += count; __this_cpu_sub(*fbc->counters, count - amount); raw_spin_unlock(&fbc->lock); } else { this_cpu_add(*fbc->counters, amount); } local_irq_restore(flags); } #endif EXPORT_SYMBOL(percpu_counter_add_batch); /* * For percpu_counter with a big batch, the devication of its count could * be big, and there is requirement to reduce the deviation, like when the * counter's batch could be runtime decreased to get a better accuracy, * which can be achieved by running this sync function on each CPU. */ void percpu_counter_sync(struct percpu_counter *fbc) { unsigned long flags; s64 count; raw_spin_lock_irqsave(&fbc->lock, flags); count = __this_cpu_read(*fbc->counters); fbc->count += count; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_sync); /* * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive(). * * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums * from CPUs that are in the process of being taken offline. Dying cpus have * been removed from the online mask, but may not have had the hotplug dead * notifier called to fold the percpu count back into the global counter sum. * By including dying CPUs in the iteration mask, we avoid this race condition * so __percpu_counter_sum() just does the right thing when CPUs are being taken * offline. */ s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp, u32 nr_counters, struct lock_class_key *key) { unsigned long flags __maybe_unused; size_t counter_size; s32 __percpu *counters; u32 i; counter_size = ALIGN(sizeof(*counters), __alignof__(*counters)); counters = __alloc_percpu_gfp(nr_counters * counter_size, __alignof__(*counters), gfp); if (!counters) { fbc[0].counters = NULL; return -ENOMEM; } for (i = 0; i < nr_counters; i++) { raw_spin_lock_init(&fbc[i].lock); lockdep_set_class(&fbc[i].lock, key); #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc[i].list); #endif fbc[i].count = amount; fbc[i].counters = (void __percpu *)counters + i * counter_size; debug_percpu_counter_activate(&fbc[i]); } #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); for (i = 0; i < nr_counters; i++) list_add(&fbc[i].list, &percpu_counters); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif return 0; } EXPORT_SYMBOL(__percpu_counter_init_many); void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters) { unsigned long flags __maybe_unused; u32 i; if (WARN_ON_ONCE(!fbc)) return; if (!fbc[0].counters) return; for (i = 0; i < nr_counters; i++) debug_percpu_counter_deactivate(&fbc[i]); #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); for (i = 0; i < nr_counters; i++) list_del(&fbc[i].list); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif free_percpu(fbc[0].counters); for (i = 0; i < nr_counters; i++) fbc[i].counters = NULL; } EXPORT_SYMBOL(percpu_counter_destroy_many); int percpu_counter_batch __read_mostly = 32; EXPORT_SYMBOL(percpu_counter_batch); static int compute_batch_value(unsigned int cpu) { int nr = num_online_cpus(); percpu_counter_batch = max(32, nr*2); return 0; } static int percpu_counter_cpu_dead(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU struct percpu_counter *fbc; compute_batch_value(cpu); spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; raw_spin_lock(&fbc->lock); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; raw_spin_unlock(&fbc->lock); } spin_unlock_irq(&percpu_counters_lock); #endif return 0; } /* * Compare counter against given value. * Return 1 if greater, 0 if equal and -1 if less */ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { s64 count; count = percpu_counter_read(fbc); /* Check to see if rough count will be sufficient for comparison */ if (abs(count - rhs) > (batch * num_online_cpus())) { if (count > rhs) return 1; else return -1; } /* Need to use precise count */ count = percpu_counter_sum(fbc); if (count > rhs) return 1; else if (count < rhs) return -1; else return 0; } EXPORT_SYMBOL(__percpu_counter_compare); /* * Compare counter, and add amount if total is: less than or equal to limit if * amount is positive, or greater than or equal to limit if amount is negative. * Return true if amount is added, or false if total would be beyond the limit. * * Negative limit is allowed, but unusual. * When negative amounts (subs) are given to percpu_counter_limited_add(), * the limit would most naturally be 0 - but other limits are also allowed. * * Overflow beyond S64_MAX is not allowed for: counter, limit and amount * are all assumed to be sane (far from S64_MIN and S64_MAX). */ bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount, s32 batch) { s64 count; s64 unknown; unsigned long flags; bool good = false; if (amount == 0) return true; local_irq_save(flags); unknown = batch * num_online_cpus(); count = __this_cpu_read(*fbc->counters); /* Skip taking the lock when safe */ if (abs(count + amount) <= batch && ((amount > 0 && fbc->count + unknown <= limit) || (amount < 0 && fbc->count - unknown >= limit))) { this_cpu_add(*fbc->counters, amount); local_irq_restore(flags); return true; } raw_spin_lock(&fbc->lock); count = fbc->count + amount; /* Skip percpu_counter_sum() when safe */ if (amount > 0) { if (count - unknown > limit) goto out; if (count + unknown <= limit) good = true; } else { if (count + unknown < limit) goto out; if (count - unknown >= limit) good = true; } if (!good) { s32 *pcount; int cpu; for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { pcount = per_cpu_ptr(fbc->counters, cpu); count += *pcount; } if (amount > 0) { if (count > limit) goto out; } else { if (count < limit) goto out; } good = true; } count = __this_cpu_read(*fbc->counters); fbc->count += count + amount; __this_cpu_sub(*fbc->counters, count); out: raw_spin_unlock(&fbc->lock); local_irq_restore(flags); return good; } static int __init percpu_counter_startup(void) { int ret; ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online", compute_batch_value, NULL); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD, "lib/percpu_cnt:dead", NULL, percpu_counter_cpu_dead); WARN_ON(ret < 0); return 0; } module_init(percpu_counter_startup);
152 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/ptrace.h * * Copyright (C) 1996-2003 Russell King * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_PTRACE_H #define __ASM_PTRACE_H #include <asm/cpufeature.h> #include <uapi/asm/ptrace.h> /* Current Exception Level values, as contained in CurrentEL */ #define CurrentEL_EL1 (1 << 2) #define CurrentEL_EL2 (2 << 2) #define INIT_PSTATE_EL1 \ (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL1h) #define INIT_PSTATE_EL2 \ (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL2h) #include <linux/irqchip/arm-gic-v3-prio.h> #define GIC_PRIO_IRQON GICV3_PRIO_UNMASKED #define GIC_PRIO_IRQOFF GICV3_PRIO_IRQ #define GIC_PRIO_PSR_I_SET GICV3_PRIO_PSR_I_SET /* Additional SPSR bits not exposed in the UABI */ #define PSR_MODE_THREAD_BIT (1 << 0) #define PSR_IL_BIT (1 << 20) /* AArch32-specific ptrace requests */ #define COMPAT_PTRACE_GETREGS 12 #define COMPAT_PTRACE_SETREGS 13 #define COMPAT_PTRACE_GET_THREAD_AREA 22 #define COMPAT_PTRACE_SET_SYSCALL 23 #define COMPAT_PTRACE_GETVFPREGS 27 #define COMPAT_PTRACE_SETVFPREGS 28 #define COMPAT_PTRACE_GETHBPREGS 29 #define COMPAT_PTRACE_SETHBPREGS 30 /* SPSR_ELx bits for exceptions taken from AArch32 */ #define PSR_AA32_MODE_MASK 0x0000001f #define PSR_AA32_MODE_USR 0x00000010 #define PSR_AA32_MODE_FIQ 0x00000011 #define PSR_AA32_MODE_IRQ 0x00000012 #define PSR_AA32_MODE_SVC 0x00000013 #define PSR_AA32_MODE_ABT 0x00000017 #define PSR_AA32_MODE_HYP 0x0000001a #define PSR_AA32_MODE_UND 0x0000001b #define PSR_AA32_MODE_SYS 0x0000001f #define PSR_AA32_T_BIT 0x00000020 #define PSR_AA32_F_BIT 0x00000040 #define PSR_AA32_I_BIT 0x00000080 #define PSR_AA32_A_BIT 0x00000100 #define PSR_AA32_E_BIT 0x00000200 #define PSR_AA32_PAN_BIT 0x00400000 #define PSR_AA32_SSBS_BIT 0x00800000 #define PSR_AA32_DIT_BIT 0x01000000 #define PSR_AA32_Q_BIT 0x08000000 #define PSR_AA32_V_BIT 0x10000000 #define PSR_AA32_C_BIT 0x20000000 #define PSR_AA32_Z_BIT 0x40000000 #define PSR_AA32_N_BIT 0x80000000 #define PSR_AA32_IT_MASK 0x0600fc00 /* If-Then execution state mask */ #define PSR_AA32_GE_MASK 0x000f0000 #ifdef CONFIG_CPU_BIG_ENDIAN #define PSR_AA32_ENDSTATE PSR_AA32_E_BIT #else #define PSR_AA32_ENDSTATE 0 #endif /* AArch32 CPSR bits, as seen in AArch32 */ #define COMPAT_PSR_DIT_BIT 0x00200000 /* * These are 'magic' values for PTRACE_PEEKUSR that return info about where a * process is located in memory. */ #define COMPAT_PT_TEXT_ADDR 0x10000 #define COMPAT_PT_DATA_ADDR 0x10004 #define COMPAT_PT_TEXT_END_ADDR 0x10008 /* * If pt_regs.syscallno == NO_SYSCALL, then the thread is not executing * a syscall -- i.e., its most recent entry into the kernel from * userspace was not via SVC, or otherwise a tracer cancelled the syscall. * * This must have the value -1, for ABI compatibility with ptrace etc. */ #define NO_SYSCALL (-1) #ifndef __ASSEMBLY__ #include <linux/bug.h> #include <linux/types.h> #include <asm/stacktrace/frame.h> /* sizeof(struct user) for AArch32 */ #define COMPAT_USER_SZ 296 /* Architecturally defined mapping between AArch32 and AArch64 registers */ #define compat_usr(x) regs[(x)] #define compat_fp regs[11] #define compat_sp regs[13] #define compat_lr regs[14] #define compat_sp_hyp regs[15] #define compat_lr_irq regs[16] #define compat_sp_irq regs[17] #define compat_lr_svc regs[18] #define compat_sp_svc regs[19] #define compat_lr_abt regs[20] #define compat_sp_abt regs[21] #define compat_lr_und regs[22] #define compat_sp_und regs[23] #define compat_r8_fiq regs[24] #define compat_r9_fiq regs[25] #define compat_r10_fiq regs[26] #define compat_r11_fiq regs[27] #define compat_r12_fiq regs[28] #define compat_sp_fiq regs[29] #define compat_lr_fiq regs[30] static inline unsigned long compat_psr_to_pstate(const unsigned long psr) { unsigned long pstate; pstate = psr & ~COMPAT_PSR_DIT_BIT; if (psr & COMPAT_PSR_DIT_BIT) pstate |= PSR_AA32_DIT_BIT; return pstate; } static inline unsigned long pstate_to_compat_psr(const unsigned long pstate) { unsigned long psr; psr = pstate & ~PSR_AA32_DIT_BIT; if (pstate & PSR_AA32_DIT_BIT) psr |= COMPAT_PSR_DIT_BIT; return psr; } /* * This struct defines the way the registers are stored on the stack during an * exception. struct user_pt_regs must form a prefix of struct pt_regs. */ struct pt_regs { union { struct user_pt_regs user_regs; struct { u64 regs[31]; u64 sp; u64 pc; u64 pstate; }; }; u64 orig_x0; s32 syscallno; u32 pmr; u64 sdei_ttbr1; struct frame_record_meta stackframe; /* Only valid for some EL1 exceptions. */ u64 lockdep_hardirqs; u64 exit_rcu; }; /* For correct stack alignment, pt_regs has to be a multiple of 16 bytes. */ static_assert(IS_ALIGNED(sizeof(struct pt_regs), 16)); static inline bool in_syscall(struct pt_regs const *regs) { return regs->syscallno != NO_SYSCALL; } static inline void forget_syscall(struct pt_regs *regs) { regs->syscallno = NO_SYSCALL; } #define MAX_REG_OFFSET offsetof(struct pt_regs, pstate) #define arch_has_single_step() (1) #ifdef CONFIG_COMPAT #define compat_thumb_mode(regs) \ (((regs)->pstate & PSR_AA32_T_BIT)) #else #define compat_thumb_mode(regs) (0) #endif #define user_mode(regs) \ (((regs)->pstate & PSR_MODE_MASK) == PSR_MODE_EL0t) #define compat_user_mode(regs) \ (((regs)->pstate & (PSR_MODE32_BIT | PSR_MODE_MASK)) == \ (PSR_MODE32_BIT | PSR_MODE_EL0t)) #define processor_mode(regs) \ ((regs)->pstate & PSR_MODE_MASK) #define irqs_priority_unmasked(regs) \ (system_uses_irq_prio_masking() ? \ (regs)->pmr == GIC_PRIO_IRQON : \ true) #define interrupts_enabled(regs) \ (!((regs)->pstate & PSR_I_BIT) && irqs_priority_unmasked(regs)) #define fast_interrupts_enabled(regs) \ (!((regs)->pstate & PSR_F_BIT)) static inline unsigned long user_stack_pointer(struct pt_regs *regs) { if (compat_user_mode(regs)) return regs->compat_sp; return regs->sp; } extern int regs_query_register_offset(const char *name); extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); /** * regs_get_register() - get register value from its offset * @regs: pt_regs from which register value is gotten * @offset: offset of the register. * * regs_get_register returns the value of a register whose offset from @regs. * The @offset is the offset of the register in struct pt_regs. * If @offset is bigger than MAX_REG_OFFSET, this returns 0. */ static inline u64 regs_get_register(struct pt_regs *regs, unsigned int offset) { u64 val = 0; WARN_ON(offset & 7); offset >>= 3; switch (offset) { case 0 ... 30: val = regs->regs[offset]; break; case offsetof(struct pt_regs, sp) >> 3: val = regs->sp; break; case offsetof(struct pt_regs, pc) >> 3: val = regs->pc; break; case offsetof(struct pt_regs, pstate) >> 3: val = regs->pstate; break; default: val = 0; } return val; } /* * Read a register given an architectural register index r. * This handles the common case where 31 means XZR, not SP. */ static inline unsigned long pt_regs_read_reg(const struct pt_regs *regs, int r) { return (r == 31) ? 0 : regs->regs[r]; } /* * Write a register given an architectural register index r. * This handles the common case where 31 means XZR, not SP. */ static inline void pt_regs_write_reg(struct pt_regs *regs, int r, unsigned long val) { if (r != 31) regs->regs[r] = val; } /* Valid only for Kernel mode traps. */ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->sp; } static inline unsigned long regs_return_value(struct pt_regs *regs) { unsigned long val = regs->regs[0]; /* * Audit currently uses regs_return_value() instead of * syscall_get_return_value(). Apply the same sign-extension here until * audit is updated to use syscall_get_return_value(). */ if (compat_user_mode(regs)) val = sign_extend64(val, 31); return val; } static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) { regs->regs[0] = rc; } /** * regs_get_kernel_argument() - get Nth function argument in kernel * @regs: pt_regs of that context * @n: function argument number (start from 0) * * regs_get_argument() returns @n th argument of the function call. * * Note that this chooses the most likely register mapping. In very rare * cases this may not return correct data, for example, if one of the * function parameters is 16 bytes or bigger. In such cases, we cannot * get access the parameter correctly and the register assignment of * subsequent parameters will be shifted. */ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, unsigned int n) { #define NR_REG_ARGUMENTS 8 if (n < NR_REG_ARGUMENTS) return pt_regs_read_reg(regs, n); return 0; } /* We must avoid circular header include via sched.h */ struct task_struct; int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task); static inline unsigned long instruction_pointer(struct pt_regs *regs) { return regs->pc; } static inline void instruction_pointer_set(struct pt_regs *regs, unsigned long val) { regs->pc = val; } static inline unsigned long frame_pointer(struct pt_regs *regs) { return regs->regs[29]; } #define procedure_link_pointer(regs) ((regs)->regs[30]) static inline void procedure_link_pointer_set(struct pt_regs *regs, unsigned long val) { procedure_link_pointer(regs) = val; } extern unsigned long profile_pc(struct pt_regs *regs); #endif /* __ASSEMBLY__ */ #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the IP protocol. * * Version: @(#)ip.h 1.0.2 04/28/93 * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IP_H #define _LINUX_IP_H #include <linux/skbuff.h> #include <uapi/linux/ip.h> static inline struct iphdr *ip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_network_header(skb); } static inline struct iphdr *inner_ip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_inner_network_header(skb); } static inline struct iphdr *ipip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_transport_header(skb); } static inline unsigned int ip_transport_len(const struct sk_buff *skb) { return ntohs(ip_hdr(skb)->tot_len) - skb_network_header_len(skb); } static inline unsigned int iph_totlen(const struct sk_buff *skb, const struct iphdr *iph) { u32 len = ntohs(iph->tot_len); return (len || !skb_is_gso(skb) || !skb_is_gso_tcp(skb)) ? len : skb->len - skb_network_offset(skb); } static inline unsigned int skb_ip_totlen(const struct sk_buff *skb) { return iph_totlen(skb, ip_hdr(skb)); } /* IPv4 datagram length is stored into 16bit field (tot_len) */ #define IP_MAX_MTU 0xFFFFU static inline void iph_set_totlen(struct iphdr *iph, unsigned int len) { iph->tot_len = len <= IP_MAX_MTU ? htons(len) : 0; } #endif /* _LINUX_IP_H */
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 // SPDX-License-Identifier: GPL-2.0 #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/xarray.h> #include <net/busy_poll.h> #include <net/net_debug.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/page_pool/types.h> #include <net/page_pool/memory_provider.h> #include <net/sock.h> #include "page_pool_priv.h" #include "netdev-genl-gen.h" static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); /* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev, * pool->user. * Ordering: inside rtnl_lock */ DEFINE_MUTEX(page_pools_lock); /* Page pools are only reachable from user space (via netlink) if they are * linked to a netdev at creation time. Following page pool "visibility" * states are possible: * - normal * - user.list: linked to real netdev, netdev: real netdev * - orphaned - real netdev has disappeared * - user.list: linked to lo, netdev: lo * - invisible - either (a) created without netdev linking, (b) unlisted due * to error, or (c) the entire namespace which owned this pool disappeared * - user.list: unhashed, netdev: unknown */ typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info); static int netdev_nl_page_pool_get_do(struct genl_info *info, u32 id, pp_nl_fill_cb fill) { struct page_pool *pool; struct sk_buff *rsp; int err; mutex_lock(&page_pools_lock); pool = xa_load(&page_pools, id); if (!pool || hlist_unhashed(&pool->user.list) || !net_eq(dev_net(pool->slow.netdev), genl_info_net(info))) { err = -ENOENT; goto err_unlock; } rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) { err = -ENOMEM; goto err_unlock; } err = fill(rsp, pool, info); if (err) goto err_free_msg; mutex_unlock(&page_pools_lock); return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); err_unlock: mutex_unlock(&page_pools_lock); return err; } struct page_pool_dump_cb { unsigned long ifindex; u32 pp_id; }; static int netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb, pp_nl_fill_cb fill) { struct page_pool_dump_cb *state = (void *)cb->ctx; const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; struct page_pool *pool; int err = 0; rtnl_lock(); mutex_lock(&page_pools_lock); for_each_netdev_dump(net, netdev, state->ifindex) { hlist_for_each_entry(pool, &netdev->page_pools, user.list) { if (state->pp_id && state->pp_id < pool->user.id) continue; state->pp_id = pool->user.id; err = fill(skb, pool, info); if (err) goto out; } state->pp_id = 0; } out: mutex_unlock(&page_pools_lock); rtnl_unlock(); return err; } static int page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { #ifdef CONFIG_PAGE_POOL_STATS struct page_pool_stats stats = {}; struct nlattr *nest; void *hdr; if (!page_pool_get_stats(pool, &stats)) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; nest = nla_nest_start(rsp, NETDEV_A_PAGE_POOL_STATS_INFO); if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id) || (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, pool->slow.netdev->ifindex))) goto err_cancel_nest; nla_nest_end(rsp, nest); if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST, stats.alloc_stats.fast) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW, stats.alloc_stats.slow) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER, stats.alloc_stats.slow_high_order) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY, stats.alloc_stats.empty) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL, stats.alloc_stats.refill) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE, stats.alloc_stats.waive) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED, stats.recycle_stats.cached) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL, stats.recycle_stats.cache_full) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING, stats.recycle_stats.ring) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL, stats.recycle_stats.ring_full) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT, stats.recycle_stats.released_refcnt)) goto err_cancel_msg; genlmsg_end(rsp, hdr); return 0; err_cancel_nest: nla_nest_cancel(rsp, nest); err_cancel_msg: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; #else GENL_SET_ERR_MSG(info, "kernel built without CONFIG_PAGE_POOL_STATS"); return -EOPNOTSUPP; #endif } int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ARRAY_SIZE(netdev_page_pool_info_nl_policy)]; struct nlattr *nest; int err; u32 id; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_STATS_INFO)) return -EINVAL; nest = info->attrs[NETDEV_A_PAGE_POOL_STATS_INFO]; err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, netdev_page_pool_info_nl_policy, info->extack); if (err) return err; if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, NETDEV_A_PAGE_POOL_ID)) return -EINVAL; if (tb[NETDEV_A_PAGE_POOL_IFINDEX]) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NETDEV_A_PAGE_POOL_IFINDEX], "selecting by ifindex not supported"); return -EINVAL; } id = nla_get_uint(tb[NETDEV_A_PAGE_POOL_ID]); return netdev_nl_page_pool_get_do(info, id, page_pool_nl_stats_fill); } int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill); } static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { size_t inflight, refsz; unsigned int napi_id; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id)) goto err_cancel; if (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, pool->slow.netdev->ifindex)) goto err_cancel; napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0; if (napi_id_valid(napi_id) && nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id)) goto err_cancel; inflight = page_pool_inflight(pool, false); refsz = PAGE_SIZE << pool->p.order; if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT, inflight) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, inflight * refsz)) goto err_cancel; if (pool->user.detach_time && nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME, pool->user.detach_time)) goto err_cancel; if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) goto err_cancel; genlmsg_end(rsp, hdr); return 0; err_cancel: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd) { struct genl_info info; struct sk_buff *ntf; struct net *net; lockdep_assert_held(&page_pools_lock); /* 'invisible' page pools don't matter */ if (hlist_unhashed(&pool->user.list)) return; net = dev_net(pool->slow.netdev); if (!genl_has_listeners(&netdev_nl_family, net, NETDEV_NLGRP_PAGE_POOL)) return; genl_info_init_ntf(&info, &netdev_nl_family, cmd); ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; if (page_pool_nl_fill(ntf, pool, &info)) { nlmsg_free(ntf); return; } genlmsg_multicast_netns(&netdev_nl_family, net, ntf, 0, NETDEV_NLGRP_PAGE_POOL, GFP_KERNEL); } int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info) { u32 id; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_ID)) return -EINVAL; id = nla_get_uint(info->attrs[NETDEV_A_PAGE_POOL_ID]); return netdev_nl_page_pool_get_do(info, id, page_pool_nl_fill); } int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_fill); } int page_pool_list(struct page_pool *pool) { static u32 id_alloc_next; int err; mutex_lock(&page_pools_lock); err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b, &id_alloc_next, GFP_KERNEL); if (err < 0) goto err_unlock; INIT_HLIST_NODE(&pool->user.list); if (pool->slow.netdev) { hlist_add_head(&pool->user.list, &pool->slow.netdev->page_pools); netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); } mutex_unlock(&page_pools_lock); return 0; err_unlock: mutex_unlock(&page_pools_lock); return err; } void page_pool_detached(struct page_pool *pool) { mutex_lock(&page_pools_lock); pool->user.detach_time = ktime_get_boottime_seconds(); netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); mutex_unlock(&page_pools_lock); } void page_pool_unlist(struct page_pool *pool) { mutex_lock(&page_pools_lock); netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF); xa_erase(&page_pools, pool->user.id); if (!hlist_unhashed(&pool->user.list)) hlist_del(&pool->user.list); mutex_unlock(&page_pools_lock); } int page_pool_check_memory_provider(struct net_device *dev, struct netdev_rx_queue *rxq) { void *binding = rxq->mp_params.mp_priv; struct page_pool *pool; struct hlist_node *n; if (!binding) return 0; mutex_lock(&page_pools_lock); hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) { if (pool->mp_priv != binding) continue; if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) { mutex_unlock(&page_pools_lock); return 0; } } mutex_unlock(&page_pools_lock); return -ENODATA; } static void page_pool_unreg_netdev_wipe(struct net_device *netdev) { struct page_pool *pool; struct hlist_node *n; mutex_lock(&page_pools_lock); hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) { hlist_del_init(&pool->user.list); pool->slow.netdev = NET_PTR_POISON; } mutex_unlock(&page_pools_lock); } static void page_pool_unreg_netdev(struct net_device *netdev) { struct page_pool *pool, *last; struct net_device *lo; lo = dev_net(netdev)->loopback_dev; mutex_lock(&page_pools_lock); last = NULL; hlist_for_each_entry(pool, &netdev->page_pools, user.list) { pool->slow.netdev = lo; netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); last = pool; } if (last) hlist_splice_init(&netdev->page_pools, &last->user.list, &lo->page_pools); mutex_unlock(&page_pools_lock); } static int page_pool_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; if (hlist_empty(&netdev->page_pools)) return NOTIFY_OK; if (netdev->ifindex != LOOPBACK_IFINDEX) page_pool_unreg_netdev(netdev); else page_pool_unreg_netdev_wipe(netdev); return NOTIFY_OK; } static struct notifier_block page_pool_netdevice_nb = { .notifier_call = page_pool_netdevice_event, }; static int __init page_pool_user_init(void) { return register_netdevice_notifier(&page_pool_netdevice_nb); } subsys_initcall(page_pool_user_init);
398 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SWAP_H #define _LINUX_SWAP_H #include <linux/spinlock.h> #include <linux/linkage.h> #include <linux/mmzone.h> #include <linux/list.h> #include <linux/memcontrol.h> #include <linux/sched.h> #include <linux/node.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/page-flags.h> #include <uapi/linux/mempolicy.h> #include <asm/page.h> struct notifier_block; struct bio; struct pagevec; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ SWAP_FLAG_DISCARD_PAGES) #define SWAP_BATCH 64 static inline int current_is_kswapd(void) { return current->flags & PF_KSWAPD; } /* * MAX_SWAPFILES defines the maximum number of swaptypes: things which can * be swapped to. The swap type and the offset into that swap type are * encoded into pte's and into pgoff_t's in the swapcache. Using five bits * for the type means that the maximum number of swapcache pages is 27 bits * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs * the type/offset into the pte as 5/27 as well. */ #define MAX_SWAPFILES_SHIFT 5 /* * Use some of the swap files numbers for other purposes. This * is a convenient way to hook into the VM to trigger special * actions on faults. */ /* * PTE markers are used to persist information onto PTEs that otherwise * should be a none pte. As its name "PTE" hints, it should only be * applied to the leaves of pgtables. */ #define SWP_PTE_MARKER_NUM 1 #define SWP_PTE_MARKER (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ SWP_MIGRATION_NUM + SWP_DEVICE_NUM) /* * Unaddressable device memory support. See include/linux/hmm.h and * Documentation/mm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * * When a page is migrated from CPU to device, we set the CPU page table entry * to a special SWP_DEVICE_{READ|WRITE} entry. * * When a page is mapped by the device for exclusive access we set the CPU page * table entries to a special SWP_DEVICE_EXCLUSIVE entry. */ #ifdef CONFIG_DEVICE_PRIVATE #define SWP_DEVICE_NUM 3 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) #define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) #else #define SWP_DEVICE_NUM 0 #endif /* * Page migration support. * * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and * indicates that the referenced (part of) an anonymous page is exclusive to * a single process. For SWP_MIGRATION_WRITE, that information is implicit: * (part of) an anonymous page that are mapped writable are exclusive to a * single process. */ #ifdef CONFIG_MIGRATION #define SWP_MIGRATION_NUM 3 #define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM) #define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1) #define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2) #else #define SWP_MIGRATION_NUM 0 #endif /* * Handling of hardware poisoned pages with memory corruption. */ #ifdef CONFIG_MEMORY_FAILURE #define SWP_HWPOISON_NUM 1 #define SWP_HWPOISON MAX_SWAPFILES #else #define SWP_HWPOISON_NUM 0 #endif #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ SWP_PTE_MARKER_NUM) /* * Magic header for a swap area. The first part of the union is * what the swap magic looks like for the old (limited to 128MB) * swap area format, the second part of the union adds - in the * old reserved area - some extra information. Note that the first * kilobyte is reserved for boot loader or disk label stuff... * * Having the magic at the end of the PAGE_SIZE makes detecting swap * areas somewhat tricky on machines that support multiple page sizes. * For 2.5 we'll probably want to move the magic to just beyond the * bootbits... */ union swap_header { struct { char reserved[PAGE_SIZE - 10]; char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ } magic; struct { char bootbits[1024]; /* Space for disklabel etc. */ __u32 version; __u32 last_page; __u32 nr_badpages; unsigned char sws_uuid[16]; unsigned char sws_volume[16]; __u32 padding[117]; __u32 badpages[1]; } info; }; /* * current->reclaim_state points to one of these when a task is running * memory reclaim */ struct reclaim_state { /* pages reclaimed outside of LRU-based reclaim */ unsigned long reclaimed; #ifdef CONFIG_LRU_GEN /* per-thread mm walk data */ struct lru_gen_mm_walk *mm_walk; #endif }; /* * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based * reclaim * @pages: number of pages reclaimed * * If the current process is undergoing a reclaim operation, increment the * number of reclaimed pages by @pages. */ static inline void mm_account_reclaimed_pages(unsigned long pages) { if (current->reclaim_state) current->reclaim_state->reclaimed += pages; } #ifdef __KERNEL__ struct address_space; struct sysinfo; struct writeback_control; struct zone; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of * disk blocks. A rbtree of swap extents maps the entire swapfile (Where the * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart * from setup, they're handled identically. * * We always assume that blocks are of size PAGE_SIZE. */ struct swap_extent { struct rb_node rb_node; pgoff_t start_page; pgoff_t nr_pages; sector_t start_block; }; /* * Max bad pages in the new format.. */ #define MAX_SWAP_BADPAGES \ ((offsetof(union swap_header, magic.magic) - \ offsetof(union swap_header, info.badpages)) / sizeof(int)) enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ }; #define SWAP_CLUSTER_MAX 32UL #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* Bit flag in swap_map */ #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ #define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ /* Special value in first swap_map */ #define SWAP_MAP_MAX 0x3e /* Max count */ #define SWAP_MAP_BAD 0x3f /* Note page is bad */ #define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ /* Special value in each swap_map continuation */ #define SWAP_CONT_MAX 0x7f /* Max count */ /* * We use this to track usage of a cluster. A cluster is a block of swap disk * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All * free clusters are organized into a list. We fetch an entry from the list to * get a free cluster. * * The flags field determines if a cluster is free. This is * protected by cluster lock. */ struct swap_cluster_info { spinlock_t lock; /* * Protect swap_cluster_info fields * other than list, and swap_info_struct->swap_map * elements corresponding to the swap cluster. */ u16 count; u8 flags; u8 order; struct list_head list; }; /* All on-list cluster must have a non-zero flag. */ enum swap_cluster_flags { CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ CLUSTER_FLAG_FREE, CLUSTER_FLAG_NONFULL, CLUSTER_FLAG_FRAG, /* Clusters with flags above are allocatable */ CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, CLUSTER_FLAG_FULL, CLUSTER_FLAG_DISCARD, CLUSTER_FLAG_MAX, }; /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the * cluster to which it belongs being marked free. Therefore 0 is safe to use as * a sentinel to indicate an entry is not valid. */ #define SWAP_ENTRY_INVALID 0 #ifdef CONFIG_THP_SWAP #define SWAP_NR_ORDERS (PMD_ORDER + 1) #else #define SWAP_NR_ORDERS 1 #endif /* * We keep using same cluster for rotational device so IO will be sequential. * The purpose is to optimize SWAP throughput on these device. */ struct swap_sequential_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; /* * The in-memory structure used to track swap areas. */ struct swap_info_struct { struct percpu_ref users; /* indicate and keep swap device valid. */ unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ struct list_head full_clusters; /* full clusters list */ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like * swap_map, lowest_bit, highest_bit, * inuse_pages, cluster_next, * cluster_nr, lowest_alloc, * highest_alloc, free/discard cluster * list. other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If * both locks need hold, hold swap_lock * first. */ spinlock_t cont_lock; /* * protect swap count continuation page * list. */ struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ struct plist_node avail_lists[]; /* * entries in swap_avail_heads, one * entry per node. * Must be last as the number of the * array is nr_node_ids, which is not * a fixed value so have to allocate * dynamically. * And it has to be an array so that * plist_for_each_* can work. */ }; static inline swp_entry_t page_swap_entry(struct page *page) { struct folio *folio = page_folio(page); swp_entry_t entry = folio->swap; entry.val += folio_page_idx(folio, page); return entry; } /* linux/mm/workingset.c */ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool flush); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); void workingset_activation(struct folio *folio); /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; /* Definition of global_zone_page_state not available yet */ #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES) /* linux/mm/swap.c */ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated); void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) { return atomic_read(&lru_disable_count); } static inline void lru_cache_enable(void) { atomic_dec(&lru_disable_count); } extern void lru_cache_disable(void); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) #define MIN_SWAPPINESS 0 #define MAX_SWAPPINESS 200 /* Just recliam from anon folios in proactive memory reclaim */ #define SWAPPINESS_ANON_ONLY (MAX_SWAPPINESS + 1) extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options, int *swappiness); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, unsigned long *nr_scanned); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_NUMA extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; #endif void check_move_unevictable_folios(struct folio_batch *fbatch); extern void __meminit kswapd_run(int nid); extern void __meminit kswapd_stop(int nid); #ifdef CONFIG_SWAP int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); int generic_swapfile_activate(struct swap_info_struct *, struct file *, sector_t *); static inline unsigned long total_swapcache_pages(void) { return global_node_page_state(NR_SWAPCACHE); } void free_swap_cache(struct folio *folio); void free_folio_and_swap_cache(struct folio *folio); void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) { return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages; } static inline long get_nr_swap_pages(void) { return atomic_long_read(&nr_swap_pages); } extern void si_swapinfo(struct sysinfo *); int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); static inline void put_swap_device(struct swap_info_struct *si) { percpu_ref_put(&si->users); } #else /* CONFIG_SWAP */ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return NULL; } static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) { return NULL; } static inline void put_swap_device(struct swap_info_struct *si) { } #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL #define vm_swap_full() 0 #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) #define free_folio_and_swap_cache(folio) \ folio_put(folio) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) { } static inline void free_swap_cache(struct folio *folio) { } static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) { return 0; } static inline void swap_shmem_alloc(swp_entry_t swp, int nr) { } static inline int swap_duplicate(swp_entry_t swp) { return 0; } static inline int swapcache_prepare(swp_entry_t swp, int nr) { return 0; } static inline void swap_free_nr(swp_entry_t entry, int nr_pages) { } static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } static inline int __swap_count(swp_entry_t entry) { return 0; } static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { return false; } static inline int swp_swapcount(swp_entry_t entry) { return 0; } static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) { return -EINVAL; } static inline bool folio_free_swap(struct folio *folio) { return false; } static inline int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { return -EINVAL; } #endif /* CONFIG_SWAP */ static inline void free_swap_and_cache(swp_entry_t entry) { free_swap_and_cache_nr(entry, 1); } static inline void swap_free(swp_entry_t entry) { swap_free_nr(entry, 1); } #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* Cgroup2 doesn't have per-cgroup swappiness */ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return READ_ONCE(vm_swappiness); /* root ? */ if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) return READ_ONCE(vm_swappiness); return READ_ONCE(memcg->swappiness); } #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { return READ_ONCE(vm_swappiness); } #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp); static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { if (mem_cgroup_disabled()) return; __folio_throttle_swaprate(folio, gfp); } #else static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { } #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { if (mem_cgroup_disabled()) return 0; return __mem_cgroup_try_charge_swap(folio, entry); } extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge_swap(entry, nr_pages); } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { return 0; } static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { } static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { return get_nr_swap_pages(); } static inline bool mem_cgroup_swap_full(struct folio *folio) { return vm_swap_full(); } #endif #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2020 ARM Ltd. */ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H #ifndef __ASSEMBLY__ static inline void cpu_relax(void) { asm volatile("yield" ::: "memory"); } #endif /* __ASSEMBLY__ */ #endif /* __ASM_VDSO_PROCESSOR_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_ #define _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_ /** * __ffs - find first bit in word. * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ static __always_inline unsigned int __ffs(unsigned long word) { return __builtin_ctzl(word); } #endif
1 1 152 152 8 54 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_PGALLOC_H #define __ASM_GENERIC_PGALLOC_H #ifdef CONFIG_MMU #define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO) #define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) /** * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table * @mm: the mm_struct of the current context * * This function is intended for architectures that need * anything beyond simple page allocation. * * Return: pointer to the allocated memory or %NULL on error */ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) { struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } return ptdesc_address(ptdesc); } #define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL /** * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table * @mm: the mm_struct of the current context * * Return: pointer to the allocated memory or %NULL on error */ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm) { return __pte_alloc_one_kernel_noprof(mm); } #define pte_alloc_one_kernel(...) alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__)) #endif /** * pte_free_kernel - free PTE-level kernel page table memory * @mm: the mm_struct of the current context * @pte: pointer to the memory containing the page table */ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { pagetable_dtor_free(virt_to_ptdesc(pte)); } /** * __pte_alloc_one - allocate memory for a PTE-level user page table * @mm: the mm_struct of the current context * @gfp: GFP flags to use for the allocation * * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor(). * * This function is intended for architectures that need * anything beyond simple page allocation or must have custom GFP flags. * * Return: `struct page` referencing the ptdesc or %NULL on error */ static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp) { struct ptdesc *ptdesc; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } return ptdesc_page(ptdesc); } #define __pte_alloc_one(...) alloc_hooks(__pte_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PTE_ALLOC_ONE /** * pte_alloc_one - allocate a page for PTE-level user page table * @mm: the mm_struct of the current context * * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor(). * * Return: `struct page` referencing the ptdesc or %NULL on error */ static inline pgtable_t pte_alloc_one_noprof(struct mm_struct *mm) { return __pte_alloc_one_noprof(mm, GFP_PGTABLE_USER); } #define pte_alloc_one(...) alloc_hooks(pte_alloc_one_noprof(__VA_ARGS__)) #endif /* * Should really implement gc for free page table pages. This could be * done with a reference count in struct page. */ /** * pte_free - free PTE-level user page table memory * @mm: the mm_struct of the current context * @pte_page: the `struct page` referencing the ptdesc */ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); pagetable_dtor_free(ptdesc); } #if CONFIG_PGTABLE_LEVELS > 2 #ifndef __HAVE_ARCH_PMD_ALLOC_ONE /** * pmd_alloc_one - allocate memory for a PMD-level page table * @mm: the mm_struct of the current context * * Allocate memory for a page table and ptdesc and runs pagetable_pmd_ctor(). * * Allocations use %GFP_PGTABLE_USER in user context and * %GFP_PGTABLE_KERNEL in kernel context. * * Return: pointer to the allocated memory or %NULL on error */ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { struct ptdesc *ptdesc; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } return ptdesc_address(ptdesc); } #define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) #endif #ifndef __HAVE_ARCH_PMD_FREE static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { struct ptdesc *ptdesc = virt_to_ptdesc(pmd); BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; pagetable_pud_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_PUD_ALLOC_ONE /** * pud_alloc_one - allocate memory for a PUD-level page table * @mm: the mm_struct of the current context * * Allocate memory for a page table using %GFP_PGTABLE_USER for user context * and %GFP_PGTABLE_KERNEL for kernel context. * * Return: pointer to the allocated memory or %NULL on error */ static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { return __pud_alloc_one_noprof(mm, addr); } #define pud_alloc_one(...) alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__)) #endif static inline void __pud_free(struct mm_struct *mm, pud_t *pud) { struct ptdesc *ptdesc = virt_to_ptdesc(pud); BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PUD_FREE static inline void pud_free(struct mm_struct *mm, pud_t *pud) { __pud_free(mm, pud); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #if CONFIG_PGTABLE_LEVELS > 4 static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; pagetable_p4d_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) #ifndef __HAVE_ARCH_P4D_ALLOC_ONE static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) { return __p4d_alloc_one_noprof(mm, addr); } #define p4d_alloc_one(...) alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) #endif static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) { struct ptdesc *ptdesc = virt_to_ptdesc(p4d); BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_P4D_FREE static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { if (!mm_p4d_folded(mm)) __p4d_free(mm, p4d); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 4 */ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, order); if (!ptdesc) return NULL; pagetable_pgd_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { __pgd_free(mm, pgd); } #endif #endif /* CONFIG_MMU */ #endif /* __ASM_GENERIC_PGALLOC_H */
12 12 170 173 453 452 451 1 1 170 321 172 173 173 38 159 172 318 318 318 318 318 174 295 172 319 26 319 453 450 2 452 453 172 296 453 177 314 172 320 453 452 453 452 453 174 295 649 647 649 648 39 647 531 452 4 4 26 26 26 26 26 26 26 26 26 26 26 25 26 26 26 26 605 4 598 4 605 606 4 353 4 572 388 212 605 605 605 239 419 419 419 419 392 347 415 4 420 291 388 536 600 603 601 603 603 601 602 603 598 603 2 491 491 286 1 203 348 349 11 345 353 353 353 284 15 353 348 347 348 347 348 348 347 348 347 349 349 348 349 5 349 349 1 349 349 349 348 349 349 349 349 1 348 347 347 347 348 348 348 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 // SPDX-License-Identifier: GPL-2.0-only /* * fs/dcache.c * * Complete reimplementation * (C) 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ /* * Notes on the allocation strategy: * * The dcache is a master of the icache - whenever a dcache entry * exists, the inode will always exist. "iput()" is done either when * the dcache entry is deleted or garbage collected. */ #include <linux/ratelimit.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/cache.h> #include <linux/export.h> #include <linux/security.h> #include <linux/seqlock.h> #include <linux/memblock.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/list_lru.h> #include "internal.h" #include "mount.h" #include <asm/runtime-const.h> /* * Usage: * dcache->d_inode->i_lock protects: * - i_dentry, d_u.d_alias, d_inode of aliases * dcache_hash_bucket lock protects: * - the dcache hash table * s_roots bl list spinlock protects: * - the s_roots list (see __d_drop) * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: * - d_flags * - d_name * - d_lru * - d_count * - d_unhashed() * - d_parent and d_chilren * - childrens' d_sib and d_parent * - d_u.d_alias, d_inode * * Ordering: * dentry->d_inode->i_lock * dentry->d_lock * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock * s_roots lock * * If there is an ancestor relationship: * dentry->d_parent->...->d_parent->d_lock * ... * dentry->d_parent->d_lock * dentry->d_lock * * If no ancestor relationship: * arbitrary, since it's serialized on rename_lock */ static int sysctl_vfs_cache_pressure __read_mostly = 100; static int sysctl_vfs_cache_pressure_denom __read_mostly = 100; unsigned long vfs_pressure_ratio(unsigned long val) { return mult_frac(val, sysctl_vfs_cache_pressure, sysctl_vfs_cache_pressure_denom); } EXPORT_SYMBOL_GPL(vfs_pressure_ratio); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __ro_after_init; const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); const struct qstr slash_name = QSTR_INIT("/", 1); EXPORT_SYMBOL(slash_name); const struct qstr dotdot_name = QSTR_INIT("..", 2); EXPORT_SYMBOL(dotdot_name); /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try * to make this good - I've just made it work. * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. * * Marking the variables "used" ensures that the compiler doesn't * optimize them away completely on architectures with runtime * constant infrastructure, this allows debuggers to see their * values. But updating these values has no effect on those arches. */ static unsigned int d_hash_shift __ro_after_init __used; static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { return runtime_const_ptr(dentry_hashtable) + runtime_const_shift_right_32(hashlen, d_hash_shift); } #define IN_LOOKUP_SHIFT 10 static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } struct dentry_stat_t { long nr_dentry; long nr_unused; long age_limit; /* age in seconds */ long want_pages; /* pages requested by system */ long nr_negative; /* # of unused negative dentries */ long dummy; /* Reserved for future use */ }; static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); static int dentry_negative_policy; #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* Statistics gathering. */ static struct dentry_stat_t dentry_stat = { .age_limit = 45, }; /* * Here we resort to our own counters instead of using generic per-cpu counters * for consistency with what the vfs inode code does. We are expected to harvest * better code and performance by having our own specialized counters. * * Please note that the loop is done over all possible CPUs, not over all online * CPUs. The reason for this is that we don't want to play games with CPUs going * on and off. If one of them goes off, we will just keep their counters. * * glommer: See cffbc8a for details, and if you ever intend to change this, * please update all vfs counters to match. */ static long get_nr_dentry(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_unused, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_negative(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_negative, i); return sum < 0 ? 0 : sum; } static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table fs_dcache_sysctls[] = { { .procname = "dentry-state", .data = &dentry_stat, .maxlen = 6*sizeof(long), .mode = 0444, .proc_handler = proc_nr_dentry, }, { .procname = "dentry-negative", .data = &dentry_negative_policy, .maxlen = sizeof(dentry_negative_policy), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static const struct ctl_table vm_dcache_sysctls[] = { { .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, .maxlen = sizeof(sysctl_vfs_cache_pressure), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "vfs_cache_pressure_denom", .data = &sysctl_vfs_cache_pressure_denom, .maxlen = sizeof(sysctl_vfs_cache_pressure_denom), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE_HUNDRED, }, }; static int __init init_fs_dcache_sysctls(void) { register_sysctl_init("vm", vm_dcache_sysctls); register_sysctl_init("fs", fs_dcache_sysctls); return 0; } fs_initcall(init_fs_dcache_sysctls); #endif /* * Compare 2 name strings, return 0 if they match, otherwise non-zero. * The strings are both count bytes long, and count is non-zero. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> /* * NOTE! 'cs' and 'scount' come from a dentry, so it has a * aligned allocation for this particular component. We don't * strictly need the load_unaligned_zeropad() safety, but it * doesn't hurt either. * * In contrast, 'ct' and 'tcount' can be from a pathname, and do * need the careful unaligned handling. */ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { unsigned long a,b,mask; for (;;) { a = read_word_at_a_time(cs); b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; if (unlikely(a != b)) return 1; cs += sizeof(unsigned long); ct += sizeof(unsigned long); tcount -= sizeof(unsigned long); if (!tcount) return 0; } mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } #else static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { do { if (*cs != *ct) return 1; cs++; ct++; tcount--; } while (tcount); return 0; } #endif static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { /* * Be careful about RCU walk racing with rename: * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The * RCU walk will check the sequence count eventually, * and catch it. And we won't overrun the buffer, * because we're reading the name pointer atomically, * and a dentry name is guaranteed to be properly * terminated with a NUL byte. * * End result: even if 'len' is wrong, we'll exit * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } /* * long names are allocated separately from dentry and never modified. * Refcounted, freeing is RCU-delayed. See take_dentry_name_snapshot() * for the reason why ->count and ->head can't be combined into a union. * dentry_string_cmp() relies upon ->name[] being word-aligned. */ struct external_name { atomic_t count; struct rcu_head head; unsigned char name[] __aligned(sizeof(unsigned long)); }; static inline struct external_name *external_name(struct dentry *dentry) { return container_of(dentry->d_name.name, struct external_name, name[0]); } static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) { return dentry->d_name.name != dentry->d_shortname.string; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { unsigned seq; const unsigned char *s; rcu_read_lock(); retry: seq = read_seqcount_begin(&dentry->d_seq); s = READ_ONCE(dentry->d_name.name); name->name.hash_len = dentry->d_name.hash_len; name->name.name = name->inline_name.string; if (likely(s == dentry->d_shortname.string)) { name->inline_name = dentry->d_shortname; } else { struct external_name *p; p = container_of(s, struct external_name, name[0]); // get a valid reference if (unlikely(!atomic_inc_not_zero(&p->count))) goto retry; name->name.name = s; } if (read_seqcount_retry(&dentry->d_seq, seq)) { release_dentry_name_snapshot(name); goto retry; } rcu_read_unlock(); } EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { if (unlikely(name->name.name != name->inline_name.string)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->count))) kfree_rcu(p, head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) { unsigned flags; dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; /* * The negative counter only tracks dentries on the LRU. Don't inc if * d_lru is on another list. */ if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_inc(nr_dentry_negative); } static void dentry_free(struct dentry *dentry) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } } /* if dentry was never visible to RCU, immediate free is OK */ if (dentry->d_flags & DCACHE_NORCU) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); } /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else iput(inode); } /* * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry * is in use - which includes both the "real" per-superblock * LRU list _and_ the DCACHE_SHRINK_LIST use. * * The DCACHE_SHRINK_LIST bit is set whenever the dentry is * on the shrink list (ie not on the superblock LRU list). * * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * * The per-cpu "nr_dentry_negative" counters are only updated * when deleted from or added to the per-superblock LRU list, not * from/to the shrink list. That is to avoid an unneeded dec/inc * pair when moving from LRU to shrink list in select_collect(). * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) static void d_lru_add(struct dentry *dentry) { D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru); dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); this_cpu_dec(nr_dentry_unused); } static void d_shrink_add(struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, 0); list_add(&dentry->d_lru, list); dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); } /* * These can only be called under the global LRU lock, ie during the * callback for freeing the LRU list. "isolate" removes it from the * LRU lists entirely, while shrink_move moves it to the indicated * private list. */ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate(lru, &dentry->d_lru); } static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list); } static void ___d_drop(struct dentry *dentry) { struct hlist_bl_head *b; /* * Hashed dentries are normally on the dentry hashtable, * with the exception of those newly allocated by * d_obtain_root, which are always IS_ROOT: */ if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_roots; else b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); hlist_bl_unlock(b); } void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { ___d_drop(dentry); dentry->d_hash.pprev = NULL; write_seqcount_invalidate(&dentry->d_seq); } } EXPORT_SYMBOL(__d_drop); /** * d_drop - drop a dentry * @dentry: dentry to drop * * d_drop() unhashes the entry from the parent dentry hashes, so that it won't * be found through a VFS lookup any more. Note that this is different from * deleting the dentry - d_delete will try to mark the dentry negative if * possible, giving a successful _negative_ lookup, while d_drop will * just make the cache lookup fail. * * d_drop() is used mainly for stuff that wants to invalidate a dentry for some * reason (NFS timeouts or autofs deletes). * * __d_drop requires dentry->d_lock * * ___d_drop doesn't mark dentry as "unhashed" * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). */ void d_drop(struct dentry *dentry) { spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_drop); static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; /* * Inform d_walk() and shrink_dentry_list() that we are no longer * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; if (unlikely(hlist_unhashed(&dentry->d_sib))) return; __hlist_del(&dentry->d_sib); /* * Cursors can move around the list of children. While we'd been * a normal list member, it didn't matter - ->d_sib.next would've * been updated. However, from now on it won't be and for the * things like d_walk() it might end up with a nasty surprise. * Normally d_walk() doesn't care about cursors moving around - * ->d_lock on parent prevents that and since a cursor has no children * of its own, we get through it without ever unlocking the parent. * There is one exception, though - if we ascend from a child that * gets killed as soon as we unlock it, the next sibling is found * using the value left in its ->d_sib.next. And if _that_ * pointed to a cursor, and cursor got moved (e.g. by lseek()) * before d_walk() regains parent->d_lock, we'll end up skipping * everything the cursor had been moved past. * * Solution: make sure that the pointer left behind in ->d_sib.next * points to something that won't be moving around. I.e. skip the * cursors. */ while (dentry->d_sib.next) { next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib); if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) break; dentry->d_sib.next = next->d_sib.next; } } static struct dentry *__dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; /* * The dentry is now unrecoverably dead to the world. */ lockref_mark_dead(&dentry->d_lockref); /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ if (dentry->d_flags & DCACHE_OP_PRUNE) dentry->d_op->d_prune(dentry); if (dentry->d_flags & DCACHE_LRU_LIST) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) d_lru_del(dentry); } /* if it was on the hash then remove it */ __d_drop(dentry); if (dentry->d_inode) dentry_unlink_inode(dentry); else spin_unlock(&dentry->d_lock); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); cond_resched(); /* now that it's negative, ->d_parent is stable */ if (!IS_ROOT(dentry)) { parent = dentry->d_parent; spin_lock(&parent->d_lock); } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry_unlist(dentry); if (dentry->d_flags & DCACHE_SHRINK_LIST) can_free = false; spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); if (parent && --parent->d_lockref.count) { spin_unlock(&parent->d_lock); return NULL; } return parent; } /* * Lock a dentry for feeding it to __dentry_kill(). * Called under rcu_read_lock() and dentry->d_lock; the former * guarantees that nothing we access will be freed under us. * Note that dentry is *not* protected from concurrent dentry_kill(), * d_delete(), etc. * * Return false if dentry is busy. Otherwise, return true and have * that dentry's inode locked. */ static bool lock_for_kill(struct dentry *dentry) { struct inode *inode = dentry->d_inode; if (unlikely(dentry->d_lockref.count)) return false; if (!inode || likely(spin_trylock(&inode->i_lock))) return true; do { spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); if (likely(inode == dentry->d_inode)) break; spin_unlock(&inode->i_lock); inode = dentry->d_inode; } while (inode); if (likely(!dentry->d_lockref.count)) return true; if (inode) spin_unlock(&inode->i_lock); return false; } /* * Decide if dentry is worth retaining. Usually this is called with dentry * locked; if not locked, we are more limited and might not be able to tell * without a lock. False in this case means "punt to locked path and recheck". * * In case we aren't locked, these predicates are not "stable". However, it is * sufficient that at some point after we dropped the reference the dentry was * hashed and the flags had the proper value. Other dentry users may have * re-gotten a reference to the dentry and change that, but our work is done - * we can leave the dentry around with a zero refcount. */ static inline bool retain_dentry(struct dentry *dentry, bool locked) { unsigned int d_flags; smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); // Unreachable? Nobody would be able to look it up, no point retaining if (unlikely(d_unhashed(dentry))) return false; // Same if it's disconnected if (unlikely(d_flags & DCACHE_DISCONNECTED)) return false; // ->d_delete() might tell us not to bother, but that requires // ->d_lock; can't decide without it if (unlikely(d_flags & DCACHE_OP_DELETE)) { if (!locked || dentry->d_op->d_delete(dentry)) return false; } // Explicitly told not to bother if (unlikely(d_flags & DCACHE_DONTCACHE)) return false; // At this point it looks like we ought to keep it. We also might // need to do something - put it on LRU if it wasn't there already // and mark it referenced if it was on LRU, but not marked yet. // Unfortunately, both actions require ->d_lock, so in lockless // case we'd have to punt rather than doing those. if (unlikely(!(d_flags & DCACHE_LRU_LIST))) { if (!locked) return false; d_lru_add(dentry); } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) { if (!locked) return false; dentry->d_flags |= DCACHE_REFERENCED; } return true; } void d_mark_dontcache(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { spin_lock(&de->d_lock); de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } inode->i_state |= I_DONTCACHE; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); /* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. * In that case refcount is guaranteed to be zero and we have already * decided that it's not worth keeping around. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! */ static inline bool fast_dput(struct dentry *dentry) { int ret; /* * try to decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); /* * If the lockref_put_return() failed due to the lock being held * by somebody else, the fast path has failed. We will need to * get the lock, and then check the count again. */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { spin_unlock(&dentry->d_lock); return true; } dentry->d_lockref.count--; goto locked; } /* * If we weren't the last ref, we're done. */ if (ret) return true; /* * Can we decide that decrement of refcount is all we needed without * taking the lock? There's a very common case when it's all we need - * dentry looks like it ought to be retained and there's nothing else * to do. */ if (retain_dentry(dentry, false)) return true; /* * Either not worth retaining or we can't tell without the lock. * Get the lock, then. We've already decremented the refcount to 0, * but we'll need to re-check the situation after getting the lock. */ spin_lock(&dentry->d_lock); /* * Did somebody else grab a reference to it in the meantime, and * we're no longer the last user after all? Alternatively, somebody * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ locked: if (dentry->d_lockref.count || retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return true; } return false; } /* * This is dput * * This is complicated by the fact that we do not want to put * dentries that are no longer on any hash chain on the unused * list: we'd much rather just get rid of them immediately. * * However, that implies that we have to traverse the dentry * tree upwards to the parents which might _also_ now be * scheduled for deletion (it may have been only waiting for * its last child to go away). * * This tail recursion is done by hand as we don't want to depend * on the compiler to always get this right (gcc generally doesn't). * Real recursion would eat up our stack space. */ /* * dput - release a dentry * @dentry: dentry to release * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and * releasing its resources. If the parent dentries were scheduled for release * they too may now get deleted. */ void dput(struct dentry *dentry) { if (!dentry) return; might_sleep(); rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } while (lock_for_kill(dentry)) { rcu_read_unlock(); dentry = __dentry_kill(dentry); if (!dentry) return; if (retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return; } rcu_read_lock(); } rcu_read_unlock(); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(dput); static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); d_shrink_add(dentry, list); } } void dput_to_list(struct dentry *dentry, struct list_head *list) { rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } rcu_read_unlock(); to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; unsigned seq; /* * Do optimistic parent lookup without any * locking. */ rcu_read_lock(); seq = raw_seqcount_begin(&dentry->d_seq); ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; dput(ret); } repeat: /* * Don't need rcu_dereference because we re-check it was correct under * the lock. */ rcu_read_lock(); ret = dentry->d_parent; spin_lock(&ret->d_lock); if (unlikely(ret != dentry->d_parent)) { spin_unlock(&ret->d_lock); rcu_read_unlock(); goto repeat; } rcu_read_unlock(); BUG_ON(!ret->d_lockref.count); ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } EXPORT_SYMBOL(dget_parent); static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; if (hlist_empty(&inode->i_dentry)) return NULL; alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); lockref_get(&alias->d_lockref); return alias; } /** * d_find_any_alias - find any alias for a given inode * @inode: inode to find an alias for * * If any aliases exist for the given inode, take and return a * reference for one of them. If no aliases exist, return %NULL. */ struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); de = __d_find_any_alias(inode); spin_unlock(&inode->i_lock); return de; } EXPORT_SYMBOL(d_find_any_alias); static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias; if (S_ISDIR(inode->i_mode)) return __d_find_any_alias(inode); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } spin_unlock(&alias->d_lock); } return NULL; } /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. * Notice that if inode is a directory there can be only one alias and * it can be unhashed only if it has no children, or if it is the root * of a filesystem, or if the directory was renamed and d_revalidate * was the first vfs operation to notice. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer * any other hashed alias over that one. */ struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; } EXPORT_SYMBOL(d_find_alias); /* * Caller MUST be holding rcu_read_lock() and be guaranteed * that inode won't get freed until rcu_read_unlock(). */ struct dentry *d_find_alias_rcu(struct inode *inode) { struct hlist_head *l = &inode->i_dentry; struct dentry *de = NULL; spin_lock(&inode->i_lock); // ->i_dentry and ->i_rcu are colocated, but the latter won't be // used without having I_FREEING set, which means no aliases left if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { de = hlist_entry(l->first, struct dentry, d_u.d_alias); } else { hlist_for_each_entry(de, l, d_u.d_alias) if (!d_unhashed(de)) break; } } spin_unlock(&inode->i_lock); return de; } /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. */ void d_prune_aliases(struct inode *inode) { LIST_HEAD(dispose); struct dentry *dentry; spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) to_shrink_list(dentry, &dispose); spin_unlock(&dentry->d_lock); } spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } EXPORT_SYMBOL(d_prune_aliases); static inline void shrink_kill(struct dentry *victim) { do { rcu_read_unlock(); victim = __dentry_kill(victim); rcu_read_lock(); } while (victim && lock_for_kill(victim)); rcu_read_unlock(); if (victim) spin_unlock(&victim->d_lock); } void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { struct dentry *dentry; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); if (!lock_for_kill(dentry)) { bool can_free; rcu_read_unlock(); d_shrink_del(dentry); can_free = dentry->d_flags & DCACHE_DENTRY_KILLED; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); continue; } d_shrink_del(dentry); shrink_kill(dentry); } } static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; /* * Referenced dentries are still in use. If they have active * counts, just remove them from the LRU. Otherwise give them * another pass through the LRU. */ if (dentry->d_lockref.count) { d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; spin_unlock(&dentry->d_lock); /* * The list move itself will be made by the common LRU code. At * this point, we've dropped the dentry->d_lock but keep the * lru lock. This is safe to do, since every list movement is * protected by the lru lock even if both locks are held. * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like * list_lru_add_obj and list_lru_del_obj. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. * * The only exceptions to this are functions like * shrink_dentry_list, and code that first checks for the * DCACHE_SHRINK_LIST flag. Those are guaranteed to be * operating only with stack provided lists after they are * properly isolated from the main list. It is thus, always a * local access. */ return LRU_ROTATE; } d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * prune_dcache_sb - shrink the dcache * @sb: superblock * @sc: shrink control, passed to list_lru_shrink_walk() * * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * * Shrink the dcache for the specified super block. This is used to free * the dcache before unmounting a file system. */ void shrink_dcache_sb(struct super_block *sb) { do { LIST_HEAD(dispose); list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); /** * enum d_walk_ret - action to talke during tree walk * @D_WALK_CONTINUE: contrinue walk * @D_WALK_QUIT: quit walk * @D_WALK_NORETRY: quit when retry is needed * @D_WALK_SKIP: skip this dentry and its children */ enum d_walk_ret { D_WALK_CONTINUE, D_WALK_QUIT, D_WALK_NORETRY, D_WALK_SKIP, }; /** * d_walk - walk the dentry tree * @parent: start of walk * @data: data passed to @enter() and @finish() * @enter: callback when first entering the dentry * * The @enter() callbacks are called with d_lock held. */ static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *)) { struct dentry *this_parent, *dentry; unsigned seq = 0; enum d_walk_ret ret; bool retry = true; again: read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; spin_lock(&this_parent->d_lock); ret = enter(data, this_parent); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: case D_WALK_SKIP: goto out_unlock; case D_WALK_NORETRY: retry = false; break; } repeat: dentry = d_first_child(this_parent); resume: hlist_for_each_entry_from(dentry, d_sib) { if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ret = enter(data, dentry); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: spin_unlock(&dentry->d_lock); goto out_unlock; case D_WALK_NORETRY: retry = false; break; case D_WALK_SKIP: spin_unlock(&dentry->d_lock); continue; } if (!hlist_empty(&dentry->d_children)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. */ rcu_read_lock(); ascend: if (this_parent != parent) { dentry = this_parent; this_parent = dentry->d_parent; spin_unlock(&dentry->d_lock); spin_lock(&this_parent->d_lock); /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) goto rename_retry; /* go into the first sibling still alive */ hlist_for_each_entry_continue(dentry, d_sib) { if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { rcu_read_unlock(); goto resume; } } goto ascend; } if (need_seqretry(&rename_lock, seq)) goto rename_retry; rcu_read_unlock(); out_unlock: spin_unlock(&this_parent->d_lock); done_seqretry(&rename_lock, seq); return; rename_retry: spin_unlock(&this_parent->d_lock); rcu_read_unlock(); BUG_ON(seq & 1); if (!retry) return; seq = 1; goto again; } struct check_mount { struct vfsmount *mnt; unsigned int mounted; }; static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { struct check_mount *info = data; struct path path = { .mnt = info->mnt, .dentry = dentry }; if (likely(!d_mountpoint(dentry))) return D_WALK_CONTINUE; if (__path_is_mountpoint(&path)) { info->mounted = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * path_has_submounts - check for mounts over a dentry in the * current namespace. * @parent: path to check. * * Return true if the parent or its subdirectories contain * a mount point in the current namespace. */ int path_has_submounts(const struct path *parent) { struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; read_seqlock_excl(&mount_lock); d_walk(parent->dentry, &data, path_check_mount); read_sequnlock_excl(&mount_lock); return data.mounted; } EXPORT_SYMBOL(path_has_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is * reachable (e.g. NFS can unhash a directory dentry and then the complete * subtree can become unreachable). * * Only one of d_invalidate() and d_set_mounted() must succeed. For * this reason take rename_lock and d_lock on dentry and ancestors. */ int d_set_mounted(struct dentry *dentry) { struct dentry *p; int ret = -ENOENT; write_seqlock(&rename_lock); for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { /* Need exclusion wrt. d_invalidate() */ spin_lock(&p->d_lock); if (unlikely(d_unhashed(p))) { spin_unlock(&p->d_lock); goto out; } spin_unlock(&p->d_lock); } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { ret = -EBUSY; if (!d_mountpoint(dentry)) { dentry->d_flags |= DCACHE_MOUNTED; ret = 0; } } spin_unlock(&dentry->d_lock); out: write_sequnlock(&rename_lock); return ret; } /* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level * whenever the d_children list is non-empty and continue * searching. * * It returns zero iff there are no unused children, * otherwise it returns the number of children moved to * the end of the unused list. This may not be the total * number of unused children, because select_parent can * drop the lock and return early due to latency * constraints. */ struct select_data { struct dentry *start; union { long found; struct dentry *victim; }; struct list_head dispose; }; static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; } else if (!dentry->d_lockref.count) { to_shrink_list(dentry, &data->dispose); data->found++; } else if (dentry->d_lockref.count < 0) { data->found++; } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (!dentry->d_lockref.count) { if (dentry->d_flags & DCACHE_SHRINK_LIST) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; } to_shrink_list(dentry, &data->dispose); } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } /** * shrink_dcache_parent - prune dcache * @parent: parent of entries to prune * * Prune the dcache to remove unused children of the parent dentry. */ void shrink_dcache_parent(struct dentry *parent) { for (;;) { struct select_data data = {.start = parent}; INIT_LIST_HEAD(&data.dispose); d_walk(parent, &data, select_collect); if (!list_empty(&data.dispose)) { shrink_dentry_list(&data.dispose); continue; } cond_resched(); if (!data.found) break; data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { spin_lock(&data.victim->d_lock); if (!lock_for_kill(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { shrink_kill(data.victim); } } if (!list_empty(&data.dispose)) shrink_dentry_list(&data.dispose); } } EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { /* it has busy descendents; complain about those instead */ if (!hlist_empty(&dentry->d_children)) return D_WALK_CONTINUE; /* root with refcount 1 is fine */ if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); return D_WALK_CONTINUE; } static void do_one_tree(struct dentry *dentry) { shrink_dcache_parent(dentry); d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); } /* * destroy the dentries attached to a superblock on unmounting */ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; rwsem_assert_held_write(&sb->s_umount); dentry = sb->s_root; sb->s_root = NULL; do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_roots)) { dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash)); do_one_tree(dentry); } } static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { struct dentry **victim = _data; if (d_mountpoint(dentry)) { *victim = dget_dlock(dentry); return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * d_invalidate - detach submounts, prune dcache, and drop * @dentry: dentry to invalidate (aka detach, prune and drop) */ void d_invalidate(struct dentry *dentry) { bool had_submounts = false; spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); return; } __d_drop(dentry); spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ if (!dentry->d_inode) return; shrink_dcache_parent(dentry); for (;;) { struct dentry *victim = NULL; d_walk(dentry, &victim, find_submount); if (!victim) { if (had_submounts) shrink_dcache_parent(dentry); return; } had_submounts = true; detach_mounts(victim); dput(victim); } } EXPORT_SYMBOL(d_invalidate); /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; int err; dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru, GFP_KERNEL); if (!dentry) return NULL; /* * We guarantee that the inline name is always NUL-terminated. * This way the memcpy() done by the name switching in rename * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ dentry->d_shortname.string[DNAME_INLINE_LEN-1] = 0; if (unlikely(!name)) { name = &slash_name; dname = dentry->d_shortname.string; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } atomic_set(&p->count, 1); dname = p->name; } else { dname = dentry->d_shortname.string; } dentry->d_name.len = name->len; dentry->d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ dentry->d_flags = 0; lockref_init(&dentry->d_lockref); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; dentry->d_sb = sb; dentry->d_op = NULL; dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_HLIST_HEAD(&dentry->d_children); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_HLIST_NODE(&dentry->d_sib); d_set_d_op(dentry, dentry->d_sb->s_d_op); if (dentry->d_op && dentry->d_op->d_init) { err = dentry->d_op->d_init(dentry); if (err) { if (dname_external(dentry)) kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); return NULL; } } this_cpu_inc(nr_dentry); return dentry; } /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject * to concurrency here */ dentry->d_parent = dget_dlock(parent); hlist_add_head(&dentry->d_sib, &parent->d_children); spin_unlock(&parent->d_lock); return dentry; } EXPORT_SYMBOL(d_alloc); struct dentry *d_alloc_anon(struct super_block *sb) { return __d_alloc(sb, NULL); } EXPORT_SYMBOL(d_alloc_anon); struct dentry *d_alloc_cursor(struct dentry * parent) { struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } return dentry; } /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock * @name: qstr of the name * * For a filesystem that just pins its dentries in memory and never * performs lookups at all, return an unhashed IS_ROOT dentry. * This is used for pipes, sockets et.al. - the stuff that should * never be anyone's children or parents. Unlike all other * dentries, these will not have RCU delay between dropping the * last reference and freeing them. * * The only user is alloc_file_pseudo() and that's what should * be considered a public interface. Don't use directly. */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; struct dentry *dentry = __d_alloc(sb, name); if (likely(dentry)) { dentry->d_flags |= DCACHE_NORCU; if (!sb->s_d_op) d_set_d_op(dentry, &anon_ops); } return dentry; } struct dentry *d_alloc_name(struct dentry *parent, const char *name) { struct qstr q; q.name = name; q.hash_len = hashlen_string(parent, name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) { WARN_ON_ONCE(dentry->d_op); WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_REAL)); dentry->d_op = op; if (!op) return; if (op->d_hash) dentry->d_flags |= DCACHE_OP_HASH; if (op->d_compare) dentry->d_flags |= DCACHE_OP_COMPARE; if (op->d_revalidate) dentry->d_flags |= DCACHE_OP_REVALIDATE; if (op->d_weak_revalidate) dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; if (op->d_real) dentry->d_flags |= DCACHE_OP_REAL; } EXPORT_SYMBOL(d_set_d_op); static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_REGULAR_TYPE; if (!inode) return DCACHE_MISS_TYPE; if (S_ISDIR(inode->i_mode)) { add_flags = DCACHE_DIRECTORY_TYPE; if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { if (unlikely(!inode->i_op->lookup)) add_flags = DCACHE_AUTODIR_TYPE; else inode->i_opflags |= IOP_LOOKUP; } goto type_determined; } if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { if (unlikely(inode->i_op->get_link)) { add_flags = DCACHE_SYMLINK_TYPE; goto type_determined; } inode->i_opflags |= IOP_NOFOLLOW; } if (unlikely(!S_ISREG(inode->i_mode))) add_flags = DCACHE_SPECIAL_TYPE; type_determined: if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT; return add_flags; } static void __d_instantiate(struct dentry *dentry, struct inode *inode) { unsigned add_flags = d_flags_for_inode(inode); WARN_ON(d_in_lookup(dentry)); spin_lock(&dentry->d_lock); /* * The negative counter only tracks dentries on the LRU. Don't dec if * d_lru is on another list. */ if ((dentry->d_flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); spin_unlock(&dentry->d_lock); } /** * d_instantiate - fill in inode information for a dentry * @entry: dentry to complete * @inode: inode to attach to this dentry * * Fill in inode information in the entry. * * This turns negative dentries into productive full members * of society. * * NOTE! This assumes that the inode count has been incremented * (or otherwise set) by the caller to indicate that it is now * in use by the dcache. */ void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_instantiate); /* * This should be equivalent to d_instantiate() + unlock_new_inode(), * with lockdep-related part of unlock_new_inode() done before * anything else. Use that instead of open-coding d_instantiate()/ * unlock_new_inode() combinations. */ void d_instantiate_new(struct dentry *entry, struct inode *inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); BUG_ON(!inode); lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_instantiate_new); struct dentry *d_make_root(struct inode *root_inode) { struct dentry *res = NULL; if (root_inode) { res = d_alloc_anon(root_inode->i_sb); if (res) d_instantiate(res, root_inode); else iput(root_inode); } return res; } EXPORT_SYMBOL(d_make_root); static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) { struct super_block *sb; struct dentry *new, *res; if (!inode) return ERR_PTR(-ESTALE); if (IS_ERR(inode)) return ERR_CAST(inode); sb = inode->i_sb; res = d_find_any_alias(inode); /* existing alias? */ if (res) goto out; new = d_alloc_anon(sb); if (!new) { res = ERR_PTR(-ENOMEM); goto out; } security_d_instantiate(new, inode); spin_lock(&inode->i_lock); res = __d_find_any_alias(inode); /* recheck under lock */ if (likely(!res)) { /* still no alias, attach a disconnected dentry */ unsigned add_flags = d_flags_for_inode(inode); if (disconnected) add_flags |= DCACHE_DISCONNECTED; spin_lock(&new->d_lock); __d_set_inode_and_type(new, inode, add_flags); hlist_add_head(&new->d_u.d_alias, &inode->i_dentry); if (!disconnected) { hlist_bl_lock(&sb->s_roots); hlist_bl_add_head(&new->d_hash, &sb->s_roots); hlist_bl_unlock(&sb->s_roots); } spin_unlock(&new->d_lock); spin_unlock(&inode->i_lock); inode = NULL; /* consumed by new->d_inode */ res = new; } else { spin_unlock(&inode->i_lock); dput(new); } out: iput(inode); return res; } /** * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain a dentry for an inode resulting from NFS filehandle conversion or * similar open by handle operations. The returned dentry may be anonymous, * or may have a full name (if the inode was already in the cache). * * When called on a directory inode, we must ensure that the inode only ever * has one dentry. If a dentry is found, that is returned instead of * allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is released. * To make it easier to use in export operations a %NULL or IS_ERR inode may * be passed in and the error will be propagated to the return value, * with a %NULL @inode replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_alias(struct inode *inode) { return __d_obtain_alias(inode, true); } EXPORT_SYMBOL(d_obtain_alias); /** * d_obtain_root - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain an IS_ROOT dentry for the root of a filesystem. * * We must ensure that directory inodes only ever have one dentry. If a * dentry is found, that is returned instead of allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is * released. A %NULL or IS_ERR inode may be passed in and will be the * error will be propagate to the return value, with a %NULL @inode * replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_root(struct inode *inode) { return __d_obtain_alias(inode, false); } EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name * @dentry: the negative dentry that was passed to the parent's lookup func * @inode: the inode case-insensitive lookup has found * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * * For a case-insensitive lookup match and if the case-exact dentry * already exists in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. */ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { struct dentry *found, *res; /* * First check if a dentry matching the name already exists, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); if (found) { iput(inode); return found; } if (d_in_lookup(dentry)) { found = d_alloc_parallel(dentry->d_parent, name, dentry->d_wait); if (IS_ERR(found) || !d_in_lookup(found)) { iput(inode); return found; } } else { found = d_alloc(dentry->d_parent, name); if (!found) { iput(inode); return ERR_PTR(-ENOMEM); } } res = d_splice_alias(inode, found); if (res) { d_lookup_done(found); dput(found); return res; } return found; } EXPORT_SYMBOL(d_add_ci); /** * d_same_name - compare dentry name with case-exact name * @dentry: the negative dentry that was passed to the parent's lookup func * @parent: parent dentry * @name: the case-exact name to be associated with the returned dentry * * Return: true if names are same, or false */ bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name) { if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { if (dentry->d_name.len != name->len) return false; return dentry_cmp(dentry, name->name, name->len) == 0; } return parent->d_op->d_compare(dentry, dentry->d_name.len, dentry->d_name.name, name) == 0; } EXPORT_SYMBOL_GPL(d_same_name); /* * This is __d_lookup_rcu() when the parent dentry has * DCACHE_OP_COMPARE, which makes things much nastier. */ static noinline struct dentry *__d_lookup_rcu_op_compare( const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { int tlen; const char *tname; unsigned seq; seqretry: seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; tlen = dentry->d_name.len; tname = dentry->d_name.name; /* we want a consistent (name,len) pair */ if (read_seqcount_retry(&dentry->d_seq, seq)) { cpu_relax(); goto seqretry; } if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0) continue; *seqp = seq; return dentry; } return NULL; } /** * __d_lookup_rcu - search for a dentry (racy, store-free) * @parent: parent dentry * @name: qstr of name we wish to find * @seqp: returns d_seq value at the point where the dentry was found * Returns: dentry, or NULL * * __d_lookup_rcu is the dcache lookup function for rcu-walk name * resolution (store-free path walking) design described in * Documentation/filesystems/path-lookup.txt. * * This is not to be used outside core vfs. * * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock * held, and rcu_read_lock held. The returned dentry must not be stored into * without taking d_lock and checking d_seq sequence count against @seq * returned here. * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. * * NOTE! The caller *has* to check the resulting dentry against the sequence * number we've returned before using any of the resulting dentry state! */ struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; const unsigned char *str = name->name; struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) return __d_lookup_rcu_op_compare(parent, name, seqp); /* * The hash list is protected using RCU. * * Carefully use d_seq when comparing a candidate dentry, to avoid * races with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { unsigned seq; /* * The dentry sequence count protects us from concurrent * renames, and thus protects parent and name fields. * * The caller must perform a seqcount check in order * to do anything useful with the returned dentry. * * NOTE! We do a "raw" seqcount_begin here. That means that * we don't wait for the sequence count to stabilize if it * is in the middle of a sequence change. If we do the slow * dentry compare, we will do seqretries until it is stable, * and if we end up with a successful lookup, we actually * want to exit RCU lookup anyway. * * Note that raw_seqcount_begin still *does* smp_rmb(), so * we are still guaranteed NUL-termination of ->d_name.name. */ seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; if (dentry->d_name.hash_len != hashlen) continue; if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) continue; *seqp = seq; return dentry; } return NULL; } /** * d_lookup - search for a dentry * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * d_lookup searches the children of the parent dentry for the name in * question. If the dentry is found its reference count is incremented and the * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) { struct dentry *dentry; unsigned seq; do { seq = read_seqbegin(&rename_lock); dentry = __d_lookup(parent, name); if (dentry) break; } while (read_seqretry(&rename_lock, seq)); return dentry; } EXPORT_SYMBOL(d_lookup); /** * __d_lookup - search for a dentry (racy) * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * __d_lookup is like d_lookup, however it may (rarely) return a * false-negative result due to unrelated rename activity. * * __d_lookup is slightly faster by avoiding rename_lock read seqlock, * however it must be used carefully, eg. with a following d_lookup in * the case of failure. * * __d_lookup callers must be commented. */ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { unsigned int hash = name->hash; struct hlist_bl_head *b = d_hash(hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ /* * The hash list is protected using RCU. * * Take d_lock when comparing a candidate dentry, to avoid races * with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ rcu_read_lock(); hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { if (dentry->d_name.hash != hash) continue; spin_lock(&dentry->d_lock); if (dentry->d_parent != parent) goto next; if (d_unhashed(dentry)) goto next; if (!d_same_name(dentry, parent, name)) goto next; dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; next: spin_unlock(&dentry->d_lock); } rcu_read_unlock(); return found; } /** * d_hash_and_lookup - hash the qstr then search for a dentry * @dir: Directory to search in * @name: qstr of name we wish to find * * On lookup failure NULL is returned; on bad name - ERR_PTR(-error) */ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) { /* * Check for a fs-specific hash function. Note that we must * calculate the standard hash first, as the d_op->d_hash() * routine may choose to leave the hash value unchanged. */ name->hash = full_name_hash(dir, name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) return ERR_PTR(err); } return d_lookup(dir, name); } /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry * - unhash this dentry and free it. * * Usually, we want to just turn this into * a negative dentry, but if anybody else is * currently using the dentry or the inode * we can't do that and we fall back on removing * it from the hash queues and waiting for * it to be deleted later when it has no users */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * * Turn the dentry into a negative dentry if possible, otherwise * remove it from the hash queues so it can be deleted later */ void d_delete(struct dentry * dentry) { struct inode *inode = dentry->d_inode; spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); /* * Are we the only user? */ if (dentry->d_lockref.count == 1) { if (dentry_negative_policy) __d_drop(dentry); dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_delete); static void __d_rehash(struct dentry *entry) { struct hlist_bl_head *b = d_hash(entry->d_name.hash); hlist_bl_lock(b); hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } /** * d_rehash - add an entry back to the hash * @entry: dentry to add to the hash * * Adds a dentry to the hash according to its name. */ void d_rehash(struct dentry * entry) { spin_lock(&entry->d_lock); __d_rehash(entry); spin_unlock(&entry->d_lock); } EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { preempt_disable_nested(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) return n; cpu_relax(); } } static inline void end_dir_add(struct inode *dir, unsigned int n, wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); preempt_enable_nested(); if (wq_has_sleeper(d_wait)) wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) { if (d_in_lookup(dentry)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(dentry->d_wait, &wait); do { set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&dentry->d_lock); schedule(); spin_lock(&dentry->d_lock); } while (d_in_lookup(dentry)); } } struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, wait_queue_head_t *wq) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); struct hlist_bl_node *node; struct dentry *new = d_alloc(parent, name); struct dentry *dentry; unsigned seq, r_seq, d_seq; if (unlikely(!new)) return ERR_PTR(-ENOMEM); retry: rcu_read_lock(); seq = smp_load_acquire(&parent->d_inode->i_dir_seq); r_seq = read_seqbegin(&rename_lock); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } if (read_seqcount_retry(&dentry->d_seq, d_seq)) { rcu_read_unlock(); dput(dentry); goto retry; } rcu_read_unlock(); dput(new); return dentry; } if (unlikely(read_seqretry(&rename_lock, r_seq))) { rcu_read_unlock(); goto retry; } if (unlikely(seq & 1)) { rcu_read_unlock(); goto retry; } hlist_bl_lock(b); if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { hlist_bl_unlock(b); rcu_read_unlock(); goto retry; } /* * No changes for the parent since the beginning of d_lookup(). * Since all removals from the chain happen with hlist_bl_lock(), * any potential in-lookup matches are going to stay here until * we unlock the chain. All fields are stable in everything * we encounter. */ hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) continue; if (!d_same_name(dentry, parent, name)) continue; hlist_bl_unlock(b); /* now we can try to grab a reference */ if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } rcu_read_unlock(); /* * somebody is likely to be still doing lookup for it; * wait for them to finish */ spin_lock(&dentry->d_lock); d_wait_lookup(dentry); /* * it's not in-lookup anymore; in principle we should repeat * everything from dcache lookup, but it's likely to be what * d_lookup() would've found anyway. If it is, just return it; * otherwise we really have to repeat the whole thing. */ if (unlikely(dentry->d_name.hash != hash)) goto mismatch; if (unlikely(dentry->d_parent != parent)) goto mismatch; if (unlikely(d_unhashed(dentry))) goto mismatch; if (unlikely(!d_same_name(dentry, parent, name))) goto mismatch; /* OK, it *is* a hashed match; return it */ spin_unlock(&dentry->d_lock); dput(new); return dentry; } rcu_read_unlock(); /* we can't take ->d_lock here; it's OK, though. */ new->d_flags |= DCACHE_PAR_LOOKUP; new->d_wait = wq; hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: spin_unlock(&dentry->d_lock); dput(dentry); goto retry; } EXPORT_SYMBOL(d_alloc_parallel); /* * - Unhash the dentry * - Retrieve and clear the waitqueue head in dentry * - Return the waitqueue head */ static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) { wait_queue_head_t *d_wait; struct hlist_bl_head *b; lockdep_assert_held(&dentry->d_lock); b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_lru); return d_wait; } void __d_lookup_unhash_wake(struct dentry *dentry) { spin_lock(&dentry->d_lock); wake_up_all(__d_lookup_unhash(dentry)); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(__d_lookup_unhash_wake); /* inode->i_lock held if inode is non-NULL */ static inline void __d_add(struct dentry *dentry, struct inode *inode) { wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; spin_lock(&dentry->d_lock); if (unlikely(d_in_lookup(dentry))) { dir = dentry->d_parent->d_inode; n = start_dir_add(dir); d_wait = __d_lookup_unhash(dentry); } if (inode) { unsigned add_flags = d_flags_for_inode(inode); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); } __d_rehash(dentry); if (dir) end_dir_add(dir, n, d_wait); spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); } /** * d_add - add dentry to hash queues * @entry: dentry to add * @inode: The inode to attach to this dentry * * This adds the entry to the hash queues and initializes @inode. * The entry was actually filled in earlier during d_alloc(). */ void d_add(struct dentry *entry, struct inode *inode) { if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); } __d_add(entry, inode); } EXPORT_SYMBOL(d_add); static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { if (unlikely(dname_external(dentry))) { /* * Both external: swap the pointers */ swap(target->d_name.name, dentry->d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ dentry->d_name.name = target->d_name.name; target->d_shortname = dentry->d_shortname; target->d_name.name = target->d_shortname.string; } } else { if (unlikely(dname_external(dentry))) { /* * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ target->d_name.name = dentry->d_name.name; dentry->d_shortname = target->d_shortname; dentry->d_name.name = dentry->d_shortname.string; } else { /* * Both are internal. */ for (int i = 0; i < DNAME_INLINE_WORDS; i++) swap(dentry->d_shortname.words[i], target->d_shortname.words[i]); } } swap(dentry->d_name.hash_len, target->d_name.hash_len); } static void copy_name(struct dentry *dentry, struct dentry *target) { struct external_name *old_name = NULL; if (unlikely(dname_external(dentry))) old_name = external_name(dentry); if (unlikely(dname_external(target))) { atomic_inc(&external_name(target)->count); dentry->d_name = target->d_name; } else { dentry->d_shortname = target->d_shortname; dentry->d_name.name = dentry->d_shortname.string; dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->count))) kfree_rcu(old_name, head); } /* * __d_move - move a dentry * @dentry: entry to move * @target: new dentry * @exchange: exchange the two dentries * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. Caller must hold * rename_lock, the i_mutex of the source and target directories, * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { struct dentry *old_parent, *p; wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; WARN_ON(!dentry->d_inode); if (WARN_ON(dentry == target)) return; BUG_ON(d_ancestor(target, dentry)); old_parent = dentry->d_parent; p = d_ancestor(old_parent, target); if (IS_ROOT(dentry)) { BUG_ON(p); spin_lock(&target->d_parent->d_lock); } else if (!p) { /* target is not a descendent of dentry->d_parent */ spin_lock(&target->d_parent->d_lock); spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); } else { BUG_ON(p == dentry); spin_lock(&old_parent->d_lock); if (p != target) spin_lock_nested(&target->d_parent->d_lock, DENTRY_D_LOCK_NESTED); } spin_lock_nested(&dentry->d_lock, 2); spin_lock_nested(&target->d_lock, 3); if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; n = start_dir_add(dir); d_wait = __d_lookup_unhash(target); } write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); /* unhash both */ if (!d_unhashed(dentry)) ___d_drop(dentry); if (!d_unhashed(target)) ___d_drop(target); /* ... and switch them in the tree */ dentry->d_parent = target->d_parent; if (!exchange) { copy_name(dentry, target); target->d_hash.pprev = NULL; dentry->d_parent->d_lockref.count++; if (dentry != old_parent) /* wasn't IS_ROOT */ WARN_ON(!--old_parent->d_lockref.count); } else { target->d_parent = old_parent; swap_names(dentry, target); if (!hlist_unhashed(&target->d_sib)) __hlist_del(&target->d_sib); hlist_add_head(&target->d_sib, &target->d_parent->d_children); __d_rehash(target); fsnotify_update_flags(target); } if (!hlist_unhashed(&dentry->d_sib)) __hlist_del(&dentry->d_sib); hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children); __d_rehash(dentry); fsnotify_update_flags(dentry); fscrypt_handle_d_move(dentry); write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); if (dir) end_dir_add(dir, n, d_wait); if (dentry->d_parent != old_parent) spin_unlock(&dentry->d_parent->d_lock); if (dentry != old_parent) spin_unlock(&old_parent->d_lock); spin_unlock(&target->d_lock); spin_unlock(&dentry->d_lock); } /* * d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. See the locking * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); __d_move(dentry, target, false); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); /* * d_exchange - exchange two dentries * @dentry1: first dentry * @dentry2: second dentry */ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) { write_seqlock(&rename_lock); WARN_ON(!dentry1->d_inode); WARN_ON(!dentry2->d_inode); WARN_ON(IS_ROOT(dentry1)); WARN_ON(IS_ROOT(dentry2)); __d_move(dentry1, dentry2, true); write_sequnlock(&rename_lock); } /** * d_ancestor - search for an ancestor * @p1: ancestor dentry * @p2: child dentry * * Returns the ancestor dentry of p2 which is a child of p1, if p1 is * an ancestor of p2, else NULL. */ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) { struct dentry *p; for (p = p2; !IS_ROOT(p); p = p->d_parent) { if (p->d_parent == p1) return p; } return NULL; } /* * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding * dentry->d_parent->d_inode->i_mutex, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ static int __d_unalias(struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL; struct rw_semaphore *m2 = NULL; int ret = -ESTALE; /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) goto out_unalias; /* See lock_rename() */ if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; if (!inode_trylock_shared(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: if (alias->d_op && alias->d_op->d_unalias_trylock && !alias->d_op->d_unalias_trylock(alias)) goto out_err; __d_move(alias, dentry, false); if (alias->d_op && alias->d_op->d_unalias_unlock) alias->d_op->d_unalias_unlock(alias); ret = 0; out_err: if (m2) up_read(m2); if (m1) mutex_unlock(m1); return ret; } /** * d_splice_alias - splice a disconnected dentry into the tree if one exists * @inode: the inode which may have a disconnected dentry * @dentry: a negative dentry which we want to point to the inode. * * If inode is a directory and has an IS_ROOT alias, then d_move that in * place of the given dentry and return it, else simply d_add the inode * to the dentry and return NULL. * * If a non-IS_ROOT directory is found, the filesystem is corrupt, and * we should error out: directories can't have multiple aliases. * * This is needed in the lookup routine of any filesystem that is exportable * (via knfsd) so that we can build dcache paths to directories effectively. * * If a dentry was found and moved, then it is returned. Otherwise NULL * is returned. This matches the expected return value of ->lookup. * * Cluster filesystems may call this function with a negative, hashed dentry. * In that case, we know that the inode will be a regular file, and also this * will only occur during atomic_open. So we need to check for the dentry * being already hashed only in the final case. */ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { if (IS_ERR(inode)) return ERR_CAST(inode); BUG_ON(!d_unhashed(dentry)); if (!inode) goto out; security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { /* The reference to new ensures it remains an alias */ spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( "VFS: Lookup of '%s' in %s %s" " would have caused loop\n", dentry->d_name.name, inode->i_sb->s_type->name, inode->i_sb->s_id); } else if (!IS_ROOT(new)) { struct dentry *old_parent = dget(new->d_parent); int err = __d_unalias(dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); new = ERR_PTR(err); } dput(old_parent); } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); } iput(inode); return new; } } out: __d_add(dentry, inode); return NULL; } EXPORT_SYMBOL(d_splice_alias); /* * Test whether new_dentry is a subdirectory of old_dentry. * * Trivially implemented using the dcache structure */ /** * is_subdir - is new dentry a subdirectory of old_dentry * @new_dentry: new dentry * @old_dentry: old dentry * * Returns true if new_dentry is a subdirectory of the parent (at any depth). * Returns false otherwise. * Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { bool subdir; unsigned seq; if (new_dentry == old_dentry) return true; /* Access d_parent under rcu as d_move() may change it. */ rcu_read_lock(); seq = read_seqbegin(&rename_lock); subdir = d_ancestor(old_dentry, new_dentry); /* Try lockless once... */ if (read_seqretry(&rename_lock, seq)) { /* ...else acquire lock for progress even on deep chains. */ read_seqlock_excl(&rename_lock); subdir = d_ancestor(old_dentry, new_dentry); read_sequnlock_excl(&rename_lock); } rcu_read_unlock(); return subdir; } EXPORT_SYMBOL(is_subdir); static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { struct dentry *root = data; if (dentry != root) { if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; dentry->d_lockref.count--; } } return D_WALK_CONTINUE; } void d_genocide(struct dentry *parent) { d_walk(parent, parent, d_genocide_kill); } void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; BUG_ON(dname_external(dentry) || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); } EXPORT_SYMBOL(d_mark_tmpfile); void d_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; inode_dec_link_count(inode); d_mark_tmpfile(file, inode); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); /* * Obtain inode number of the parent dentry. */ ino_t d_parent_ino(struct dentry *dentry) { struct dentry *parent; struct inode *iparent; unsigned seq; ino_t ret; scoped_guard(rcu) { seq = raw_seqcount_begin(&dentry->d_seq); parent = READ_ONCE(dentry->d_parent); iparent = d_inode_rcu(parent); if (likely(iparent)) { ret = iparent->i_ino; if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; } } spin_lock(&dentry->d_lock); ret = dentry->d_parent->d_inode->i_ino; spin_unlock(&dentry->d_lock); return ret; } EXPORT_SYMBOL(d_parent_ino); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { if (!str) return 0; dhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ if (hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_EARLY | HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; runtime_const_init(shift, d_hash_shift); runtime_const_init(ptr, dentry_hashtable); } static void __init dcache_init(void) { /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature * of the dcache. */ dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_shortname.string); /* Hash may have been set up in dcache_init_early */ if (!hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; runtime_const_init(shift, d_hash_shift); runtime_const_init(ptr, dentry_hashtable); } /* SLAB cache for __getname() consumers */ struct kmem_cache *names_cachep __ro_after_init; EXPORT_SYMBOL(names_cachep); void __init vfs_caches_init_early(void) { int i; for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++) INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]); dcache_init_early(); inode_init_early(); } void __init vfs_caches_init(void) { names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL); dcache_init(); inode_init(); files_init(); files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); }
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_STRUCT_H #define _LINUX_FS_STRUCT_H #include <linux/path.h> #include <linux/spinlock.h> #include <linux/seqlock.h> struct fs_struct { int users; spinlock_t lock; seqcount_spinlock_t seq; int umask; int in_exec; struct path root, pwd; } __randomize_layout; extern struct kmem_cache *fs_cachep; extern void exit_fs(struct task_struct *); extern void set_fs_root(struct fs_struct *, const struct path *); extern void set_fs_pwd(struct fs_struct *, const struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); extern void free_fs_struct(struct fs_struct *); extern int unshare_fs_struct(void); static inline void get_fs_root(struct fs_struct *fs, struct path *root) { spin_lock(&fs->lock); *root = fs->root; path_get(root); spin_unlock(&fs->lock); } static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd) { spin_lock(&fs->lock); *pwd = fs->pwd; path_get(pwd); spin_unlock(&fs->lock); } extern bool current_chrooted(void); #endif /* _LINUX_FS_STRUCT_H */
464 463 465 463 464 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LOCAL_LOCK_H # error "Do not include directly, include linux/local_lock.h" #endif #include <linux/percpu-defs.h> #include <linux/lockdep.h> #ifndef CONFIG_PREEMPT_RT typedef struct { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; struct task_struct *owner; #endif } local_lock_t; /* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */ typedef struct { local_lock_t llock; u8 acquired; } local_trylock_t; #ifdef CONFIG_DEBUG_LOCK_ALLOC # define LOCAL_LOCK_DEBUG_INIT(lockname) \ .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_CONFIG, \ .lock_type = LD_LOCK_PERCPU, \ }, \ .owner = NULL, # define LOCAL_TRYLOCK_DEBUG_INIT(lockname) \ .llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) }, static inline void local_lock_acquire(local_lock_t *l) { lock_map_acquire(&l->dep_map); DEBUG_LOCKS_WARN_ON(l->owner); l->owner = current; } static inline void local_trylock_acquire(local_lock_t *l) { lock_map_acquire_try(&l->dep_map); DEBUG_LOCKS_WARN_ON(l->owner); l->owner = current; } static inline void local_lock_release(local_lock_t *l) { DEBUG_LOCKS_WARN_ON(l->owner != current); l->owner = NULL; lock_map_release(&l->dep_map); } static inline void local_lock_debug_init(local_lock_t *l) { l->owner = NULL; } #else /* CONFIG_DEBUG_LOCK_ALLOC */ # define LOCAL_LOCK_DEBUG_INIT(lockname) # define LOCAL_TRYLOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { } static inline void local_trylock_acquire(local_lock_t *l) { } static inline void local_lock_release(local_lock_t *l) { } static inline void local_lock_debug_init(local_lock_t *l) { } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } #define INIT_LOCAL_TRYLOCK(lockname) { LOCAL_TRYLOCK_DEBUG_INIT(lockname) } #define __local_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, \ 0, LD_WAIT_CONFIG, LD_WAIT_INV, \ LD_LOCK_PERCPU); \ local_lock_debug_init(lock); \ } while (0) #define __local_trylock_init(lock) __local_lock_init(lock.llock) #define __spinlock_nested_bh_init(lock) \ do { \ static struct lock_class_key __key; \ \ debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, \ 0, LD_WAIT_CONFIG, LD_WAIT_INV, \ LD_LOCK_NORMAL); \ local_lock_debug_init(lock); \ } while (0) #define __local_lock_acquire(lock) \ do { \ local_trylock_t *tl; \ local_lock_t *l; \ \ l = (local_lock_t *)this_cpu_ptr(lock); \ tl = (local_trylock_t *)l; \ _Generic((lock), \ __percpu local_trylock_t *: ({ \ lockdep_assert(tl->acquired == 0); \ WRITE_ONCE(tl->acquired, 1); \ }), \ __percpu local_lock_t *: (void)0); \ local_lock_acquire(l); \ } while (0) #define __local_lock(lock) \ do { \ preempt_disable(); \ __local_lock_acquire(lock); \ } while (0) #define __local_lock_irq(lock) \ do { \ local_irq_disable(); \ __local_lock_acquire(lock); \ } while (0) #define __local_lock_irqsave(lock, flags) \ do { \ local_irq_save(flags); \ __local_lock_acquire(lock); \ } while (0) #define __local_trylock(lock) \ ({ \ local_trylock_t *tl; \ \ preempt_disable(); \ tl = this_cpu_ptr(lock); \ if (READ_ONCE(tl->acquired)) { \ preempt_enable(); \ tl = NULL; \ } else { \ WRITE_ONCE(tl->acquired, 1); \ local_trylock_acquire( \ (local_lock_t *)tl); \ } \ !!tl; \ }) #define __local_trylock_irqsave(lock, flags) \ ({ \ local_trylock_t *tl; \ \ local_irq_save(flags); \ tl = this_cpu_ptr(lock); \ if (READ_ONCE(tl->acquired)) { \ local_irq_restore(flags); \ tl = NULL; \ } else { \ WRITE_ONCE(tl->acquired, 1); \ local_trylock_acquire( \ (local_lock_t *)tl); \ } \ !!tl; \ }) #define __local_lock_release(lock) \ do { \ local_trylock_t *tl; \ local_lock_t *l; \ \ l = (local_lock_t *)this_cpu_ptr(lock); \ tl = (local_trylock_t *)l; \ local_lock_release(l); \ _Generic((lock), \ __percpu local_trylock_t *: ({ \ lockdep_assert(tl->acquired == 1); \ WRITE_ONCE(tl->acquired, 0); \ }), \ __percpu local_lock_t *: (void)0); \ } while (0) #define __local_unlock(lock) \ do { \ __local_lock_release(lock); \ preempt_enable(); \ } while (0) #define __local_unlock_irq(lock) \ do { \ __local_lock_release(lock); \ local_irq_enable(); \ } while (0) #define __local_unlock_irqrestore(lock, flags) \ do { \ __local_lock_release(lock); \ local_irq_restore(flags); \ } while (0) #define __local_lock_nested_bh(lock) \ do { \ lockdep_assert_in_softirq(); \ local_lock_acquire(this_cpu_ptr(lock)); \ } while (0) #define __local_unlock_nested_bh(lock) \ local_lock_release(this_cpu_ptr(lock)) #else /* !CONFIG_PREEMPT_RT */ /* * On PREEMPT_RT local_lock maps to a per CPU spinlock, which protects the * critical section while staying preemptible. */ typedef spinlock_t local_lock_t; typedef spinlock_t local_trylock_t; #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) #define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) #define __local_lock_init(l) \ do { \ local_spin_lock_init((l)); \ } while (0) #define __local_trylock_init(l) __local_lock_init(l) #define __local_lock(__lock) \ do { \ migrate_disable(); \ spin_lock(this_cpu_ptr((__lock))); \ } while (0) #define __local_lock_irq(lock) __local_lock(lock) #define __local_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ flags = 0; \ __local_lock(lock); \ } while (0) #define __local_unlock(__lock) \ do { \ spin_unlock(this_cpu_ptr((__lock))); \ migrate_enable(); \ } while (0) #define __local_unlock_irq(lock) __local_unlock(lock) #define __local_unlock_irqrestore(lock, flags) __local_unlock(lock) #define __local_lock_nested_bh(lock) \ do { \ lockdep_assert_in_softirq_func(); \ spin_lock(this_cpu_ptr(lock)); \ } while (0) #define __local_unlock_nested_bh(lock) \ do { \ spin_unlock(this_cpu_ptr((lock))); \ } while (0) #define __local_trylock(lock) \ ({ \ int __locked; \ \ if (in_nmi() | in_hardirq()) { \ __locked = 0; \ } else { \ migrate_disable(); \ __locked = spin_trylock(this_cpu_ptr((lock))); \ if (!__locked) \ migrate_enable(); \ } \ __locked; \ }) #define __local_trylock_irqsave(lock, flags) \ ({ \ typecheck(unsigned long, flags); \ flags = 0; \ __local_trylock(lock); \ }) #endif /* CONFIG_PREEMPT_RT */
1526 1551 8 22 22 217 217 1501 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/tomoyo.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/lsm_hooks.h> #include <uapi/linux/lsm.h> #include "common.h" /** * tomoyo_domain - Get "struct tomoyo_domain_info" for current thread. * * Returns pointer to "struct tomoyo_domain_info" for current thread. */ struct tomoyo_domain_info *tomoyo_domain(void) { struct tomoyo_task *s = tomoyo_task(current); if (s->old_domain_info && !current->in_execve) { atomic_dec(&s->old_domain_info->users); s->old_domain_info = NULL; } return s->domain_info; } /** * tomoyo_cred_prepare - Target for security_prepare_creds(). * * @new: Pointer to "struct cred". * @old: Pointer to "struct cred". * @gfp: Memory allocation flags. * * Returns 0. */ static int tomoyo_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { /* Restore old_domain_info saved by previous execve() request. */ struct tomoyo_task *s = tomoyo_task(current); if (s->old_domain_info && !current->in_execve) { atomic_dec(&s->domain_info->users); s->domain_info = s->old_domain_info; s->old_domain_info = NULL; } return 0; } /** * tomoyo_bprm_committed_creds - Target for security_bprm_committed_creds(). * * @bprm: Pointer to "struct linux_binprm". */ static void tomoyo_bprm_committed_creds(const struct linux_binprm *bprm) { /* Clear old_domain_info saved by execve() request. */ struct tomoyo_task *s = tomoyo_task(current); atomic_dec(&s->old_domain_info->users); s->old_domain_info = NULL; } #ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER /** * tomoyo_bprm_creds_for_exec - Target for security_bprm_creds_for_exec(). * * @bprm: Pointer to "struct linux_binprm". * * Returns 0. */ static int tomoyo_bprm_creds_for_exec(struct linux_binprm *bprm) { /* * Load policy if /sbin/tomoyo-init exists and /sbin/init is requested * for the first time. */ if (!tomoyo_policy_loaded) tomoyo_load_policy(bprm->filename); return 0; } #endif /** * tomoyo_bprm_check_security - Target for security_bprm_check(). * * @bprm: Pointer to "struct linux_binprm". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_bprm_check_security(struct linux_binprm *bprm) { struct tomoyo_task *s = tomoyo_task(current); /* * Execute permission is checked against pathname passed to execve() * using current domain. */ if (!s->old_domain_info) { const int idx = tomoyo_read_lock(); const int err = tomoyo_find_next_domain(bprm); tomoyo_read_unlock(idx); return err; } /* * Read permission is checked against interpreters using next domain. */ return tomoyo_check_open_permission(s->domain_info, &bprm->file->f_path, O_RDONLY); } /** * tomoyo_inode_getattr - Target for security_inode_getattr(). * * @path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_inode_getattr(const struct path *path) { return tomoyo_path_perm(TOMOYO_TYPE_GETATTR, path, NULL); } /** * tomoyo_path_truncate - Target for security_path_truncate(). * * @path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_truncate(const struct path *path) { return tomoyo_path_perm(TOMOYO_TYPE_TRUNCATE, path, NULL); } /** * tomoyo_file_truncate - Target for security_file_truncate(). * * @file: Pointer to "struct file". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_truncate(struct file *file) { return tomoyo_path_truncate(&file->f_path); } /** * tomoyo_path_unlink - Target for security_path_unlink(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_unlink(const struct path *parent, struct dentry *dentry) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_perm(TOMOYO_TYPE_UNLINK, &path, NULL); } /** * tomoyo_path_mkdir - Target for security_path_mkdir(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * @mode: DAC permission mode. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_mkdir(const struct path *parent, struct dentry *dentry, umode_t mode) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_number_perm(TOMOYO_TYPE_MKDIR, &path, mode & S_IALLUGO); } /** * tomoyo_path_rmdir - Target for security_path_rmdir(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_rmdir(const struct path *parent, struct dentry *dentry) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_perm(TOMOYO_TYPE_RMDIR, &path, NULL); } /** * tomoyo_path_symlink - Target for security_path_symlink(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * @old_name: Symlink's content. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_symlink(const struct path *parent, struct dentry *dentry, const char *old_name) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; return tomoyo_path_perm(TOMOYO_TYPE_SYMLINK, &path, old_name); } /** * tomoyo_path_mknod - Target for security_path_mknod(). * * @parent: Pointer to "struct path". * @dentry: Pointer to "struct dentry". * @mode: DAC permission mode. * @dev: Device attributes. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_mknod(const struct path *parent, struct dentry *dentry, umode_t mode, unsigned int dev) { struct path path = { .mnt = parent->mnt, .dentry = dentry }; int type = TOMOYO_TYPE_CREATE; const unsigned int perm = mode & S_IALLUGO; switch (mode & S_IFMT) { case S_IFCHR: type = TOMOYO_TYPE_MKCHAR; break; case S_IFBLK: type = TOMOYO_TYPE_MKBLOCK; break; default: goto no_dev; } return tomoyo_mkdev_perm(type, &path, perm, dev); no_dev: switch (mode & S_IFMT) { case S_IFIFO: type = TOMOYO_TYPE_MKFIFO; break; case S_IFSOCK: type = TOMOYO_TYPE_MKSOCK; break; } return tomoyo_path_number_perm(type, &path, perm); } /** * tomoyo_path_link - Target for security_path_link(). * * @old_dentry: Pointer to "struct dentry". * @new_dir: Pointer to "struct path". * @new_dentry: Pointer to "struct dentry". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct path path1 = { .mnt = new_dir->mnt, .dentry = old_dentry }; struct path path2 = { .mnt = new_dir->mnt, .dentry = new_dentry }; return tomoyo_path2_perm(TOMOYO_TYPE_LINK, &path1, &path2); } /** * tomoyo_path_rename - Target for security_path_rename(). * * @old_parent: Pointer to "struct path". * @old_dentry: Pointer to "struct dentry". * @new_parent: Pointer to "struct path". * @new_dentry: Pointer to "struct dentry". * @flags: Rename options. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_rename(const struct path *old_parent, struct dentry *old_dentry, const struct path *new_parent, struct dentry *new_dentry, const unsigned int flags) { struct path path1 = { .mnt = old_parent->mnt, .dentry = old_dentry }; struct path path2 = { .mnt = new_parent->mnt, .dentry = new_dentry }; if (flags & RENAME_EXCHANGE) { const int err = tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path2, &path1); if (err) return err; } return tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path1, &path2); } /** * tomoyo_file_fcntl - Target for security_file_fcntl(). * * @file: Pointer to "struct file". * @cmd: Command for fcntl(). * @arg: Argument for @cmd. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { if (!(cmd == F_SETFL && ((arg ^ file->f_flags) & O_APPEND))) return 0; return tomoyo_check_open_permission(tomoyo_domain(), &file->f_path, O_WRONLY | (arg & O_APPEND)); } /** * tomoyo_file_open - Target for security_file_open(). * * @f: Pointer to "struct file". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_open(struct file *f) { /* Don't check read permission here if called from execve(). */ /* Illogically, FMODE_EXEC is in f_flags, not f_mode. */ if (f->f_flags & __FMODE_EXEC) return 0; return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path, f->f_flags); } /** * tomoyo_file_ioctl - Target for security_file_ioctl(). * * @file: Pointer to "struct file". * @cmd: Command for ioctl(). * @arg: Argument for @cmd. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return tomoyo_path_number_perm(TOMOYO_TYPE_IOCTL, &file->f_path, cmd); } /** * tomoyo_path_chmod - Target for security_path_chmod(). * * @path: Pointer to "struct path". * @mode: DAC permission mode. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_chmod(const struct path *path, umode_t mode) { return tomoyo_path_number_perm(TOMOYO_TYPE_CHMOD, path, mode & S_IALLUGO); } /** * tomoyo_path_chown - Target for security_path_chown(). * * @path: Pointer to "struct path". * @uid: Owner ID. * @gid: Group ID. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { int error = 0; if (uid_valid(uid)) error = tomoyo_path_number_perm(TOMOYO_TYPE_CHOWN, path, from_kuid(&init_user_ns, uid)); if (!error && gid_valid(gid)) error = tomoyo_path_number_perm(TOMOYO_TYPE_CHGRP, path, from_kgid(&init_user_ns, gid)); return error; } /** * tomoyo_path_chroot - Target for security_path_chroot(). * * @path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_path_chroot(const struct path *path) { return tomoyo_path_perm(TOMOYO_TYPE_CHROOT, path, NULL); } /** * tomoyo_sb_mount - Target for security_sb_mount(). * * @dev_name: Name of device file. Maybe NULL. * @path: Pointer to "struct path". * @type: Name of filesystem type. Maybe NULL. * @flags: Mount options. * @data: Optional data. Maybe NULL. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { return tomoyo_mount_permission(dev_name, path, type, flags, data); } /** * tomoyo_sb_umount - Target for security_sb_umount(). * * @mnt: Pointer to "struct vfsmount". * @flags: Unmount options. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_sb_umount(struct vfsmount *mnt, int flags) { struct path path = { .mnt = mnt, .dentry = mnt->mnt_root }; return tomoyo_path_perm(TOMOYO_TYPE_UMOUNT, &path, NULL); } /** * tomoyo_sb_pivotroot - Target for security_sb_pivotroot(). * * @old_path: Pointer to "struct path". * @new_path: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_sb_pivotroot(const struct path *old_path, const struct path *new_path) { return tomoyo_path2_perm(TOMOYO_TYPE_PIVOT_ROOT, new_path, old_path); } /** * tomoyo_socket_listen - Check permission for listen(). * * @sock: Pointer to "struct socket". * @backlog: Backlog parameter. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_listen(struct socket *sock, int backlog) { return tomoyo_socket_listen_permission(sock); } /** * tomoyo_socket_connect - Check permission for connect(). * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_connect(struct socket *sock, struct sockaddr *addr, int addr_len) { return tomoyo_socket_connect_permission(sock, addr, addr_len); } /** * tomoyo_socket_bind - Check permission for bind(). * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { return tomoyo_socket_bind_permission(sock, addr, addr_len); } /** * tomoyo_socket_sendmsg - Check permission for sendmsg(). * * @sock: Pointer to "struct socket". * @msg: Pointer to "struct msghdr". * @size: Size of message. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return tomoyo_socket_sendmsg_permission(sock, msg, size); } struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = { .lbs_task = sizeof(struct tomoyo_task), }; /** * tomoyo_task_alloc - Target for security_task_alloc(). * * @task: Pointer to "struct task_struct". * @clone_flags: clone() flags. * * Returns 0. */ static int tomoyo_task_alloc(struct task_struct *task, unsigned long clone_flags) { struct tomoyo_task *old = tomoyo_task(current); struct tomoyo_task *new = tomoyo_task(task); new->domain_info = old->domain_info; atomic_inc(&new->domain_info->users); new->old_domain_info = NULL; return 0; } /** * tomoyo_task_free - Target for security_task_free(). * * @task: Pointer to "struct task_struct". */ static void tomoyo_task_free(struct task_struct *task) { struct tomoyo_task *s = tomoyo_task(task); if (s->domain_info) { atomic_dec(&s->domain_info->users); s->domain_info = NULL; } if (s->old_domain_info) { atomic_dec(&s->old_domain_info->users); s->old_domain_info = NULL; } } static const struct lsm_id tomoyo_lsmid = { .name = "tomoyo", .id = LSM_ID_TOMOYO, }; /* tomoyo_hooks is used for registering TOMOYO. */ static struct security_hook_list tomoyo_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare), LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds), LSM_HOOK_INIT(task_alloc, tomoyo_task_alloc), LSM_HOOK_INIT(task_free, tomoyo_task_free), #ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER LSM_HOOK_INIT(bprm_creds_for_exec, tomoyo_bprm_creds_for_exec), #endif LSM_HOOK_INIT(bprm_check_security, tomoyo_bprm_check_security), LSM_HOOK_INIT(file_fcntl, tomoyo_file_fcntl), LSM_HOOK_INIT(file_open, tomoyo_file_open), LSM_HOOK_INIT(file_truncate, tomoyo_file_truncate), LSM_HOOK_INIT(path_truncate, tomoyo_path_truncate), LSM_HOOK_INIT(path_unlink, tomoyo_path_unlink), LSM_HOOK_INIT(path_mkdir, tomoyo_path_mkdir), LSM_HOOK_INIT(path_rmdir, tomoyo_path_rmdir), LSM_HOOK_INIT(path_symlink, tomoyo_path_symlink), LSM_HOOK_INIT(path_mknod, tomoyo_path_mknod), LSM_HOOK_INIT(path_link, tomoyo_path_link), LSM_HOOK_INIT(path_rename, tomoyo_path_rename), LSM_HOOK_INIT(inode_getattr, tomoyo_inode_getattr), LSM_HOOK_INIT(file_ioctl, tomoyo_file_ioctl), LSM_HOOK_INIT(file_ioctl_compat, tomoyo_file_ioctl), LSM_HOOK_INIT(path_chmod, tomoyo_path_chmod), LSM_HOOK_INIT(path_chown, tomoyo_path_chown), LSM_HOOK_INIT(path_chroot, tomoyo_path_chroot), LSM_HOOK_INIT(sb_mount, tomoyo_sb_mount), LSM_HOOK_INIT(sb_umount, tomoyo_sb_umount), LSM_HOOK_INIT(sb_pivotroot, tomoyo_sb_pivotroot), LSM_HOOK_INIT(socket_bind, tomoyo_socket_bind), LSM_HOOK_INIT(socket_connect, tomoyo_socket_connect), LSM_HOOK_INIT(socket_listen, tomoyo_socket_listen), LSM_HOOK_INIT(socket_sendmsg, tomoyo_socket_sendmsg), }; /* Lock for GC. */ DEFINE_SRCU(tomoyo_ss); int tomoyo_enabled __ro_after_init = 1; /** * tomoyo_init - Register TOMOYO Linux as a LSM module. * * Returns 0. */ static int __init tomoyo_init(void) { struct tomoyo_task *s = tomoyo_task(current); /* register ourselves with the security framework */ security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks), &tomoyo_lsmid); pr_info("TOMOYO Linux initialized\n"); s->domain_info = &tomoyo_kernel_domain; atomic_inc(&tomoyo_kernel_domain.users); s->old_domain_info = NULL; tomoyo_mm_init(); return 0; } DEFINE_LSM(tomoyo) = { .name = "tomoyo", .enabled = &tomoyo_enabled, .flags = LSM_FLAG_LEGACY_MAJOR, .blobs = &tomoyo_blob_sizes, .init = tomoyo_init, };
24 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_USER_NAMESPACE_H #define _LINUX_USER_NAMESPACE_H #include <linux/kref.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/rculist_nulls.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/rcuref.h> #include <linux/rwsem.h> #include <linux/sysctl.h> #include <linux/err.h> #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ union { struct { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; u32 nr_extents; }; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; #define USERNS_SETGROUPS_ALLOWED 1UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED struct ucounts; enum ucount_type { UCOUNT_USER_NAMESPACES, UCOUNT_PID_NAMESPACES, UCOUNT_UTS_NAMESPACES, UCOUNT_IPC_NAMESPACES, UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif #ifdef CONFIG_FANOTIFY UCOUNT_FANOTIFY_GROUPS, UCOUNT_FANOTIFY_MARKS, #endif UCOUNT_COUNTS, }; enum rlimit_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, UCOUNT_RLIMIT_MEMLOCK, UCOUNT_RLIMIT_COUNTS, }; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc; #endif struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; struct user_namespace *parent; int level; kuid_t owner; kgid_t group; struct ns_common ns; unsigned long flags; /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP * in its effective capability set at the child ns creation time. */ bool parent_could_setfcap; #ifdef CONFIG_KEYS /* List of joinable keyrings in this namespace. Modification access of * these pointers is controlled by keyring_sem. Once * user_keyring_register is set, it won't be changed, so it can be * accessed directly with READ_ONCE(). */ struct list_head keyring_name_list; struct key *user_keyring_register; struct rw_semaphore keyring_sem; #endif /* Register of per-UID persistent keyrings for this namespace */ #ifdef CONFIG_PERSISTENT_KEYRINGS struct key *persistent_keyring_register; #endif struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; long rlimit_max[UCOUNT_RLIMIT_COUNTS]; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc *binfmt_misc; #endif } __randomize_layout; struct ucounts { struct hlist_nulls_node node; struct user_namespace *ns; kuid_t uid; struct rcu_head rcu; rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; extern struct user_namespace init_user_ns; extern struct ucounts init_ucounts; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); void put_ucounts(struct ucounts *ucounts); static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) { if (rcuref_get(&ucounts->count)) return ucounts; return NULL; } static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, bool override_rlimit); void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type) { return READ_ONCE(ns->rlimit_max[type]); } static inline void set_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type, unsigned long max) { ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; } #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); return ns; } extern int create_user_ns(struct cred *new); extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { if (ns && refcount_dec_and_test(&ns->ns.count)) __put_user_ns(ns); } struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; } static inline int create_user_ns(struct cred *new) { return -EINVAL; } static inline int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { if (unshare_flags & CLONE_NEWUSER) return -EINVAL; return 0; } static inline void put_user_ns(struct user_namespace *ns) { } static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } static inline bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { return true; } static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; } static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } #endif #endif /* _LINUX_USER_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BITOPS_H #define _LINUX_BITOPS_H #include <asm/types.h> #include <linux/bits.h> #include <linux/typecheck.h> #include <uapi/linux/kernel.h> #define BITS_TO_LONGS(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) #define BITS_TO_U64(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64)) #define BITS_TO_U32(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) #define BYTES_TO_BITS(nb) ((nb) * BITS_PER_BYTE) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); extern unsigned long __sw_hweight64(__u64 w); /* * Defined here because those may be needed by architecture-specific static * inlines. */ #include <asm-generic/bitops/generic-non-atomic.h> /* * Many architecture-specific non-atomic bitops contain inline asm code and due * to that the compiler can't optimize them to compile-time expressions or * constants. In contrary, generic_*() helpers are defined in pure C and * compilers optimize them just well. * Therefore, to make `unsigned long foo = 0; __set_bit(BAR, &foo)` effectively * equal to `unsigned long foo = BIT(BAR)`, pick the generic C alternative when * the arguments can be resolved at compile time. That expression itself is a * constant and doesn't bring any functional changes to the rest of cases. * The casts to `uintptr_t` are needed to mitigate `-Waddress` warnings when * passing a bitmap from .bss or .data (-> `!!addr` is always true). */ #define bitop(op, nr, addr) \ ((__builtin_constant_p(nr) && \ __builtin_constant_p((uintptr_t)(addr) != (uintptr_t)NULL) && \ (uintptr_t)(addr) != (uintptr_t)NULL && \ __builtin_constant_p(*(const unsigned long *)(addr))) ? \ const##op(nr, addr) : op(nr, addr)) /* * The following macros are non-atomic versions of their non-underscored * counterparts. */ #define __set_bit(nr, addr) bitop(___set_bit, nr, addr) #define __clear_bit(nr, addr) bitop(___clear_bit, nr, addr) #define __change_bit(nr, addr) bitop(___change_bit, nr, addr) #define __test_and_set_bit(nr, addr) bitop(___test_and_set_bit, nr, addr) #define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) #define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) #define test_bit(nr, addr) bitop(_test_bit, nr, addr) #define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr) /* * Include this here because some architectures need generic_ffs/fls in * scope */ #include <asm/bitops.h> /* Check that the bitops prototypes are sane */ #define __check_bitop_pr(name) \ static_assert(__same_type(arch_##name, generic_##name) && \ __same_type(const_##name, generic_##name) && \ __same_type(_##name, generic_##name)) __check_bitop_pr(__set_bit); __check_bitop_pr(__clear_bit); __check_bitop_pr(__change_bit); __check_bitop_pr(__test_and_set_bit); __check_bitop_pr(__test_and_clear_bit); __check_bitop_pr(__test_and_change_bit); __check_bitop_pr(test_bit); __check_bitop_pr(test_bit_acquire); #undef __check_bitop_pr static inline int get_bitmask_order(unsigned int count) { int order; order = fls(count); return order; /* We could be slightly more clever with -1 here... */ } static __always_inline unsigned long hweight_long(unsigned long w) { return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w); } /** * rol64 - rotate a 64-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u64 rol64(__u64 word, unsigned int shift) { return (word << (shift & 63)) | (word >> ((-shift) & 63)); } /** * ror64 - rotate a 64-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u64 ror64(__u64 word, unsigned int shift) { return (word >> (shift & 63)) | (word << ((-shift) & 63)); } /** * rol32 - rotate a 32-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u32 rol32(__u32 word, unsigned int shift) { return (word << (shift & 31)) | (word >> ((-shift) & 31)); } /** * ror32 - rotate a 32-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u32 ror32(__u32 word, unsigned int shift) { return (word >> (shift & 31)) | (word << ((-shift) & 31)); } /** * rol16 - rotate a 16-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u16 rol16(__u16 word, unsigned int shift) { return (word << (shift & 15)) | (word >> ((-shift) & 15)); } /** * ror16 - rotate a 16-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u16 ror16(__u16 word, unsigned int shift) { return (word >> (shift & 15)) | (word << ((-shift) & 15)); } /** * rol8 - rotate an 8-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u8 rol8(__u8 word, unsigned int shift) { return (word << (shift & 7)) | (word >> ((-shift) & 7)); } /** * ror8 - rotate an 8-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u8 ror8(__u8 word, unsigned int shift) { return (word >> (shift & 7)) | (word << ((-shift) & 7)); } /** * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<32) to sign bit * * This is safe to use for 16- and 8-bit types as well. */ static __always_inline __s32 sign_extend32(__u32 value, int index) { __u8 shift = 31 - index; return (__s32)(value << shift) >> shift; } /** * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<64) to sign bit */ static __always_inline __s64 sign_extend64(__u64 value, int index) { __u8 shift = 63 - index; return (__s64)(value << shift) >> shift; } static inline unsigned int fls_long(unsigned long l) { if (sizeof(l) == 4) return fls(l); return fls64(l); } static inline int get_count_order(unsigned int count) { if (count == 0) return -1; return fls(--count); } /** * get_count_order_long - get order after rounding @l up to power of 2 * @l: parameter * * it is same as get_count_order() but with long type parameter */ static inline int get_count_order_long(unsigned long l) { if (l == 0UL) return -1; return (int)fls_long(--l); } /** * parity8 - get the parity of an u8 value * @value: the value to be examined * * Determine the parity of the u8 argument. * * Returns: * 0 for even parity, 1 for odd parity * * Note: This function informs you about the current parity. Example to bail * out when parity is odd: * * if (parity8(val) == 1) * return -EBADMSG; * * If you need to calculate a parity bit, you need to draw the conclusion from * this result yourself. Example to enforce odd parity, parity bit is bit 7: * * if (parity8(val) == 0) * val ^= BIT(7); */ static inline int parity8(u8 val) { /* * One explanation of this algorithm: * https://funloop.org/codex/problem/parity/README.html */ val ^= val >> 4; return (0x6996 >> (val & 0xf)) & 1; } /** * __ffs64 - find first set bit in a 64 bit word * @word: The 64 bit word * * On 64 bit arches this is a synonym for __ffs * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ static inline unsigned int __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) return __ffs((u32)(word >> 32)) + 32; #elif BITS_PER_LONG != 64 #error BITS_PER_LONG not 32 or 64 #endif return __ffs((unsigned long)word); } /** * fns - find N'th set bit in a word * @word: The word to search * @n: Bit to find */ static inline unsigned int fns(unsigned long word, unsigned int n) { while (word && n--) word &= word - 1; return word ? __ffs(word) : BITS_PER_LONG; } /** * assign_bit - Assign value to a bit in memory * @nr: the bit to set * @addr: the address to start counting from * @value: the value to assign */ #define assign_bit(nr, addr, value) \ ((value) ? set_bit((nr), (addr)) : clear_bit((nr), (addr))) #define __assign_bit(nr, addr, value) \ ((value) ? __set_bit((nr), (addr)) : __clear_bit((nr), (addr))) /** * __ptr_set_bit - Set bit in a pointer's value * @nr: the bit to set * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_set_bit(bit, &p); */ #define __ptr_set_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __set_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_clear_bit - Clear bit in a pointer's value * @nr: the bit to clear * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_clear_bit(bit, &p); */ #define __ptr_clear_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __clear_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_test_bit - Test bit in a pointer's value * @nr: the bit to test * @addr: the address of the pointer variable * * Example: * void *p = foo(); * if (__ptr_test_bit(bit, &p)) { * ... * } else { * ... * } */ #define __ptr_test_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ test_bit(nr, (unsigned long *)(addr)); \ }) #ifdef __KERNEL__ #ifndef set_mask_bits #define set_mask_bits(ptr, mask, bits) \ ({ \ const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ new__ = (old__ & ~mask__) | bits__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ old__; \ }) #endif #ifndef bit_clear_unless #define bit_clear_unless(ptr, clear, test) \ ({ \ const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ if (old__ & test__) \ break; \ new__ = old__ & ~clear__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ !(old__ & test__); \ }) #endif #endif /* __KERNEL__ */ #endif
114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 /* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_VGIC_H #include <linux/tracepoint.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm TRACE_EVENT(vgic_update_irq_pending, TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level), TP_ARGS(vcpu_id, irq, level), TP_STRUCT__entry( __field( unsigned long, vcpu_id ) __field( __u32, irq ) __field( bool, level ) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->irq = irq; __entry->level = level; ), TP_printk("VCPU: %ld, IRQ %d, level: %d", __entry->vcpu_id, __entry->irq, __entry->level) ); #endif /* _TRACE_VGIC_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH ../../arch/arm64/kvm/vgic #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 /* SPDX-License-Identifier: GPL-2.0 */ /* * descriptor table internals; you almost certainly want file.h instead. */ #ifndef __LINUX_FDTABLE_H #define __LINUX_FDTABLE_H #include <linux/posix_types.h> #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/nospec.h> #include <linux/types.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/atomic.h> /* * The default fd array needs to be at least BITS_PER_LONG, * as this is the granularity returned by copy_fdset(). */ #define NR_OPEN_DEFAULT BITS_PER_LONG struct fdtable { unsigned int max_fds; struct file __rcu **fd; /* current fd array */ unsigned long *close_on_exec; unsigned long *open_fds; unsigned long *full_fds_bits; struct rcu_head rcu; }; /* * Open file table structure */ struct files_struct { /* * read mostly part */ atomic_t count; bool resize_in_progress; wait_queue_head_t resize_wait; struct fdtable __rcu *fdt; struct fdtable fdtab; /* * written part on a separate cache line in SMP */ spinlock_t file_lock ____cacheline_aligned_in_smp; unsigned int next_fd; unsigned long close_on_exec_init[1]; unsigned long open_fds_init[1]; unsigned long full_fds_bits_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; struct file_operations; struct vfsmount; struct dentry; #define rcu_dereference_check_fdtable(files, fdtfd) \ rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock)) #define files_fdtable(files) \ rcu_dereference_check_fdtable((files), (files)->fdt) /* * The caller must ensure that fd table isn't shared or hold rcu or file lock */ static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds); struct file *needs_masking; /* * 'mask' is zero for an out-of-bounds fd, all ones for ok. * 'fd&mask' is 'fd' for ok, or 0 for out of bounds. * * Accessing fdt->fd[0] is ok, but needs masking of the result. */ needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]); return (struct file *)(mask & (unsigned long)needs_masking); } static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd) { RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock), "suspicious rcu_dereference_check() usage"); return files_lookup_fd_raw(files, fd); } static inline bool close_on_exec(unsigned int fd, const struct files_struct *files) { return test_bit(fd, files_fdtable(files)->close_on_exec); } struct task_struct; void put_files_struct(struct files_struct *fs); int unshare_files(void); struct fd_range { unsigned int from, to; }; struct files_struct *dup_fd(struct files_struct *, struct fd_range *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), const void *); extern int close_fd(unsigned int fd); extern struct file *file_close_fd(unsigned int fd); extern struct kmem_cache *files_cachep; #endif /* __LINUX_FDTABLE_H */
1009 1009 1007 748 606 341 141 141 14 10 141 135 141 584 584 584 315 318 318 282 282 293 293 344 341 344 604 5 575 576 7 84 84 469 470 34 276 276 22 352 353 171 733 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H /* Avoid a dependency loop by declaring here. */ extern int rcuwait_wake_up(struct rcuwait *w); #include <linux/lockdep.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/rwsem.h> #include <linux/tracepoint-defs.h> #include <linux/types.h> #include <linux/cleanup.h> #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), DECLARE_TRACEPOINT(mmap_lock_start_locking); DECLARE_TRACEPOINT(mmap_lock_acquire_returned); DECLARE_TRACEPOINT(mmap_lock_released); #ifdef CONFIG_TRACING void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write); void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, bool success); void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write); static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, bool write) { if (tracepoint_enabled(mmap_lock_start_locking)) __mmap_lock_do_trace_start_locking(mm, write); } static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { if (tracepoint_enabled(mmap_lock_acquire_returned)) __mmap_lock_do_trace_acquire_returned(mm, write, success); } static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) { if (tracepoint_enabled(mmap_lock_released)) __mmap_lock_do_trace_released(mm, write); } #else /* !CONFIG_TRACING */ static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, bool write) { } static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { } static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) { } #endif /* CONFIG_TRACING */ static inline void mmap_assert_locked(const struct mm_struct *mm) { rwsem_assert_held(&mm->mmap_lock); } static inline void mmap_assert_write_locked(const struct mm_struct *mm) { rwsem_assert_held_write(&mm->mmap_lock); } #ifdef CONFIG_PER_VMA_LOCK static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); } static inline void mm_lock_seqcount_begin(struct mm_struct *mm) { do_raw_write_seqcount_begin(&mm->mm_lock_seq); } static inline void mm_lock_seqcount_end(struct mm_struct *mm) { ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); do_raw_write_seqcount_end(&mm->mm_lock_seq); } static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) { /* * Since mmap_lock is a sleeping lock, and waiting for it to become * unlocked is more or less equivalent with taking it ourselves, don't * bother with the speculative path if mmap_lock is already write-locked * and take the slow path, which takes the lock. */ return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); } static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) { return read_seqcount_retry(&mm->mm_lock_seq, seq); } static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) { #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key lockdep_key; lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); #endif if (reset_refcnt) refcount_set(&vma->vm_refcnt, 0); vma->vm_lock_seq = UINT_MAX; } static inline bool is_vma_writer_only(int refcnt) { /* * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on * a detached vma happens only in vma_mark_detached() and is a rare * case, therefore most of the time there will be no unnecessary wakeup. */ return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; } static inline void vma_refcount_put(struct vm_area_struct *vma) { /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ struct mm_struct *mm = vma->vm_mm; int oldcnt; rwsem_release(&vma->vmlock_dep_map, _RET_IP_); if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { if (is_vma_writer_only(oldcnt - 1)) rcuwait_wake_up(&mm->vma_writer_wait); } } /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. * False locked result is possible if mm_lock_seq overflows or if vma gets * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. */ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) { int oldcnt; /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need * ACQUIRE semantics, because this is just a lockless check whose result * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) return NULL; /* * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. * Acquire fence is required here to avoid reordering against later * vm_lock_seq check and checks inside lock_vma_under_rcu(). */ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, VMA_REF_LIMIT))) { /* return EAGAIN if vma got detached from under us */ return oldcnt ? NULL : ERR_PTR(-EAGAIN); } rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); /* * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are * racing with vma_end_write_all(), we only start reading from the VMA * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { vma_refcount_put(vma); return NULL; } return vma; } /* * Use only while holding mmap read lock which guarantees that locking will not * fail (nobody can concurrently write-lock the vma). vma_start_read() should * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. */ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { int oldcnt; mmap_assert_locked(vma->vm_mm); if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, VMA_REF_LIMIT))) return false; rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); return true; } /* * Use only while holding mmap read lock which guarantees that locking will not * fail (nobody can concurrently write-lock the vma). vma_start_read() should * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. */ static inline bool vma_start_read_locked(struct vm_area_struct *vma) { return vma_start_read_locked_nested(vma, 0); } static inline void vma_end_read(struct vm_area_struct *vma) { vma_refcount_put(vma); } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); /* * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); /* * Begin writing to a VMA. * Exclude concurrent readers under the per-VMA lock until the currently * write-locked mmap_lock is dropped or downgraded. */ static inline void vma_start_write(struct vm_area_struct *vma) { unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; __vma_start_write(vma, mm_lock_seq); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } static inline void vma_assert_locked(struct vm_area_struct *vma) { unsigned int mm_lock_seq; VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && !__is_vma_write_locked(vma, &mm_lock_seq), vma); } /* * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these * assertions should be made either under mmap_write_lock or when the object * has been isolated under mmap_write_lock, ensuring no competing writers. */ static inline void vma_assert_attached(struct vm_area_struct *vma) { WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); } static inline void vma_assert_detached(struct vm_area_struct *vma) { WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); } static inline void vma_mark_attached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_detached(vma); refcount_set_release(&vma->vm_refcnt, 1); } void vma_mark_detached(struct vm_area_struct *vma); struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); #else /* CONFIG_PER_VMA_LOCK */ static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) { return false; } static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) { return true; } static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_assert_attached(struct vm_area_struct *vma) {} static inline void vma_assert_detached(struct vm_area_struct *vma) {} static inline void vma_mark_attached(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { return NULL; } static inline void vma_assert_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); } #endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } static inline int mmap_write_lock_killable(struct mm_struct *mm) { int ret; __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); if (!ret) mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } /* * Drop all currently-held per-VMA locks. * This is called from the mmap_lock implementation directly before releasing * a write-locked mmap_lock (or downgrading it to read-locked). * This should normally NOT be called manually from other places. * If you want to call this manually anyway, keep in mind that this will release * *all* VMA write locks, including ones from further up the stack. */ static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); mm_lock_seqcount_end(mm); } static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); vma_end_write_all(mm); up_write(&mm->mmap_lock); } static inline void mmap_write_downgrade(struct mm_struct *mm) { __mmap_lock_trace_acquire_returned(mm, false, true); vma_end_write_all(mm); downgrade_write(&mm->mmap_lock); } static inline void mmap_read_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, false); down_read(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, true); } static inline int mmap_read_lock_killable(struct mm_struct *mm) { int ret; __mmap_lock_trace_start_locking(mm, false); ret = down_read_killable(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, ret == 0); return ret; } static inline bool mmap_read_trylock(struct mm_struct *mm) { bool ret; __mmap_lock_trace_start_locking(mm, false); ret = down_read_trylock(&mm->mmap_lock) != 0; __mmap_lock_trace_acquire_returned(mm, false, ret); return ret; } static inline void mmap_read_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); up_read(&mm->mmap_lock); } DEFINE_GUARD(mmap_read_lock, struct mm_struct *, mmap_read_lock(_T), mmap_read_unlock(_T)) static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); up_read_non_owner(&mm->mmap_lock); } static inline int mmap_lock_is_contended(struct mm_struct *mm) { return rwsem_is_contended(&mm->mmap_lock); } #endif /* _LINUX_MMAP_LOCK_H */
102 295 296 296 296 296 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * * This file contains the interrupt descriptor management code. Detailed * information is available in Documentation/core-api/genericirq.rst * */ #include <linux/irq.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/maple_tree.h> #include <linux/irqdomain.h> #include <linux/sysfs.h> #include <linux/string_choices.h> #include "internals.h" /* * lockdep: we want to handle all irq_desc locks as a single lock-class: */ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) static int __init irq_affinity_setup(char *str) { alloc_bootmem_cpumask_var(&irq_default_affinity); cpulist_parse(str, irq_default_affinity); /* * Set at least the boot cpu. We don't want to end up with * bugreports caused by random commandline masks */ cpumask_set_cpu(smp_processor_id(), irq_default_affinity); return 1; } __setup("irqaffinity=", irq_affinity_setup); static void __init init_irq_default_affinity(void) { if (!cpumask_available(irq_default_affinity)) zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); if (cpumask_empty(irq_default_affinity)) cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) { } #endif #ifdef CONFIG_SMP static int alloc_masks(struct irq_desc *desc, int node) { if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, GFP_KERNEL, node)) return -ENOMEM; #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity, GFP_KERNEL, node)) { free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) { #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK free_cpumask_var(desc->irq_common_data.effective_affinity); #endif free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif return 0; } static void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { if (!affinity) affinity = irq_default_affinity; cpumask_copy(desc->irq_common_data.affinity, affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif #ifdef CONFIG_NUMA desc->irq_common_data.node = node; #endif } static void free_masks(struct irq_desc *desc) { #ifdef CONFIG_GENERIC_PENDING_IRQ free_cpumask_var(desc->pending_mask); #endif free_cpumask_var(desc->irq_common_data.affinity); #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK free_cpumask_var(desc->irq_common_data.effective_affinity); #endif } #else static inline int alloc_masks(struct irq_desc *desc, int node) { return 0; } static inline void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } static inline void free_masks(struct irq_desc *desc) { } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, const struct cpumask *affinity, struct module *owner) { int cpu; desc->irq_common_data.handler_data = NULL; desc->irq_common_data.msi_desc = NULL; desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; desc->irqs_unhandled = 0; desc->tot_count = 0; desc->name = NULL; desc->owner = owner; for_each_possible_cpu(cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { }; desc_smp_init(desc, node, affinity); } static unsigned int nr_irqs = NR_IRQS; /** * irq_get_nr_irqs() - Number of interrupts supported by the system. */ unsigned int irq_get_nr_irqs(void) { return nr_irqs; } EXPORT_SYMBOL_GPL(irq_get_nr_irqs); /** * irq_set_nr_irqs() - Set the number of interrupts supported by the system. * @nr: New number of interrupts. * * Return: @nr. */ unsigned int irq_set_nr_irqs(unsigned int nr) { nr_irqs = nr; return nr; } EXPORT_SYMBOL_GPL(irq_set_nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU, sparse_irq_lock); static int irq_find_free_area(unsigned int from, unsigned int cnt) { MA_STATE(mas, &sparse_irqs, 0, 0); if (mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt)) return -ENOSPC; return mas.index; } static unsigned int irq_find_at_or_after(unsigned int offset) { unsigned long index = offset; struct irq_desc *desc; guard(rcu)(); desc = mt_find(&sparse_irqs, &index, nr_irqs); return desc ? irq_desc_get_irq(desc) : nr_irqs; } static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) { MA_STATE(mas, &sparse_irqs, irq, irq); WARN_ON(mas_store_gfp(&mas, desc, GFP_KERNEL) != 0); } static void delete_irq_desc(unsigned int irq) { MA_STATE(mas, &sparse_irqs, irq, irq); mas_erase(&mas); } #ifdef CONFIG_SPARSE_IRQ static const struct kobj_type irq_kobj_type; #endif static int init_desc(struct irq_desc *desc, int irq, int node, unsigned int flags, const struct cpumask *affinity, struct module *owner) { desc->kstat_irqs = alloc_percpu(struct irqstat); if (!desc->kstat_irqs) return -ENOMEM; if (alloc_masks(desc, node)) { free_percpu(desc->kstat_irqs); return -ENOMEM; } raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); mutex_init(&desc->request_mutex); init_waitqueue_head(&desc->wait_for_threads); desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); irq_resend_init(desc); #ifdef CONFIG_SPARSE_IRQ kobject_init(&desc->kobj, &irq_kobj_type); init_rcu_head(&desc->rcu); #endif return 0; } #ifdef CONFIG_SPARSE_IRQ static void irq_kobj_release(struct kobject *kobj); #ifdef CONFIG_SYSFS static struct kobject *irq_kobj_base; #define IRQ_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; char *p = ""; int cpu; for_each_possible_cpu(cpu) { unsigned int c = irq_desc_kstat_cpu(desc, cpu); ret += sysfs_emit_at(buf, ret, "%s%u", p, c); p = ","; } ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(per_cpu_count); static ssize_t chip_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.chip && desc->irq_data.chip->name) return sysfs_emit(buf, "%s\n", desc->irq_data.chip->name); return 0; } IRQ_ATTR_RO(chip_name); static ssize_t hwirq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.domain) return sysfs_emit(buf, "%lu\n", desc->irq_data.hwirq); return 0; } IRQ_ATTR_RO(hwirq); static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); return sysfs_emit(buf, "%s\n", irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); } IRQ_ATTR_RO(type); static ssize_t wakeup_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); return sysfs_emit(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); } IRQ_ATTR_RO(wakeup); static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); if (desc->name) return sysfs_emit(buf, "%s\n", desc->name); return 0; } IRQ_ATTR_RO(name); static ssize_t actions_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); struct irqaction *action; ssize_t ret = 0; char *p = ""; scoped_guard(raw_spinlock_irq, &desc->lock) { for_each_action_of_desc(desc, action) { ret += sysfs_emit_at(buf, ret, "%s%s", p, action->name); p = ","; } } if (ret) ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(actions); static struct attribute *irq_attrs[] = { &per_cpu_count_attr.attr, &chip_name_attr.attr, &hwirq_attr.attr, &type_attr.attr, &wakeup_attr.attr, &name_attr.attr, &actions_attr.attr, NULL }; ATTRIBUTE_GROUPS(irq); static const struct kobj_type irq_kobj_type = { .release = irq_kobj_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = irq_groups, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) { if (irq_kobj_base) { /* * Continue even in case of failure as this is nothing * crucial and failures in the late irq_sysfs_init() * cannot be rolled back. */ if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) pr_warn("Failed to add kobject for irq %d\n", irq); else desc->istate |= IRQS_SYSFS; } } static void irq_sysfs_del(struct irq_desc *desc) { /* * Only invoke kobject_del() when kobject_add() was successfully * invoked for the descriptor. This covers both early boot, where * sysfs is not initialized yet, and the case of a failed * kobject_add() invocation. */ if (desc->istate & IRQS_SYSFS) kobject_del(&desc->kobj); } static int __init irq_sysfs_init(void) { struct irq_desc *desc; int irq; /* Prevent concurrent irq alloc/free */ guard(mutex)(&sparse_irq_lock); irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); if (!irq_kobj_base) return -ENOMEM; /* Add the already allocated interrupts */ for_each_irq_desc(irq, desc) irq_sysfs_add(irq, desc); return 0; } postcore_initcall(irq_sysfs_init); #else /* !CONFIG_SYSFS */ static const struct kobj_type irq_kobj_type = { .release = irq_kobj_release, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) {} static void irq_sysfs_del(struct irq_desc *desc) {} #endif /* CONFIG_SYSFS */ struct irq_desc *irq_to_desc(unsigned int irq) { return mtree_load(&sparse_irqs, irq); } #ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE EXPORT_SYMBOL_GPL(irq_to_desc); #endif void irq_lock_sparse(void) { mutex_lock(&sparse_irq_lock); } void irq_unlock_sparse(void) { mutex_unlock(&sparse_irq_lock); } static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, const struct cpumask *affinity, struct module *owner) { struct irq_desc *desc; int ret; desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node); if (!desc) return NULL; ret = init_desc(desc, irq, node, flags, affinity, owner); if (unlikely(ret)) { kfree(desc); return NULL; } return desc; } static void irq_kobj_release(struct kobject *kobj) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); free_masks(desc); free_percpu(desc->kstat_irqs); kfree(desc); } static void delayed_free_desc(struct rcu_head *rhp) { struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); kobject_put(&desc->kobj); } static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); irq_remove_debugfs_entry(desc); unregister_irq_proc(irq, desc); /* * sparse_irq_lock protects also show_interrupts() and * kstat_irq_usr(). Once we deleted the descriptor from the * sparse tree we can free it. Access in proc will fail to * lookup the descriptor. * * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ irq_sysfs_del(desc); delete_irq_desc(irq); /* * We free the descriptor, masks and stat fields via RCU. That * allows demultiplex interrupts to do rcu based management of * the child interrupts. * This also allows us to use rcu in kstat_irqs_usr(). */ call_rcu(&desc->rcu, delayed_free_desc); } static int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct irq_affinity_desc *affinity, struct module *owner) { struct irq_desc *desc; int i; /* Validate affinity mask(s) */ if (affinity) { for (i = 0; i < cnt; i++) { if (cpumask_empty(&affinity[i].mask)) return -EINVAL; } } for (i = 0; i < cnt; i++) { const struct cpumask *mask = NULL; unsigned int flags = 0; if (affinity) { if (affinity->is_managed) { flags = IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN; } flags |= IRQD_AFFINITY_SET; mask = &affinity->mask; node = cpu_to_node(cpumask_first(mask)); affinity++; } desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); irq_add_debugfs_entry(start + i, desc); } return start; err: for (i--; i >= 0; i--) free_desc(start + i); return -ENOMEM; } static bool irq_expand_nr_irqs(unsigned int nr) { if (nr > MAX_SPARSE_IRQS) return false; nr_irqs = nr; return true; } int __init early_irq_init(void) { int i, initcnt, node = first_online_node; struct irq_desc *desc; init_irq_default_affinity(); /* Let arch update nr_irqs and return the nr of preallocated irqs */ initcnt = arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", NR_IRQS, nr_irqs, initcnt); if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS)) nr_irqs = MAX_SPARSE_IRQS; if (WARN_ON(initcnt > MAX_SPARSE_IRQS)) initcnt = MAX_SPARSE_IRQS; if (initcnt > nr_irqs) nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { desc = alloc_desc(i, node, 0, NULL, NULL); irq_insert_desc(i, desc); } return arch_early_irq_init(); } #else /* !CONFIG_SPARSE_IRQ */ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), } }; int __init early_irq_init(void) { int count, i, node = first_online_node; int ret; init_irq_default_affinity(); printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); count = ARRAY_SIZE(irq_desc); for (i = 0; i < count; i++) { ret = init_desc(irq_desc + i, i, node, 0, NULL, NULL); if (unlikely(ret)) goto __free_desc_res; } return arch_early_irq_init(); __free_desc_res: while (--i >= 0) { free_masks(irq_desc + i); free_percpu(irq_desc[i].kstat_irqs); } return ret; } struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? irq_desc + irq : NULL; } EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); scoped_guard(raw_spinlock_irqsave, &desc->lock) desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); delete_irq_desc(irq); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct irq_affinity_desc *affinity, struct module *owner) { u32 i; for (i = 0; i < cnt; i++) { struct irq_desc *desc = irq_to_desc(start + i); desc->owner = owner; irq_insert_desc(start + i, desc); } return start; } static inline bool irq_expand_nr_irqs(unsigned int nr) { return false; } void irq_mark_irq(unsigned int irq) { guard(mutex)(&sparse_irq_lock); irq_insert_desc(irq, irq_desc + irq); } #ifdef CONFIG_GENERIC_IRQ_LEGACY void irq_init_desc(unsigned int irq) { free_desc(irq); } #endif #endif /* !CONFIG_SPARSE_IRQ */ int handle_irq_desc(struct irq_desc *desc) { struct irq_data *data; if (!desc) return -EINVAL; data = irq_desc_get_irq_data(desc); if (WARN_ON_ONCE(!in_hardirq() && irqd_is_handle_enforce_irqctx(data))) return -EPERM; generic_handle_irq_desc(desc); return 0; } /** * generic_handle_irq - Invoke the handler for a particular irq * @irq: The irq number to handle * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an IRQ context with irq regs * initialized. */ int generic_handle_irq(unsigned int irq) { return handle_irq_desc(irq_to_desc(irq)); } EXPORT_SYMBOL_GPL(generic_handle_irq); /** * generic_handle_irq_safe - Invoke the handler for a particular irq from any * context. * @irq: The irq number to handle * * Returns: 0 on success, a negative value on error. * * This function can be called from any context (IRQ or process context). It * will report an error if not invoked from IRQ context and the irq has been * marked to enforce IRQ-context only. */ int generic_handle_irq_safe(unsigned int irq) { unsigned long flags; int ret; local_irq_save(flags); ret = handle_irq_desc(irq_to_desc(irq)); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(generic_handle_irq_safe); #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging * to a domain. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an IRQ context with irq regs * initialized. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) { return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); /** * generic_handle_irq_safe - Invoke the handler for a HW irq belonging * to a domain from any context. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, a negative value on error. * * This function can be called from any context (IRQ or process * context). If the interrupt is marked as 'enforce IRQ-context only' then * the function must be invoked from hard interrupt context. */ int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq) { unsigned long flags; int ret; local_irq_save(flags); ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq)); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe); /** * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging * to a domain. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an NMI context with irq regs * initialized. **/ int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq) { WARN_ON_ONCE(!in_nmi()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } #endif /* Dynamic interrupt handling */ /** * irq_free_descs - free irq descriptors * @from: Start of descriptor range * @cnt: Number of consecutive irqs to free */ void irq_free_descs(unsigned int from, unsigned int cnt) { int i; if (from >= nr_irqs || (from + cnt) > nr_irqs) return; guard(mutex)(&sparse_irq_lock); for (i = 0; i < cnt; i++) free_desc(from + i); } EXPORT_SYMBOL_GPL(irq_free_descs); /** * __irq_alloc_descs - allocate and initialize a range of irq descriptors * @irq: Allocate for specific irq number if irq >= 0 * @from: Start the search from this irq number * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) * @affinity: Optional pointer to an affinity mask array of size @cnt which * hints where the irq descriptors should be allocated and which * default affinities to use * * Returns the first irq number or error code */ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity) { int start; if (!cnt) return -EINVAL; if (irq >= 0) { if (from > irq) return -EINVAL; from = irq; } else { /* * For interrupts which are freely allocated the * architecture can force a lower bound to the @from * argument. x86 uses this to exclude the GSI space. */ from = arch_dynirq_lower_bound(from); } guard(mutex)(&sparse_irq_lock); start = irq_find_free_area(from, cnt); if (irq >=0 && start != irq) return -EEXIST; if (start + cnt > nr_irqs) { if (!irq_expand_nr_irqs(start + cnt)) return -ENOMEM; } return alloc_descs(start, cnt, node, affinity, owner); } EXPORT_SYMBOL_GPL(__irq_alloc_descs); /** * irq_get_next_irq - get next allocated irq number * @offset: where to start the search * * Returns next irq number after offset or nr_irqs if none is found. */ unsigned int irq_get_next_irq(unsigned int offset) { return irq_find_at_or_after(offset); } struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check) { struct irq_desc *desc; desc = irq_to_desc(irq); if (!desc) return NULL; if (check & _IRQ_DESC_CHECK) { if ((check & _IRQ_DESC_PERCPU) && !irq_settings_is_per_cpu_devid(desc)) return NULL; if (!(check & _IRQ_DESC_PERCPU) && irq_settings_is_per_cpu_devid(desc)) return NULL; } if (bus) chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, *flags); return desc; } void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) __releases(&desc->lock) { raw_spin_unlock_irqrestore(&desc->lock, flags); if (bus) chip_bus_sync_unlock(desc); } int irq_set_percpu_devid_partition(unsigned int irq, const struct cpumask *affinity) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || desc->percpu_enabled) return -EINVAL; desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL); if (!desc->percpu_enabled) return -ENOMEM; desc->percpu_affinity = affinity ? : cpu_possible_mask; irq_set_percpu_devid_flags(irq); return 0; } int irq_set_percpu_devid(unsigned int irq) { return irq_set_percpu_devid_partition(irq, NULL); } int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->percpu_enabled) return -EINVAL; if (affinity) cpumask_copy(affinity, desc->percpu_affinity); return 0; } EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition); void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq_to_desc(irq)); } /** * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu * @irq: The interrupt number * @cpu: The cpu number * * Returns the sum of interrupt counts on @cpu since boot for * @irq. The caller must ensure that the interrupt is not removed * concurrently. */ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { unsigned int sum = 0; int cpu; if (!irq_settings_is_per_cpu_devid(desc) && !irq_settings_is_per_cpu(desc) && !irq_is_nmi(desc)) return data_race(desc->tot_count); for_each_cpu(cpu, cpumask) sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu)); return sum; } static unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->kstat_irqs) return 0; return kstat_irqs_desc(desc, cpu_possible_mask); } #ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT void kstat_snapshot_irqs(void) { struct irq_desc *desc; unsigned int irq; for_each_irq_desc(irq, desc) { if (!desc->kstat_irqs) continue; this_cpu_write(desc->kstat_irqs->ref, this_cpu_read(desc->kstat_irqs->cnt)); } } unsigned int kstat_get_irq_since_snapshot(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->kstat_irqs) return 0; return this_cpu_read(desc->kstat_irqs->cnt) - this_cpu_read(desc->kstat_irqs->ref); } #endif /** * kstat_irqs_usr - Get the statistics for an interrupt from thread context * @irq: The interrupt number * * Returns the sum of interrupt counts on all cpus since boot for @irq. * * It uses rcu to protect the access since a concurrent removal of an * interrupt descriptor is observing an rcu grace period before * delayed_free_desc()/irq_kobj_release(). */ unsigned int kstat_irqs_usr(unsigned int irq) { unsigned int sum; rcu_read_lock(); sum = kstat_irqs(irq); rcu_read_unlock(); return sum; } #ifdef CONFIG_LOCKDEP void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { lockdep_set_class(&desc->lock, lock_class); lockdep_set_class(&desc->request_mutex, request_class); } } EXPORT_SYMBOL_GPL(__irq_set_lockdep_class); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #ifndef __IPVLAN_H #define __IPVLAN_H #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/if_link.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/inetdevice.h> #include <linux/netfilter.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #define IPVLAN_DRV "ipvlan" #define IPV_DRV_VER "0.1" #define IPVLAN_HASH_SIZE (1 << BITS_PER_BYTE) #define IPVLAN_HASH_MASK (IPVLAN_HASH_SIZE - 1) #define IPVLAN_MAC_FILTER_BITS 8 #define IPVLAN_MAC_FILTER_SIZE (1 << IPVLAN_MAC_FILTER_BITS) #define IPVLAN_MAC_FILTER_MASK (IPVLAN_MAC_FILTER_SIZE - 1) #define IPVLAN_QBACKLOG_LIMIT 1000 typedef enum { IPVL_IPV6 = 0, IPVL_ICMPV6, IPVL_IPV4, IPVL_ARP, } ipvl_hdr_type; struct ipvl_pcpu_stats { u64_stats_t rx_pkts; u64_stats_t rx_bytes; u64_stats_t rx_mcast; u64_stats_t tx_pkts; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_errs; u32 tx_drps; }; struct ipvl_port; struct ipvl_dev { struct net_device *dev; struct list_head pnode; struct ipvl_port *port; struct net_device *phy_dev; struct list_head addrs; struct ipvl_pcpu_stats __percpu *pcpu_stats; DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE); netdev_features_t sfeatures; u32 msg_enable; spinlock_t addrs_lock; }; struct ipvl_addr { struct ipvl_dev *master; /* Back pointer to master */ union { struct in6_addr ip6; /* IPv6 address on logical interface */ struct in_addr ip4; /* IPv4 address on logical interface */ } ipu; #define ip6addr ipu.ip6 #define ip4addr ipu.ip4 struct hlist_node hlnode; /* Hash-table linkage */ struct list_head anode; /* logical-interface linkage */ ipvl_hdr_type atype; struct rcu_head rcu; }; struct ipvl_port { struct net_device *dev; possible_net_t pnet; struct hlist_head hlhead[IPVLAN_HASH_SIZE]; struct list_head ipvlans; u16 mode; u16 flags; u16 dev_id_start; struct work_struct wq; struct sk_buff_head backlog; int count; struct ida ida; netdevice_tracker dev_tracker; }; struct ipvl_skb_cb { bool tx_pkt; }; #define IPVL_SKB_CB(_skb) ((struct ipvl_skb_cb *)&((_skb)->cb[0])) static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d) { return rcu_dereference(d->rx_handler_data); } static inline struct ipvl_port *ipvlan_port_get_rcu_bh(const struct net_device *d) { return rcu_dereference_bh(d->rx_handler_data); } static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d) { return rtnl_dereference(d->rx_handler_data); } static inline bool ipvlan_is_private(const struct ipvl_port *port) { return !!(port->flags & IPVLAN_F_PRIVATE); } static inline void ipvlan_mark_private(struct ipvl_port *port) { port->flags |= IPVLAN_F_PRIVATE; } static inline void ipvlan_clear_private(struct ipvl_port *port) { port->flags &= ~IPVLAN_F_PRIVATE; } static inline bool ipvlan_is_vepa(const struct ipvl_port *port) { return !!(port->flags & IPVLAN_F_VEPA); } static inline void ipvlan_mark_vepa(struct ipvl_port *port) { port->flags |= IPVLAN_F_VEPA; } static inline void ipvlan_clear_vepa(struct ipvl_port *port) { port->flags &= ~IPVLAN_F_VEPA; } void ipvlan_init_secret(void); unsigned int ipvlan_mac_hash(const unsigned char *addr); rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb); void ipvlan_process_multicast(struct work_struct *work); int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev); void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr); struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); void ipvlan_ht_addr_del(struct ipvl_addr *addr); struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, int addr_type, bool use_dest); void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type); void ipvlan_count_rx(const struct ipvl_dev *ipvlan, unsigned int len, bool success, bool mcast); int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack); void ipvlan_link_delete(struct net_device *dev, struct list_head *head); void ipvlan_link_setup(struct net_device *dev); int ipvlan_link_register(struct rtnl_link_ops *ops); #ifdef CONFIG_IPVLAN_L3S int ipvlan_l3s_register(struct ipvl_port *port); void ipvlan_l3s_unregister(struct ipvl_port *port); void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet); int ipvlan_l3s_init(void); void ipvlan_l3s_cleanup(void); #else static inline int ipvlan_l3s_register(struct ipvl_port *port) { return -ENOTSUPP; } static inline void ipvlan_l3s_unregister(struct ipvl_port *port) { } static inline void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet) { } static inline int ipvlan_l3s_init(void) { return 0; } static inline void ipvlan_l3s_cleanup(void) { } #endif /* CONFIG_IPVLAN_L3S */ static inline bool netif_is_ipvlan_port(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == ipvlan_handle_frame; } #endif /* __IPVLAN_H */
1565 2 1531 54 17 67 1476 69 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 /* SPDX-License-Identifier: GPL-2.0 */ /* * Wrapper functions for accessing the file_struct fd array. */ #ifndef __LINUX_FILE_H #define __LINUX_FILE_H #include <linux/compiler.h> #include <linux/types.h> #include <linux/posix_types.h> #include <linux/errno.h> #include <linux/cleanup.h> #include <linux/err.h> struct file; extern void fput(struct file *); struct file_operations; struct task_struct; struct vfsmount; struct dentry; struct inode; struct path; extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_clone(struct file *, int flags, const struct file_operations *); /* either a reference to struct file + flags * (cloned vs. borrowed, pos locked), with * flags stored in lower bits of value, * or empty (represented by 0). */ struct fd { unsigned long word; }; #define FDPUT_FPUT 1 #define FDPUT_POS_UNLOCK 2 #define fd_file(f) ((struct file *)((f).word & ~(FDPUT_FPUT|FDPUT_POS_UNLOCK))) static inline bool fd_empty(struct fd f) { return unlikely(!f.word); } #define EMPTY_FD (struct fd){0} static inline struct fd BORROWED_FD(struct file *f) { return (struct fd){(unsigned long)f}; } static inline struct fd CLONED_FD(struct file *f) { return (struct fd){(unsigned long)f | FDPUT_FPUT}; } static inline void fdput(struct fd fd) { if (unlikely(fd.word & FDPUT_FPUT)) fput(fd_file(fd)); } extern struct file *fget(unsigned int fd); extern struct file *fget_raw(unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd); extern struct file *fget_task_next(struct task_struct *task, unsigned int *fd); extern void __f_unlock_pos(struct file *); struct fd fdget(unsigned int fd); struct fd fdget_raw(unsigned int fd); struct fd fdget_pos(unsigned int fd); static inline void fdput_pos(struct fd f) { if (f.word & FDPUT_POS_UNLOCK) __f_unlock_pos(fd_file(f)); fdput(f); } DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd) DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd) DEFINE_CLASS(fd_pos, struct fd, fdput_pos(_T), fdget_pos(fd), int fd) extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); extern int replace_fd(unsigned fd, struct file *file, unsigned flags); extern void set_close_on_exec(unsigned int fd, int flag); extern bool get_close_on_exec(unsigned int fd); extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile); extern int get_unused_fd_flags(unsigned flags); extern void put_unused_fd(unsigned int fd); DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), get_unused_fd_flags(flags), unsigned flags) DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T)) /* * take_fd() will take care to set @fd to -EBADF ensuring that * CLASS(get_unused_fd) won't call put_unused_fd(). This makes it * easier to rely on CLASS(get_unused_fd): * * struct file *f; * * CLASS(get_unused_fd, fd)(O_CLOEXEC); * if (fd < 0) * return fd; * * f = dentry_open(&path, O_RDONLY, current_cred()); * if (IS_ERR(f)) * return PTR_ERR(f); * * fd_install(fd, f); * return take_fd(fd); */ #define take_fd(fd) __get_and_null(fd, -EBADF) extern void fd_install(unsigned int fd, struct file *file); int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; #endif /* __LINUX_FILE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions of the Internet Protocol. * * Version: @(#)in.h 1.0.1 04/21/93 * * Authors: Original taken from the GNU Project <netinet/in.h> file. * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IN_H #define _LINUX_IN_H #include <linux/errno.h> #include <uapi/linux/in.h> static inline int proto_ports_offset(int proto) { switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_DCCP: case IPPROTO_ESP: /* SPI */ case IPPROTO_SCTP: case IPPROTO_UDPLITE: return 0; case IPPROTO_AH: /* SPI */ return 4; default: return -EINVAL; } } static inline bool ipv4_is_loopback(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x7f000000); } static inline bool ipv4_is_multicast(__be32 addr) { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); } static inline bool ipv4_is_local_multicast(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xe0000000); } static inline bool ipv4_is_lbcast(__be32 addr) { /* limited broadcast */ return addr == htonl(INADDR_BROADCAST); } static inline bool ipv4_is_all_snoopers(__be32 addr) { return addr == htonl(INADDR_ALLSNOOPERS_GROUP); } static inline bool ipv4_is_zeronet(__be32 addr) { return (addr == 0); } /* Special-Use IPv4 Addresses (RFC3330) */ static inline bool ipv4_is_private_10(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x0a000000); } static inline bool ipv4_is_private_172(__be32 addr) { return (addr & htonl(0xfff00000)) == htonl(0xac100000); } static inline bool ipv4_is_private_192(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xc0a80000); } static inline bool ipv4_is_linklocal_169(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xa9fe0000); } static inline bool ipv4_is_anycast_6to4(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0586300); } static inline bool ipv4_is_test_192(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0000200); } static inline bool ipv4_is_test_198(__be32 addr) { return (addr & htonl(0xfffe0000)) == htonl(0xc6120000); } #endif /* _LINUX_IN_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 // SPDX-License-Identifier: GPL-2.0-only #include <linux/crc32.h> #include <linux/linkage.h> #include <linux/module.h> #include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/neon.h> #include <asm/simd.h> #include <crypto/internal/simd.h> // The minimum input length to consider the 4-way interleaved code path static const size_t min_len = 1024; asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len); u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) { if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32_le_base(crc, p, len); if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { kernel_neon_begin(); crc = crc32_le_arm64_4way(crc, p, len); kernel_neon_end(); p += round_down(len, 64); len %= 64; if (!len) return crc; } return crc32_le_arm64(crc, p, len); } EXPORT_SYMBOL(crc32_le_arch); u32 crc32c_arch(u32 crc, const u8 *p, size_t len) { if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32c_base(crc, p, len); if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { kernel_neon_begin(); crc = crc32c_le_arm64_4way(crc, p, len); kernel_neon_end(); p += round_down(len, 64); len %= 64; if (!len) return crc; } return crc32c_le_arm64(crc, p, len); } EXPORT_SYMBOL(crc32c_arch); u32 crc32_be_arch(u32 crc, const u8 *p, size_t len) { if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32_be_base(crc, p, len); if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { kernel_neon_begin(); crc = crc32_be_arm64_4way(crc, p, len); kernel_neon_end(); p += round_down(len, 64); len %= 64; if (!len) return crc; } return crc32_be_arm64(crc, p, len); } EXPORT_SYMBOL(crc32_be_arch); u32 crc32_optimizations(void) { if (alternative_has_cap_likely(ARM64_HAS_CRC32)) return CRC32_LE_OPTIMIZATION | CRC32_BE_OPTIMIZATION | CRC32C_OPTIMIZATION; return 0; } EXPORT_SYMBOL(crc32_optimizations); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("arm64-optimized CRC32 functions");
6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 // SPDX-License-Identifier: GPL-2.0 #include <linux/debugfs.h> #include "netdevsim.h" #define NSIM_DEV_HWSTATS_TRAFFIC_MS 100 static struct list_head * nsim_dev_hwstats_get_list_head(struct nsim_dev_hwstats *hwstats, enum netdev_offload_xstats_type type) { switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return &hwstats->l3_list; } WARN_ON_ONCE(1); return NULL; } static void nsim_dev_hwstats_traffic_bump(struct nsim_dev_hwstats *hwstats, enum netdev_offload_xstats_type type) { struct nsim_dev_hwstats_netdev *hwsdev; struct list_head *hwsdev_list; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type); if (WARN_ON(!hwsdev_list)) return; list_for_each_entry(hwsdev, hwsdev_list, list) { if (hwsdev->enabled) { hwsdev->stats.rx_packets += 1; hwsdev->stats.tx_packets += 2; hwsdev->stats.rx_bytes += 100; hwsdev->stats.tx_bytes += 300; } } } static void nsim_dev_hwstats_traffic_work(struct work_struct *work) { struct nsim_dev_hwstats *hwstats; hwstats = container_of(work, struct nsim_dev_hwstats, traffic_dw.work); mutex_lock(&hwstats->hwsdev_list_lock); nsim_dev_hwstats_traffic_bump(hwstats, NETDEV_OFFLOAD_XSTATS_TYPE_L3); mutex_unlock(&hwstats->hwsdev_list_lock); schedule_delayed_work(&hwstats->traffic_dw, msecs_to_jiffies(NSIM_DEV_HWSTATS_TRAFFIC_MS)); } static struct nsim_dev_hwstats_netdev * nsim_dev_hwslist_find_hwsdev(struct list_head *hwsdev_list, int ifindex) { struct nsim_dev_hwstats_netdev *hwsdev; list_for_each_entry(hwsdev, hwsdev_list, list) { if (hwsdev->netdev->ifindex == ifindex) return hwsdev; } return NULL; } static int nsim_dev_hwsdev_enable(struct nsim_dev_hwstats_netdev *hwsdev, struct netlink_ext_ack *extack) { if (hwsdev->fail_enable) { hwsdev->fail_enable = false; NL_SET_ERR_MSG_MOD(extack, "Stats enablement set to fail"); return -ECANCELED; } hwsdev->enabled = true; return 0; } static void nsim_dev_hwsdev_disable(struct nsim_dev_hwstats_netdev *hwsdev) { hwsdev->enabled = false; memset(&hwsdev->stats, 0, sizeof(hwsdev->stats)); } static int nsim_dev_hwsdev_report_delta(struct nsim_dev_hwstats_netdev *hwsdev, struct netdev_notifier_offload_xstats_info *info) { netdev_offload_xstats_report_delta(info->report_delta, &hwsdev->stats); memset(&hwsdev->stats, 0, sizeof(hwsdev->stats)); return 0; } static void nsim_dev_hwsdev_report_used(struct nsim_dev_hwstats_netdev *hwsdev, struct netdev_notifier_offload_xstats_info *info) { if (hwsdev->enabled) netdev_offload_xstats_report_used(info->report_used); } static int nsim_dev_hwstats_event_off_xstats(struct nsim_dev_hwstats *hwstats, struct net_device *dev, unsigned long event, void *ptr) { struct netdev_notifier_offload_xstats_info *info; struct nsim_dev_hwstats_netdev *hwsdev; struct list_head *hwsdev_list; int err = 0; info = ptr; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, info->type); if (!hwsdev_list) return 0; mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, dev->ifindex); if (!hwsdev) goto out; switch (event) { case NETDEV_OFFLOAD_XSTATS_ENABLE: err = nsim_dev_hwsdev_enable(hwsdev, info->info.extack); break; case NETDEV_OFFLOAD_XSTATS_DISABLE: nsim_dev_hwsdev_disable(hwsdev); break; case NETDEV_OFFLOAD_XSTATS_REPORT_USED: nsim_dev_hwsdev_report_used(hwsdev, info); break; case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: err = nsim_dev_hwsdev_report_delta(hwsdev, info); break; } out: mutex_unlock(&hwstats->hwsdev_list_lock); return err; } static void nsim_dev_hwsdev_fini(struct nsim_dev_hwstats_netdev *hwsdev) { dev_put(hwsdev->netdev); kfree(hwsdev); } static void __nsim_dev_hwstats_event_unregister(struct nsim_dev_hwstats *hwstats, struct net_device *dev, enum netdev_offload_xstats_type type) { struct nsim_dev_hwstats_netdev *hwsdev; struct list_head *hwsdev_list; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type); if (WARN_ON(!hwsdev_list)) return; hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, dev->ifindex); if (!hwsdev) return; list_del(&hwsdev->list); nsim_dev_hwsdev_fini(hwsdev); } static void nsim_dev_hwstats_event_unregister(struct nsim_dev_hwstats *hwstats, struct net_device *dev) { mutex_lock(&hwstats->hwsdev_list_lock); __nsim_dev_hwstats_event_unregister(hwstats, dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); mutex_unlock(&hwstats->hwsdev_list_lock); } static int nsim_dev_hwstats_event(struct nsim_dev_hwstats *hwstats, struct net_device *dev, unsigned long event, void *ptr) { switch (event) { case NETDEV_OFFLOAD_XSTATS_ENABLE: case NETDEV_OFFLOAD_XSTATS_DISABLE: case NETDEV_OFFLOAD_XSTATS_REPORT_USED: case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: return nsim_dev_hwstats_event_off_xstats(hwstats, dev, event, ptr); case NETDEV_UNREGISTER: nsim_dev_hwstats_event_unregister(hwstats, dev); break; } return 0; } static int nsim_dev_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nsim_dev_hwstats *hwstats; int err = 0; hwstats = container_of(nb, struct nsim_dev_hwstats, netdevice_nb); err = nsim_dev_hwstats_event(hwstats, dev, event, ptr); if (err) return notifier_from_errno(err); return NOTIFY_OK; } static int nsim_dev_hwstats_enable_ifindex(struct nsim_dev_hwstats *hwstats, int ifindex, enum netdev_offload_xstats_type type, struct list_head *hwsdev_list) { struct nsim_dev_hwstats_netdev *hwsdev; struct nsim_dev *nsim_dev; struct net_device *netdev; bool notify = false; struct net *net; int err = 0; nsim_dev = container_of(hwstats, struct nsim_dev, hwstats); net = nsim_dev_net(nsim_dev); rtnl_lock(); mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex); if (hwsdev) goto out_unlock_list; netdev = dev_get_by_index(net, ifindex); if (!netdev) { err = -ENODEV; goto out_unlock_list; } hwsdev = kzalloc(sizeof(*hwsdev), GFP_KERNEL); if (!hwsdev) { err = -ENOMEM; goto out_put_netdev; } hwsdev->netdev = netdev; list_add_tail(&hwsdev->list, hwsdev_list); mutex_unlock(&hwstats->hwsdev_list_lock); if (netdev_offload_xstats_enabled(netdev, type)) { nsim_dev_hwsdev_enable(hwsdev, NULL); notify = true; } if (notify) rtnl_offload_xstats_notify(netdev); rtnl_unlock(); return err; out_put_netdev: dev_put(netdev); out_unlock_list: mutex_unlock(&hwstats->hwsdev_list_lock); rtnl_unlock(); return err; } static int nsim_dev_hwstats_disable_ifindex(struct nsim_dev_hwstats *hwstats, int ifindex, enum netdev_offload_xstats_type type, struct list_head *hwsdev_list) { struct nsim_dev_hwstats_netdev *hwsdev; int err = 0; rtnl_lock(); mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex); if (hwsdev) list_del(&hwsdev->list); mutex_unlock(&hwstats->hwsdev_list_lock); if (!hwsdev) { err = -ENOENT; goto unlock_out; } if (netdev_offload_xstats_enabled(hwsdev->netdev, type)) { netdev_offload_xstats_push_delta(hwsdev->netdev, type, &hwsdev->stats); rtnl_offload_xstats_notify(hwsdev->netdev); } nsim_dev_hwsdev_fini(hwsdev); unlock_out: rtnl_unlock(); return err; } static int nsim_dev_hwstats_fail_ifindex(struct nsim_dev_hwstats *hwstats, int ifindex, enum netdev_offload_xstats_type type, struct list_head *hwsdev_list) { struct nsim_dev_hwstats_netdev *hwsdev; int err = 0; mutex_lock(&hwstats->hwsdev_list_lock); hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex); if (!hwsdev) { err = -ENOENT; goto err_hwsdev_list_unlock; } hwsdev->fail_enable = true; err_hwsdev_list_unlock: mutex_unlock(&hwstats->hwsdev_list_lock); return err; } enum nsim_dev_hwstats_do { NSIM_DEV_HWSTATS_DO_DISABLE, NSIM_DEV_HWSTATS_DO_ENABLE, NSIM_DEV_HWSTATS_DO_FAIL, }; struct nsim_dev_hwstats_fops { enum nsim_dev_hwstats_do action; enum netdev_offload_xstats_type type; }; static ssize_t nsim_dev_hwstats_do_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct nsim_dev_hwstats *hwstats = file->private_data; const struct nsim_dev_hwstats_fops *hwsfops; struct list_head *hwsdev_list; int ifindex; int err; hwsfops = debugfs_get_aux(file); err = kstrtoint_from_user(data, count, 0, &ifindex); if (err) return err; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, hwsfops->type); if (WARN_ON(!hwsdev_list)) return -EINVAL; switch (hwsfops->action) { case NSIM_DEV_HWSTATS_DO_DISABLE: err = nsim_dev_hwstats_disable_ifindex(hwstats, ifindex, hwsfops->type, hwsdev_list); break; case NSIM_DEV_HWSTATS_DO_ENABLE: err = nsim_dev_hwstats_enable_ifindex(hwstats, ifindex, hwsfops->type, hwsdev_list); break; case NSIM_DEV_HWSTATS_DO_FAIL: err = nsim_dev_hwstats_fail_ifindex(hwstats, ifindex, hwsfops->type, hwsdev_list); break; } if (err) return err; return count; } static struct debugfs_short_fops debugfs_ops = { .write = nsim_dev_hwstats_do_write, .llseek = generic_file_llseek, }; #define NSIM_DEV_HWSTATS_FOPS(ACTION, TYPE) \ { \ .action = ACTION, \ .type = TYPE, \ } static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_disable_fops = NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_DISABLE, NETDEV_OFFLOAD_XSTATS_TYPE_L3); static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_enable_fops = NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_ENABLE, NETDEV_OFFLOAD_XSTATS_TYPE_L3); static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_fail_fops = NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_FAIL, NETDEV_OFFLOAD_XSTATS_TYPE_L3); #undef NSIM_DEV_HWSTATS_FOPS int nsim_dev_hwstats_init(struct nsim_dev *nsim_dev) { struct nsim_dev_hwstats *hwstats = &nsim_dev->hwstats; struct net *net = nsim_dev_net(nsim_dev); int err; mutex_init(&hwstats->hwsdev_list_lock); INIT_LIST_HEAD(&hwstats->l3_list); hwstats->netdevice_nb.notifier_call = nsim_dev_netdevice_event; err = register_netdevice_notifier_net(net, &hwstats->netdevice_nb); if (err) goto err_mutex_destroy; hwstats->ddir = debugfs_create_dir("hwstats", nsim_dev->ddir); if (IS_ERR(hwstats->ddir)) { err = PTR_ERR(hwstats->ddir); goto err_unregister_notifier; } hwstats->l3_ddir = debugfs_create_dir("l3", hwstats->ddir); if (IS_ERR(hwstats->l3_ddir)) { err = PTR_ERR(hwstats->l3_ddir); goto err_remove_hwstats_recursive; } debugfs_create_file_aux("enable_ifindex", 0200, hwstats->l3_ddir, hwstats, &nsim_dev_hwstats_l3_enable_fops, &debugfs_ops); debugfs_create_file_aux("disable_ifindex", 0200, hwstats->l3_ddir, hwstats, &nsim_dev_hwstats_l3_disable_fops, &debugfs_ops); debugfs_create_file_aux("fail_next_enable", 0200, hwstats->l3_ddir, hwstats, &nsim_dev_hwstats_l3_fail_fops, &debugfs_ops); INIT_DELAYED_WORK(&hwstats->traffic_dw, &nsim_dev_hwstats_traffic_work); schedule_delayed_work(&hwstats->traffic_dw, msecs_to_jiffies(NSIM_DEV_HWSTATS_TRAFFIC_MS)); return 0; err_remove_hwstats_recursive: debugfs_remove_recursive(hwstats->ddir); err_unregister_notifier: unregister_netdevice_notifier_net(net, &hwstats->netdevice_nb); err_mutex_destroy: mutex_destroy(&hwstats->hwsdev_list_lock); return err; } static void nsim_dev_hwsdev_list_wipe(struct nsim_dev_hwstats *hwstats, enum netdev_offload_xstats_type type) { struct nsim_dev_hwstats_netdev *hwsdev, *tmp; struct list_head *hwsdev_list; hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type); if (WARN_ON(!hwsdev_list)) return; mutex_lock(&hwstats->hwsdev_list_lock); list_for_each_entry_safe(hwsdev, tmp, hwsdev_list, list) { list_del(&hwsdev->list); nsim_dev_hwsdev_fini(hwsdev); } mutex_unlock(&hwstats->hwsdev_list_lock); } void nsim_dev_hwstats_exit(struct nsim_dev *nsim_dev) { struct nsim_dev_hwstats *hwstats = &nsim_dev->hwstats; struct net *net = nsim_dev_net(nsim_dev); cancel_delayed_work_sync(&hwstats->traffic_dw); debugfs_remove_recursive(hwstats->ddir); unregister_netdevice_notifier_net(net, &hwstats->netdevice_nb); nsim_dev_hwsdev_list_wipe(hwstats, NETDEV_OFFLOAD_XSTATS_TYPE_L3); mutex_destroy(&hwstats->hwsdev_list_lock); }
21 428 197 3 232 10 9 46 44 3 31 32 31 10 294 16 337 82 83 82 80 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> * * Derived from arch/arm/include/asm/kvm_host.h: * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #ifndef __ARM64_KVM_HOST_H__ #define __ARM64_KVM_HOST_H__ #include <linux/arm-smccc.h> #include <linux/bitmap.h> #include <linux/types.h> #include <linux/jump_label.h> #include <linux/kvm_types.h> #include <linux/maple_tree.h> #include <linux/percpu.h> #include <linux/psci.h> #include <asm/arch_gicv3.h> #include <asm/barrier.h> #include <asm/cpufeature.h> #include <asm/cputype.h> #include <asm/daifflags.h> #include <asm/fpsimd.h> #include <asm/kvm.h> #include <asm/kvm_asm.h> #include <asm/vncr_mapping.h> #define __KVM_HAVE_ARCH_INTC_INITIALIZED #define KVM_HALT_POLL_NS_DEFAULT 500000 #include <kvm/arm_vgic.h> #include <kvm/arm_arch_timer.h> #include <kvm/arm_pmu.h> #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS #define KVM_VCPU_MAX_FEATURES 9 #define KVM_VCPU_VALID_FEATURES (BIT(KVM_VCPU_MAX_FEATURES) - 1) #define KVM_REQ_SLEEP \ KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) #define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2) #define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) #define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) #define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) #define KVM_REQ_SUSPEND KVM_ARCH_REQ(6) #define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7) #define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8) #define KVM_REQ_GUEST_HYP_IRQ_PENDING KVM_ARCH_REQ(9) #define KVM_REQ_MAP_L1_VNCR_EL2 KVM_ARCH_REQ(10) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) #define KVM_HAVE_MMU_RWLOCK /* * Mode of operation configurable with kvm-arm.mode early param. * See Documentation/admin-guide/kernel-parameters.txt for more information. */ enum kvm_mode { KVM_MODE_DEFAULT, KVM_MODE_PROTECTED, KVM_MODE_NV, KVM_MODE_NONE, }; #ifdef CONFIG_KVM enum kvm_mode kvm_get_mode(void); #else static inline enum kvm_mode kvm_get_mode(void) { return KVM_MODE_NONE; }; #endif extern unsigned int __ro_after_init kvm_sve_max_vl; extern unsigned int __ro_after_init kvm_host_sve_max_vl; int __init kvm_arm_init_sve(void); u32 __attribute_const__ kvm_target_cpu(void); void kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); struct kvm_hyp_memcache { phys_addr_t head; unsigned long nr_pages; struct pkvm_mapping *mapping; /* only used from EL1 */ #define HYP_MEMCACHE_ACCOUNT_STAGE2 BIT(1) unsigned long flags; }; static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc, phys_addr_t *p, phys_addr_t (*to_pa)(void *virt)) { *p = mc->head; mc->head = to_pa(p); mc->nr_pages++; } static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc, void *(*to_va)(phys_addr_t phys)) { phys_addr_t *p = to_va(mc->head & PAGE_MASK); if (!mc->nr_pages) return NULL; mc->head = *p; mc->nr_pages--; return p; } static inline int __topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, void *(*alloc_fn)(void *arg), phys_addr_t (*to_pa)(void *virt), void *arg) { while (mc->nr_pages < min_pages) { phys_addr_t *p = alloc_fn(arg); if (!p) return -ENOMEM; push_hyp_memcache(mc, p, to_pa); } return 0; } static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc, void (*free_fn)(void *virt, void *arg), void *(*to_va)(phys_addr_t phys), void *arg) { while (mc->nr_pages) free_fn(pop_hyp_memcache(mc, to_va), arg); } void free_hyp_memcache(struct kvm_hyp_memcache *mc); int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages); struct kvm_vmid { atomic64_t id; }; struct kvm_s2_mmu { struct kvm_vmid vmid; /* * stage2 entry level table * * Two kvm_s2_mmu structures in the same VM can point to the same * pgd here. This happens when running a guest using a * translation regime that isn't affected by its own stage-2 * translation, such as a non-VHE hypervisor running at vEL2, or * for vEL1/EL0 with vHCR_EL2.VM == 0. In that case, we use the * canonical stage-2 page tables. */ phys_addr_t pgd_phys; struct kvm_pgtable *pgt; /* * VTCR value used on the host. For a non-NV guest (or a NV * guest that runs in a context where its own S2 doesn't * apply), its T0SZ value reflects that of the IPA size. * * For a shadow S2 MMU, T0SZ reflects the PARange exposed to * the guest. */ u64 vtcr; /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; #define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0 /* * Memory cache used to split * KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It * is used to allocate stage2 page tables while splitting huge * pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE * influences both the capacity of the split page cache, and * how often KVM reschedules. Be wary of raising CHUNK_SIZE * too high. * * Protected by kvm->slots_lock. */ struct kvm_mmu_memory_cache split_page_cache; uint64_t split_page_chunk_size; struct kvm_arch *arch; /* * For a shadow stage-2 MMU, the virtual vttbr used by the * host to parse the guest S2. * This either contains: * - the virtual VTTBR programmed by the guest hypervisor with * CnP cleared * - The value 1 (VMID=0, BADDR=0, CnP=1) if invalid * * We also cache the full VTCR which gets used for TLB invalidation, * taking the ARM ARM's "Any of the bits in VTCR_EL2 are permitted * to be cached in a TLB" to the letter. */ u64 tlb_vttbr; u64 tlb_vtcr; /* * true when this represents a nested context where virtual * HCR_EL2.VM == 1 */ bool nested_stage2_enabled; /* * true when this MMU needs to be unmapped before being used for a new * purpose. */ bool pending_unmap; /* * 0: Nobody is currently using this, check vttbr for validity * >0: Somebody is actively using this. */ atomic_t refcnt; }; struct kvm_arch_memory_slot { }; /** * struct kvm_smccc_features: Descriptor of the hypercall services exposed to the guests * * @std_bmap: Bitmap of standard secure service calls * @std_hyp_bmap: Bitmap of standard hypervisor service calls * @vendor_hyp_bmap: Bitmap of vendor specific hypervisor service calls */ struct kvm_smccc_features { unsigned long std_bmap; unsigned long std_hyp_bmap; unsigned long vendor_hyp_bmap; /* Function numbers 0-63 */ unsigned long vendor_hyp_bmap_2; /* Function numbers 64-127 */ }; typedef unsigned int pkvm_handle_t; struct kvm_protected_vm { pkvm_handle_t handle; struct kvm_hyp_memcache teardown_mc; struct kvm_hyp_memcache stage2_teardown_mc; bool enabled; }; struct kvm_mpidr_data { u64 mpidr_mask; DECLARE_FLEX_ARRAY(u16, cmpidr_to_idx); }; static inline u16 kvm_mpidr_index(struct kvm_mpidr_data *data, u64 mpidr) { unsigned long index = 0, mask = data->mpidr_mask; unsigned long aff = mpidr & MPIDR_HWID_BITMASK; bitmap_gather(&index, &aff, &mask, fls(mask)); return index; } struct kvm_sysreg_masks; enum fgt_group_id { __NO_FGT_GROUP__, HFGRTR_GROUP, HFGWTR_GROUP = HFGRTR_GROUP, HDFGRTR_GROUP, HDFGWTR_GROUP = HDFGRTR_GROUP, HFGITR_GROUP, HAFGRTR_GROUP, HFGRTR2_GROUP, HFGWTR2_GROUP = HFGRTR2_GROUP, HDFGRTR2_GROUP, HDFGWTR2_GROUP = HDFGRTR2_GROUP, HFGITR2_GROUP, /* Must be last */ __NR_FGT_GROUP_IDS__ }; struct kvm_arch { struct kvm_s2_mmu mmu; /* * Fine-Grained UNDEF, mimicking the FGT layout defined by the * architecture. We track them globally, as we present the * same feature-set to all vcpus. * * Index 0 is currently spare. */ u64 fgu[__NR_FGT_GROUP_IDS__]; /* * Stage 2 paging state for VMs with nested S2 using a virtual * VMID. */ struct kvm_s2_mmu *nested_mmus; size_t nested_mmus_size; int nested_mmus_next; /* Interrupt controller */ struct vgic_dist vgic; /* Timers */ struct arch_timer_vm_data timer_data; /* Mandated version of PSCI */ u32 psci_version; /* Protects VM-scoped configuration data */ struct mutex config_lock; /* * If we encounter a data abort without valid instruction syndrome * information, report this to user space. User space can (and * should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is * supported. */ #define KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER 0 /* Memory Tagging Extension enabled for the guest */ #define KVM_ARCH_FLAG_MTE_ENABLED 1 /* At least one vCPU has ran in the VM */ #define KVM_ARCH_FLAG_HAS_RAN_ONCE 2 /* The vCPU feature set for the VM is configured */ #define KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED 3 /* PSCI SYSTEM_SUSPEND enabled for the guest */ #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 4 /* VM counter offset */ #define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 5 /* Timer PPIs made immutable */ #define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 6 /* Initial ID reg values loaded */ #define KVM_ARCH_FLAG_ID_REGS_INITIALIZED 7 /* Fine-Grained UNDEF initialised */ #define KVM_ARCH_FLAG_FGU_INITIALIZED 8 /* SVE exposed to guest */ #define KVM_ARCH_FLAG_GUEST_HAS_SVE 9 /* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */ #define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS 10 unsigned long flags; /* VM-wide vCPU feature set */ DECLARE_BITMAP(vcpu_features, KVM_VCPU_MAX_FEATURES); /* MPIDR to vcpu index mapping, optional */ struct kvm_mpidr_data *mpidr_data; /* * VM-wide PMU filter, implemented as a bitmap and big enough for * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+). */ unsigned long *pmu_filter; struct arm_pmu *arm_pmu; cpumask_var_t supported_cpus; /* Maximum number of counters for the guest */ u8 nr_pmu_counters; /* Iterator for idreg debugfs */ u8 idreg_debugfs_iter; /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; struct maple_tree smccc_filter; /* * Emulated CPU ID registers per VM * (Op0, Op1, CRn, CRm, Op2) of the ID registers to be saved in it * is (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8. * * These emulated idregs are VM-wide, but accessed from the context of a vCPU. * Atomic access to multiple idregs are guarded by kvm_arch.config_lock. */ #define IDREG_IDX(id) (((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id)) #define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1) u64 id_regs[KVM_ARM_ID_REG_NUM]; u64 midr_el1; u64 revidr_el1; u64 aidr_el1; u64 ctr_el0; /* Masks for VNCR-backed and general EL2 sysregs */ struct kvm_sysreg_masks *sysreg_masks; /* Count the number of VNCR_EL2 currently mapped */ atomic_t vncr_map_count; /* * For an untrusted host VM, 'pkvm.handle' is used to lookup * the associated pKVM instance in the hypervisor. */ struct kvm_protected_vm pkvm; }; struct kvm_vcpu_fault_info { u64 esr_el2; /* Hyp Syndrom Register */ u64 far_el2; /* Hyp Fault Address Register */ u64 hpfar_el2; /* Hyp IPA Fault Address Register */ u64 disr_el1; /* Deferred [SError] Status Register */ }; /* * VNCR() just places the VNCR_capable registers in the enum after * __VNCR_START__, and the value (after correction) to be an 8-byte offset * from the VNCR base. As we don't require the enum to be otherwise ordered, * we need the terrible hack below to ensure that we correctly size the * sys_regs array, no matter what. * * The __MAX__ macro has been lifted from Sean Eron Anderson's wonderful * treasure trove of bit hacks: * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax */ #define __MAX__(x,y) ((x) ^ (((x) ^ (y)) & -((x) < (y)))) #define VNCR(r) \ __before_##r, \ r = __VNCR_START__ + ((VNCR_ ## r) / 8), \ __after_##r = __MAX__(__before_##r - 1, r) #define MARKER(m) \ m, __after_##m = m - 1 enum vcpu_sysreg { __INVALID_SYSREG__, /* 0 is reserved as an invalid value */ MPIDR_EL1, /* MultiProcessor Affinity Register */ CLIDR_EL1, /* Cache Level ID Register */ CSSELR_EL1, /* Cache Size Selection Register */ TPIDR_EL0, /* Thread ID, User R/W */ TPIDRRO_EL0, /* Thread ID, User R/O */ TPIDR_EL1, /* Thread ID, Privileged */ CNTKCTL_EL1, /* Timer Control Register (EL1) */ PAR_EL1, /* Physical Address Register */ MDCCINT_EL1, /* Monitor Debug Comms Channel Interrupt Enable Reg */ OSLSR_EL1, /* OS Lock Status Register */ DISR_EL1, /* Deferred Interrupt Status Register */ /* Performance Monitors Registers */ PMCR_EL0, /* Control Register */ PMSELR_EL0, /* Event Counter Selection Register */ PMEVCNTR0_EL0, /* Event Counter Register (0-30) */ PMEVCNTR30_EL0 = PMEVCNTR0_EL0 + 30, PMCCNTR_EL0, /* Cycle Counter Register */ PMEVTYPER0_EL0, /* Event Type Register (0-30) */ PMEVTYPER30_EL0 = PMEVTYPER0_EL0 + 30, PMCCFILTR_EL0, /* Cycle Count Filter Register */ PMCNTENSET_EL0, /* Count Enable Set Register */ PMINTENSET_EL1, /* Interrupt Enable Set Register */ PMOVSSET_EL0, /* Overflow Flag Status Set Register */ PMUSERENR_EL0, /* User Enable Register */ /* Pointer Authentication Registers in a strict increasing order. */ APIAKEYLO_EL1, APIAKEYHI_EL1, APIBKEYLO_EL1, APIBKEYHI_EL1, APDAKEYLO_EL1, APDAKEYHI_EL1, APDBKEYLO_EL1, APDBKEYHI_EL1, APGAKEYLO_EL1, APGAKEYHI_EL1, /* Memory Tagging Extension registers */ RGSR_EL1, /* Random Allocation Tag Seed Register */ GCR_EL1, /* Tag Control Register */ TFSRE0_EL1, /* Tag Fault Status Register (EL0) */ POR_EL0, /* Permission Overlay Register 0 (EL0) */ /* FP/SIMD/SVE */ SVCR, FPMR, /* 32bit specific registers. */ DACR32_EL2, /* Domain Access Control Register */ IFSR32_EL2, /* Instruction Fault Status Register */ FPEXC32_EL2, /* Floating-Point Exception Control Register */ DBGVCR32_EL2, /* Debug Vector Catch Register */ /* EL2 registers */ SCTLR_EL2, /* System Control Register (EL2) */ ACTLR_EL2, /* Auxiliary Control Register (EL2) */ CPTR_EL2, /* Architectural Feature Trap Register (EL2) */ HACR_EL2, /* Hypervisor Auxiliary Control Register */ ZCR_EL2, /* SVE Control Register (EL2) */ TTBR0_EL2, /* Translation Table Base Register 0 (EL2) */ TTBR1_EL2, /* Translation Table Base Register 1 (EL2) */ TCR_EL2, /* Translation Control Register (EL2) */ PIRE0_EL2, /* Permission Indirection Register 0 (EL2) */ PIR_EL2, /* Permission Indirection Register 1 (EL2) */ POR_EL2, /* Permission Overlay Register 2 (EL2) */ SPSR_EL2, /* EL2 saved program status register */ ELR_EL2, /* EL2 exception link register */ AFSR0_EL2, /* Auxiliary Fault Status Register 0 (EL2) */ AFSR1_EL2, /* Auxiliary Fault Status Register 1 (EL2) */ ESR_EL2, /* Exception Syndrome Register (EL2) */ FAR_EL2, /* Fault Address Register (EL2) */ HPFAR_EL2, /* Hypervisor IPA Fault Address Register */ MAIR_EL2, /* Memory Attribute Indirection Register (EL2) */ AMAIR_EL2, /* Auxiliary Memory Attribute Indirection Register (EL2) */ VBAR_EL2, /* Vector Base Address Register (EL2) */ RVBAR_EL2, /* Reset Vector Base Address Register */ CONTEXTIDR_EL2, /* Context ID Register (EL2) */ SP_EL2, /* EL2 Stack Pointer */ CNTHP_CTL_EL2, CNTHP_CVAL_EL2, CNTHV_CTL_EL2, CNTHV_CVAL_EL2, /* Anything from this can be RES0/RES1 sanitised */ MARKER(__SANITISED_REG_START__), TCR2_EL2, /* Extended Translation Control Register (EL2) */ SCTLR2_EL2, /* System Control Register 2 (EL2) */ MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */ /* Any VNCR-capable reg goes after this point */ MARKER(__VNCR_START__), VNCR(SCTLR_EL1),/* System Control Register */ VNCR(ACTLR_EL1),/* Auxiliary Control Register */ VNCR(CPACR_EL1),/* Coprocessor Access Control */ VNCR(ZCR_EL1), /* SVE Control */ VNCR(TTBR0_EL1),/* Translation Table Base Register 0 */ VNCR(TTBR1_EL1),/* Translation Table Base Register 1 */ VNCR(TCR_EL1), /* Translation Control Register */ VNCR(TCR2_EL1), /* Extended Translation Control Register */ VNCR(SCTLR2_EL1), /* System Control Register 2 */ VNCR(ESR_EL1), /* Exception Syndrome Register */ VNCR(AFSR0_EL1),/* Auxiliary Fault Status Register 0 */ VNCR(AFSR1_EL1),/* Auxiliary Fault Status Register 1 */ VNCR(FAR_EL1), /* Fault Address Register */ VNCR(MAIR_EL1), /* Memory Attribute Indirection Register */ VNCR(VBAR_EL1), /* Vector Base Address Register */ VNCR(CONTEXTIDR_EL1), /* Context ID Register */ VNCR(AMAIR_EL1),/* Aux Memory Attribute Indirection Register */ VNCR(MDSCR_EL1),/* Monitor Debug System Control Register */ VNCR(ELR_EL1), VNCR(SP_EL1), VNCR(SPSR_EL1), VNCR(TFSR_EL1), /* Tag Fault Status Register (EL1) */ VNCR(VPIDR_EL2),/* Virtualization Processor ID Register */ VNCR(VMPIDR_EL2),/* Virtualization Multiprocessor ID Register */ VNCR(HCR_EL2), /* Hypervisor Configuration Register */ VNCR(HSTR_EL2), /* Hypervisor System Trap Register */ VNCR(VTTBR_EL2),/* Virtualization Translation Table Base Register */ VNCR(VTCR_EL2), /* Virtualization Translation Control Register */ VNCR(TPIDR_EL2),/* EL2 Software Thread ID Register */ VNCR(HCRX_EL2), /* Extended Hypervisor Configuration Register */ /* Permission Indirection Extension registers */ VNCR(PIR_EL1), /* Permission Indirection Register 1 (EL1) */ VNCR(PIRE0_EL1), /* Permission Indirection Register 0 (EL1) */ VNCR(POR_EL1), /* Permission Overlay Register 1 (EL1) */ /* FEAT_RAS registers */ VNCR(VDISR_EL2), VNCR(VSESR_EL2), VNCR(HFGRTR_EL2), VNCR(HFGWTR_EL2), VNCR(HFGITR_EL2), VNCR(HDFGRTR_EL2), VNCR(HDFGWTR_EL2), VNCR(HAFGRTR_EL2), VNCR(HFGRTR2_EL2), VNCR(HFGWTR2_EL2), VNCR(HFGITR2_EL2), VNCR(HDFGRTR2_EL2), VNCR(HDFGWTR2_EL2), VNCR(VNCR_EL2), VNCR(CNTVOFF_EL2), VNCR(CNTV_CVAL_EL0), VNCR(CNTV_CTL_EL0), VNCR(CNTP_CVAL_EL0), VNCR(CNTP_CTL_EL0), VNCR(ICH_LR0_EL2), VNCR(ICH_LR1_EL2), VNCR(ICH_LR2_EL2), VNCR(ICH_LR3_EL2), VNCR(ICH_LR4_EL2), VNCR(ICH_LR5_EL2), VNCR(ICH_LR6_EL2), VNCR(ICH_LR7_EL2), VNCR(ICH_LR8_EL2), VNCR(ICH_LR9_EL2), VNCR(ICH_LR10_EL2), VNCR(ICH_LR11_EL2), VNCR(ICH_LR12_EL2), VNCR(ICH_LR13_EL2), VNCR(ICH_LR14_EL2), VNCR(ICH_LR15_EL2), VNCR(ICH_AP0R0_EL2), VNCR(ICH_AP0R1_EL2), VNCR(ICH_AP0R2_EL2), VNCR(ICH_AP0R3_EL2), VNCR(ICH_AP1R0_EL2), VNCR(ICH_AP1R1_EL2), VNCR(ICH_AP1R2_EL2), VNCR(ICH_AP1R3_EL2), VNCR(ICH_HCR_EL2), VNCR(ICH_VMCR_EL2), NR_SYS_REGS /* Nothing after this line! */ }; struct kvm_sysreg_masks { struct { u64 res0; u64 res1; } mask[NR_SYS_REGS - __SANITISED_REG_START__]; }; struct fgt_masks { const char *str; u64 mask; u64 nmask; u64 res0; }; extern struct fgt_masks hfgrtr_masks; extern struct fgt_masks hfgwtr_masks; extern struct fgt_masks hfgitr_masks; extern struct fgt_masks hdfgrtr_masks; extern struct fgt_masks hdfgwtr_masks; extern struct fgt_masks hafgrtr_masks; extern struct fgt_masks hfgrtr2_masks; extern struct fgt_masks hfgwtr2_masks; extern struct fgt_masks hfgitr2_masks; extern struct fgt_masks hdfgrtr2_masks; extern struct fgt_masks hdfgwtr2_masks; extern struct fgt_masks kvm_nvhe_sym(hfgrtr_masks); extern struct fgt_masks kvm_nvhe_sym(hfgwtr_masks); extern struct fgt_masks kvm_nvhe_sym(hfgitr_masks); extern struct fgt_masks kvm_nvhe_sym(hdfgrtr_masks); extern struct fgt_masks kvm_nvhe_sym(hdfgwtr_masks); extern struct fgt_masks kvm_nvhe_sym(hafgrtr_masks); extern struct fgt_masks kvm_nvhe_sym(hfgrtr2_masks); extern struct fgt_masks kvm_nvhe_sym(hfgwtr2_masks); extern struct fgt_masks kvm_nvhe_sym(hfgitr2_masks); extern struct fgt_masks kvm_nvhe_sym(hdfgrtr2_masks); extern struct fgt_masks kvm_nvhe_sym(hdfgwtr2_masks); struct kvm_cpu_context { struct user_pt_regs regs; /* sp = sp_el0 */ u64 spsr_abt; u64 spsr_und; u64 spsr_irq; u64 spsr_fiq; struct user_fpsimd_state fp_regs; u64 sys_regs[NR_SYS_REGS]; struct kvm_vcpu *__hyp_running_vcpu; /* This pointer has to be 4kB aligned. */ u64 *vncr_array; }; struct cpu_sve_state { __u64 zcr_el1; /* * Ordering is important since __sve_save_state/__sve_restore_state * relies on it. */ __u32 fpsr; __u32 fpcr; /* Must be SVE_VQ_BYTES (128 bit) aligned. */ __u8 sve_regs[]; }; /* * This structure is instantiated on a per-CPU basis, and contains * data that is: * * - tied to a single physical CPU, and * - either have a lifetime that does not extend past vcpu_put() * - or is an invariant for the lifetime of the system * * Use host_data_ptr(field) as a way to access a pointer to such a * field. */ struct kvm_host_data { #define KVM_HOST_DATA_FLAG_HAS_SPE 0 #define KVM_HOST_DATA_FLAG_HAS_TRBE 1 #define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4 #define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5 #define KVM_HOST_DATA_FLAG_VCPU_IN_HYP_CONTEXT 6 #define KVM_HOST_DATA_FLAG_L1_VNCR_MAPPED 7 unsigned long flags; struct kvm_cpu_context host_ctxt; /* * Hyp VA. * sve_state is only used in pKVM and if system_supports_sve(). */ struct cpu_sve_state *sve_state; /* Used by pKVM only. */ u64 fpmr; /* Ownership of the FP regs */ enum { FP_STATE_FREE, FP_STATE_HOST_OWNED, FP_STATE_GUEST_OWNED, } fp_owner; /* * host_debug_state contains the host registers which are * saved and restored during world switches. */ struct { /* {Break,watch}point registers */ struct kvm_guest_debug_arch regs; /* Statistical profiling extension */ u64 pmscr_el1; /* Self-hosted trace */ u64 trfcr_el1; /* Values of trap registers for the host before guest entry. */ u64 mdcr_el2; } host_debug_state; /* Guest trace filter value */ u64 trfcr_while_in_guest; /* Number of programmable event counters (PMCR_EL0.N) for this CPU */ unsigned int nr_event_counters; /* Number of debug breakpoints/watchpoints for this CPU (minus 1) */ unsigned int debug_brps; unsigned int debug_wrps; }; struct kvm_host_psci_config { /* PSCI version used by host. */ u32 version; u32 smccc_version; /* Function IDs used by host if version is v0.1. */ struct psci_0_1_function_ids function_ids_0_1; bool psci_0_1_cpu_suspend_implemented; bool psci_0_1_cpu_on_implemented; bool psci_0_1_cpu_off_implemented; bool psci_0_1_migrate_implemented; }; extern struct kvm_host_psci_config kvm_nvhe_sym(kvm_host_psci_config); #define kvm_host_psci_config CHOOSE_NVHE_SYM(kvm_host_psci_config) extern s64 kvm_nvhe_sym(hyp_physvirt_offset); #define hyp_physvirt_offset CHOOSE_NVHE_SYM(hyp_physvirt_offset) extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS]; #define hyp_cpu_logical_map CHOOSE_NVHE_SYM(hyp_cpu_logical_map) struct vcpu_reset_state { unsigned long pc; unsigned long r0; bool be; bool reset; }; struct vncr_tlb; struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; /* * Guest floating point state * * The architecture has two main floating point extensions, * the original FPSIMD and SVE. These have overlapping * register views, with the FPSIMD V registers occupying the * low 128 bits of the SVE Z registers. When the core * floating point code saves the register state of a task it * records which view it saved in fp_type. */ void *sve_state; enum fp_type fp_type; unsigned int sve_max_vl; /* Stage 2 paging state used by the hardware on next switch */ struct kvm_s2_mmu *hw_mmu; /* Values of trap registers for the guest. */ u64 hcr_el2; u64 hcrx_el2; u64 mdcr_el2; /* Exception Information */ struct kvm_vcpu_fault_info fault; /* Configuration flags, set once and for all before the vcpu can run */ u8 cflags; /* Input flags to the hypervisor code, potentially cleared after use */ u8 iflags; /* State flags for kernel bookkeeping, unused by the hypervisor code */ u16 sflags; /* * Don't run the guest (internal implementation need). * * Contrary to the flags above, this is set/cleared outside of * a vcpu context, and thus cannot be mixed with the flags * themselves (or the flag accesses need to be made atomic). */ bool pause; /* * We maintain more than a single set of debug registers to support * debugging the guest from the host and to maintain separate host and * guest state during world switches. vcpu_debug_state are the debug * registers of the vcpu as the guest sees them. * * external_debug_state contains the debug values we want to debug the * guest. This is set via the KVM_SET_GUEST_DEBUG ioctl. */ struct kvm_guest_debug_arch vcpu_debug_state; struct kvm_guest_debug_arch external_debug_state; u64 external_mdscr_el1; enum { VCPU_DEBUG_FREE, VCPU_DEBUG_HOST_OWNED, VCPU_DEBUG_GUEST_OWNED, } debug_owner; /* VGIC state */ struct vgic_cpu vgic_cpu; struct arch_timer_cpu timer_cpu; struct kvm_pmu pmu; /* vcpu power state */ struct kvm_mp_state mp_state; spinlock_t mp_state_lock; /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; /* Pages to top-up the pKVM/EL2 guest pool */ struct kvm_hyp_memcache pkvm_memcache; /* Virtual SError ESR to restore when HCR_EL2.VSE is set */ u64 vsesr_el2; /* Additional reset state */ struct vcpu_reset_state reset_state; /* Guest PV state */ struct { u64 last_steal; gpa_t base; } steal; /* Per-vcpu CCSIDR override or NULL */ u32 *ccsidr; /* Per-vcpu TLB for VNCR_EL2 -- NULL when !NV */ struct vncr_tlb *vncr_tlb; }; /* * Each 'flag' is composed of a comma-separated triplet: * * - the flag-set it belongs to in the vcpu->arch structure * - the value for that flag * - the mask for that flag * * __vcpu_single_flag() builds such a triplet for a single-bit flag. * unpack_vcpu_flag() extract the flag value from the triplet for * direct use outside of the flag accessors. */ #define __vcpu_single_flag(_set, _f) _set, (_f), (_f) #define __unpack_flag(_set, _f, _m) _f #define unpack_vcpu_flag(...) __unpack_flag(__VA_ARGS__) #define __build_check_flag(v, flagset, f, m) \ do { \ typeof(v->arch.flagset) *_fset; \ \ /* Check that the flags fit in the mask */ \ BUILD_BUG_ON(HWEIGHT(m) != HWEIGHT((f) | (m))); \ /* Check that the flags fit in the type */ \ BUILD_BUG_ON((sizeof(*_fset) * 8) <= __fls(m)); \ } while (0) #define __vcpu_get_flag(v, flagset, f, m) \ ({ \ __build_check_flag(v, flagset, f, m); \ \ READ_ONCE(v->arch.flagset) & (m); \ }) /* * Note that the set/clear accessors must be preempt-safe in order to * avoid nesting them with load/put which also manipulate flags... */ #ifdef __KVM_NVHE_HYPERVISOR__ /* the nVHE hypervisor is always non-preemptible */ #define __vcpu_flags_preempt_disable() #define __vcpu_flags_preempt_enable() #else #define __vcpu_flags_preempt_disable() preempt_disable() #define __vcpu_flags_preempt_enable() preempt_enable() #endif #define __vcpu_set_flag(v, flagset, f, m) \ do { \ typeof(v->arch.flagset) *fset; \ \ __build_check_flag(v, flagset, f, m); \ \ fset = &v->arch.flagset; \ __vcpu_flags_preempt_disable(); \ if (HWEIGHT(m) > 1) \ *fset &= ~(m); \ *fset |= (f); \ __vcpu_flags_preempt_enable(); \ } while (0) #define __vcpu_clear_flag(v, flagset, f, m) \ do { \ typeof(v->arch.flagset) *fset; \ \ __build_check_flag(v, flagset, f, m); \ \ fset = &v->arch.flagset; \ __vcpu_flags_preempt_disable(); \ *fset &= ~(m); \ __vcpu_flags_preempt_enable(); \ } while (0) #define __vcpu_test_and_clear_flag(v, flagset, f, m) \ ({ \ typeof(v->arch.flagset) set; \ \ set = __vcpu_get_flag(v, flagset, f, m); \ __vcpu_clear_flag(v, flagset, f, m); \ \ set; \ }) #define vcpu_get_flag(v, ...) __vcpu_get_flag((v), __VA_ARGS__) #define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__) #define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__) #define vcpu_test_and_clear_flag(v, ...) \ __vcpu_test_and_clear_flag((v), __VA_ARGS__) /* KVM_ARM_VCPU_INIT completed */ #define VCPU_INITIALIZED __vcpu_single_flag(cflags, BIT(0)) /* SVE config completed */ #define VCPU_SVE_FINALIZED __vcpu_single_flag(cflags, BIT(1)) /* pKVM VCPU setup completed */ #define VCPU_PKVM_FINALIZED __vcpu_single_flag(cflags, BIT(2)) /* Exception pending */ #define PENDING_EXCEPTION __vcpu_single_flag(iflags, BIT(0)) /* * PC increment. Overlaps with EXCEPT_MASK on purpose so that it can't * be set together with an exception... */ #define INCREMENT_PC __vcpu_single_flag(iflags, BIT(1)) /* Target EL/MODE (not a single flag, but let's abuse the macro) */ #define EXCEPT_MASK __vcpu_single_flag(iflags, GENMASK(3, 1)) /* Helpers to encode exceptions with minimum fuss */ #define __EXCEPT_MASK_VAL unpack_vcpu_flag(EXCEPT_MASK) #define __EXCEPT_SHIFT __builtin_ctzl(__EXCEPT_MASK_VAL) #define __vcpu_except_flags(_f) iflags, (_f << __EXCEPT_SHIFT), __EXCEPT_MASK_VAL /* * When PENDING_EXCEPTION is set, EXCEPT_MASK can take the following * values: * * For AArch32 EL1: */ #define EXCEPT_AA32_UND __vcpu_except_flags(0) #define EXCEPT_AA32_IABT __vcpu_except_flags(1) #define EXCEPT_AA32_DABT __vcpu_except_flags(2) /* For AArch64: */ #define EXCEPT_AA64_EL1_SYNC __vcpu_except_flags(0) #define EXCEPT_AA64_EL1_IRQ __vcpu_except_flags(1) #define EXCEPT_AA64_EL1_FIQ __vcpu_except_flags(2) #define EXCEPT_AA64_EL1_SERR __vcpu_except_flags(3) /* For AArch64 with NV: */ #define EXCEPT_AA64_EL2_SYNC __vcpu_except_flags(4) #define EXCEPT_AA64_EL2_IRQ __vcpu_except_flags(5) #define EXCEPT_AA64_EL2_FIQ __vcpu_except_flags(6) #define EXCEPT_AA64_EL2_SERR __vcpu_except_flags(7) /* Physical CPU not in supported_cpus */ #define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(0)) /* WFIT instruction trapped */ #define IN_WFIT __vcpu_single_flag(sflags, BIT(1)) /* vcpu system registers loaded on physical CPU */ #define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(2)) /* Software step state is Active-pending for external debug */ #define HOST_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(3)) /* Software step state is Active pending for guest debug */ #define GUEST_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(4)) /* PMUSERENR for the guest EL0 is on physical CPU */ #define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(5)) /* WFI instruction trapped */ #define IN_WFI __vcpu_single_flag(sflags, BIT(6)) /* KVM is currently emulating a nested ERET */ #define IN_NESTED_ERET __vcpu_single_flag(sflags, BIT(7)) /* SError pending for nested guest */ #define NESTED_SERROR_PENDING __vcpu_single_flag(sflags, BIT(8)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ sve_ffr_offset((vcpu)->arch.sve_max_vl)) #define vcpu_sve_max_vq(vcpu) sve_vq_from_vl((vcpu)->arch.sve_max_vl) #define vcpu_sve_zcr_elx(vcpu) \ (unlikely(is_hyp_ctxt(vcpu)) ? ZCR_EL2 : ZCR_EL1) #define sve_state_size_from_vl(sve_max_vl) ({ \ size_t __size_ret; \ unsigned int __vq; \ \ if (WARN_ON(!sve_vl_valid(sve_max_vl))) { \ __size_ret = 0; \ } else { \ __vq = sve_vq_from_vl(sve_max_vl); \ __size_ret = SVE_SIG_REGS_SIZE(__vq); \ } \ \ __size_ret; \ }) #define vcpu_sve_state_size(vcpu) sve_state_size_from_vl((vcpu)->arch.sve_max_vl) #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_USE_HW | \ KVM_GUESTDBG_SINGLESTEP) #define kvm_has_sve(kvm) (system_supports_sve() && \ test_bit(KVM_ARCH_FLAG_GUEST_HAS_SVE, &(kvm)->arch.flags)) #ifdef __KVM_NVHE_HYPERVISOR__ #define vcpu_has_sve(vcpu) kvm_has_sve(kern_hyp_va((vcpu)->kvm)) #else #define vcpu_has_sve(vcpu) kvm_has_sve((vcpu)->kvm) #endif #ifdef CONFIG_ARM64_PTR_AUTH #define vcpu_has_ptrauth(vcpu) \ ((cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || \ cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && \ (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || \ vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC))) #else #define vcpu_has_ptrauth(vcpu) false #endif #define vcpu_on_unsupported_cpu(vcpu) \ vcpu_get_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_set_on_unsupported_cpu(vcpu) \ vcpu_set_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_clear_on_unsupported_cpu(vcpu) \ vcpu_clear_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_gp_regs(v) (&(v)->arch.ctxt.regs) /* * Only use __vcpu_sys_reg/ctxt_sys_reg if you know you want the * memory backed version of a register, and not the one most recently * accessed by a running VCPU. For example, for userspace access or * for system registers that are never context switched, but only * emulated. * * Don't bother with VNCR-based accesses in the nVHE code, it has no * business dealing with NV. */ static inline u64 *___ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r) { #if !defined (__KVM_NVHE_HYPERVISOR__) if (unlikely(cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) && r >= __VNCR_START__ && ctxt->vncr_array)) return &ctxt->vncr_array[r - __VNCR_START__]; #endif return (u64 *)&ctxt->sys_regs[r]; } #define __ctxt_sys_reg(c,r) \ ({ \ BUILD_BUG_ON(__builtin_constant_p(r) && \ (r) >= NR_SYS_REGS); \ ___ctxt_sys_reg(c, r); \ }) #define ctxt_sys_reg(c,r) (*__ctxt_sys_reg(c,r)) u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64); #define __vcpu_assign_sys_reg(v, r, val) \ do { \ const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ u64 __v = (val); \ if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__) \ __v = kvm_vcpu_apply_reg_masks((v), (r), __v); \ \ ctxt_sys_reg(ctxt, (r)) = __v; \ } while (0) #define __vcpu_rmw_sys_reg(v, r, op, val) \ do { \ const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ u64 __v = ctxt_sys_reg(ctxt, (r)); \ __v op (val); \ if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__) \ __v = kvm_vcpu_apply_reg_masks((v), (r), __v); \ \ ctxt_sys_reg(ctxt, (r)) = __v; \ } while (0) #define __vcpu_sys_reg(v,r) \ ({ \ const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ u64 __v = ctxt_sys_reg(ctxt, (r)); \ if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__) \ __v = kvm_vcpu_apply_reg_masks((v), (r), __v); \ __v; \ }) u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg); void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) { /* * *** VHE ONLY *** * * System registers listed in the switch are not saved on every * exit from the guest but are only saved on vcpu_put. * * SYSREGS_ON_CPU *MUST* be checked before using this helper. * * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but * should never be listed below, because the guest cannot modify its * own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's * thread when emulating cross-VCPU communication. */ if (!has_vhe()) return false; switch (reg) { case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break; case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break; case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break; case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break; case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break; case TCR2_EL1: *val = read_sysreg_s(SYS_TCR2_EL12); break; case PIR_EL1: *val = read_sysreg_s(SYS_PIR_EL12); break; case PIRE0_EL1: *val = read_sysreg_s(SYS_PIRE0_EL12); break; case POR_EL1: *val = read_sysreg_s(SYS_POR_EL12); break; case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break; case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break; case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break; case FAR_EL1: *val = read_sysreg_s(SYS_FAR_EL12); break; case MAIR_EL1: *val = read_sysreg_s(SYS_MAIR_EL12); break; case VBAR_EL1: *val = read_sysreg_s(SYS_VBAR_EL12); break; case CONTEXTIDR_EL1: *val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break; case TPIDR_EL0: *val = read_sysreg_s(SYS_TPIDR_EL0); break; case TPIDRRO_EL0: *val = read_sysreg_s(SYS_TPIDRRO_EL0); break; case TPIDR_EL1: *val = read_sysreg_s(SYS_TPIDR_EL1); break; case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break; case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break; case ELR_EL1: *val = read_sysreg_s(SYS_ELR_EL12); break; case SPSR_EL1: *val = read_sysreg_s(SYS_SPSR_EL12); break; case PAR_EL1: *val = read_sysreg_par(); break; case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break; case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break; case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break; case ZCR_EL1: *val = read_sysreg_s(SYS_ZCR_EL12); break; case SCTLR2_EL1: *val = read_sysreg_s(SYS_SCTLR2_EL12); break; default: return false; } return true; } static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) { /* * *** VHE ONLY *** * * System registers listed in the switch are not restored on every * entry to the guest but are only restored on vcpu_load. * * SYSREGS_ON_CPU *MUST* be checked before using this helper. * * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but * should never be listed below, because the MPIDR should only be set * once, before running the VCPU, and never changed later. */ if (!has_vhe()) return false; switch (reg) { case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break; case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break; case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break; case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break; case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break; case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break; case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break; case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break; case POR_EL1: write_sysreg_s(val, SYS_POR_EL12); break; case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break; case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break; case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break; case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break; case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break; case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break; case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break; case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break; case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break; case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break; case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break; case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break; case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break; case SPSR_EL1: write_sysreg_s(val, SYS_SPSR_EL12); break; case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break; case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break; case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break; case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break; case ZCR_EL1: write_sysreg_s(val, SYS_ZCR_EL12); break; case SCTLR2_EL1: write_sysreg_s(val, SYS_SCTLR2_EL12); break; default: return false; } return true; } struct kvm_vm_stat { struct kvm_vm_stat_generic generic; }; struct kvm_vcpu_stat { struct kvm_vcpu_stat_generic generic; u64 hvc_exit_stat; u64 wfe_exit_stat; u64 wfi_exit_stat; u64 mmio_exit_user; u64 mmio_exit_kernel; u64 signal_exits; u64 exits; }; unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu); int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); #define vcpu_has_run_once(vcpu) (!!READ_ONCE((vcpu)->pid)) #ifndef __KVM_NVHE_HYPERVISOR__ #define kvm_call_hyp_nvhe(f, ...) \ ({ \ struct arm_smccc_res res; \ \ arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f), \ ##__VA_ARGS__, &res); \ WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \ \ res.a1; \ }) /* * The isb() below is there to guarantee the same behaviour on VHE as on !VHE, * where the eret to EL1 acts as a context synchronization event. */ #define kvm_call_hyp(f, ...) \ do { \ if (has_vhe()) { \ f(__VA_ARGS__); \ isb(); \ } else { \ kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \ } \ } while(0) #define kvm_call_hyp_ret(f, ...) \ ({ \ typeof(f(__VA_ARGS__)) ret; \ \ if (has_vhe()) { \ ret = f(__VA_ARGS__); \ } else { \ ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \ } \ \ ret; \ }) #else /* __KVM_NVHE_HYPERVISOR__ */ #define kvm_call_hyp(f, ...) f(__VA_ARGS__) #define kvm_call_hyp_ret(f, ...) f(__VA_ARGS__) #define kvm_call_hyp_nvhe(f, ...) f(__VA_ARGS__) #endif /* __KVM_NVHE_HYPERVISOR__ */ int handle_exit(struct kvm_vcpu *vcpu, int exception_index); void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index); int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu); int kvm_handle_cp14_32(struct kvm_vcpu *vcpu); int kvm_handle_cp14_64(struct kvm_vcpu *vcpu); int kvm_handle_cp15_32(struct kvm_vcpu *vcpu); int kvm_handle_cp15_64(struct kvm_vcpu *vcpu); int kvm_handle_sys_reg(struct kvm_vcpu *vcpu); int kvm_handle_cp10_id(struct kvm_vcpu *vcpu); void kvm_sys_regs_create_debugfs(struct kvm *kvm); void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); int __init kvm_sys_reg_table_init(void); struct sys_reg_desc; int __init populate_sysreg_config(const struct sys_reg_desc *sr, unsigned int idx); int __init populate_nv_trap_config(void); void kvm_calculate_traps(struct kvm_vcpu *vcpu); /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); int kvm_handle_mmio_return(struct kvm_vcpu *vcpu); int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa); /* * Returns true if a Performance Monitoring Interrupt (PMI), a.k.a. perf event, * arrived in guest context. For arm64, any event that arrives while a vCPU is * loaded is considered to be "in guest". */ static inline bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu) { return IS_ENABLED(CONFIG_GUEST_PERF_EVENTS) && !!vcpu; } long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu); gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu); void kvm_update_stolen_time(struct kvm_vcpu *vcpu); bool kvm_arm_pvtime_supported(void); int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); extern unsigned int __ro_after_init kvm_arm_vmid_bits; int __init kvm_arm_vmid_alloc_init(void); void __init kvm_arm_vmid_alloc_free(void); void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid); void kvm_arm_vmid_clear_active(void); static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch) { vcpu_arch->steal.base = INVALID_GPA; } static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch) { return (vcpu_arch->steal.base != INVALID_GPA); } struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data); /* * How we access per-CPU host data depends on the where we access it from, * and the mode we're in: * * - VHE and nVHE hypervisor bits use their locally defined instance * * - the rest of the kernel use either the VHE or nVHE one, depending on * the mode we're running in. * * Unless we're in protected mode, fully deprivileged, and the nVHE * per-CPU stuff is exclusively accessible to the protected EL2 code. * In this case, the EL1 code uses the *VHE* data as its private state * (which makes sense in a way as there shouldn't be any shared state * between the host and the hypervisor). * * Yes, this is all totally trivial. Shoot me now. */ #if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__) #define host_data_ptr(f) (&this_cpu_ptr(&kvm_host_data)->f) #else #define host_data_ptr(f) \ (static_branch_unlikely(&kvm_protected_mode_initialized) ? \ &this_cpu_ptr(&kvm_host_data)->f : \ &this_cpu_ptr_hyp_sym(kvm_host_data)->f) #endif #define host_data_test_flag(flag) \ (test_bit(KVM_HOST_DATA_FLAG_##flag, host_data_ptr(flags))) #define host_data_set_flag(flag) \ set_bit(KVM_HOST_DATA_FLAG_##flag, host_data_ptr(flags)) #define host_data_clear_flag(flag) \ clear_bit(KVM_HOST_DATA_FLAG_##flag, host_data_ptr(flags)) /* Check whether the FP regs are owned by the guest */ static inline bool guest_owns_fp_regs(void) { return *host_data_ptr(fp_owner) == FP_STATE_GUEST_OWNED; } /* Check whether the FP regs are owned by the host */ static inline bool host_owns_fp_regs(void) { return *host_data_ptr(fp_owner) == FP_STATE_HOST_OWNED; } static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) { /* The host's MPIDR is immutable, so let's set it up at boot time */ ctxt_sys_reg(cpu_ctxt, MPIDR_EL1) = read_cpuid_mpidr(); } static inline bool kvm_system_needs_idmapped_vectors(void) { return cpus_have_final_cap(ARM64_SPECTRE_V3A); } void kvm_init_host_debug_data(void); void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu); void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu); void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu); void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val); #define kvm_vcpu_os_lock_enabled(vcpu) \ (!!(__vcpu_sys_reg(vcpu, OSLSR_EL1) & OSLSR_EL1_OSLK)) #define kvm_debug_regs_in_use(vcpu) \ ((vcpu)->arch.debug_owner != VCPU_DEBUG_FREE) #define kvm_host_owns_debug_regs(vcpu) \ ((vcpu)->arch.debug_owner == VCPU_DEBUG_HOST_OWNED) #define kvm_guest_owns_debug_regs(vcpu) \ ((vcpu)->arch.debug_owner == VCPU_DEBUG_GUEST_OWNED) int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, struct kvm_arm_copy_mte_tags *copy_tags); int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, struct kvm_arm_counter_offset *offset); int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *range); /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu); static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) { return (!has_vhe() && attr->exclude_host); } #ifdef CONFIG_KVM void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr); void kvm_clr_pmu_events(u64 clr); bool kvm_set_pmuserenr(u64 val); void kvm_enable_trbe(void); void kvm_disable_trbe(void); void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest); #else static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u64 clr) {} static inline bool kvm_set_pmuserenr(u64 val) { return false; } static inline void kvm_enable_trbe(void) {} static inline void kvm_disable_trbe(void) {} static inline void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest) {} #endif void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu); void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu); int __init kvm_set_ipa_limit(void); u32 kvm_get_pa_bits(struct kvm *kvm); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE #define kvm_vm_is_protected(kvm) (is_protected_kvm_enabled() && (kvm)->arch.pkvm.enabled) #define vcpu_is_protected(vcpu) kvm_vm_is_protected((vcpu)->kvm) int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); #define kvm_arm_vcpu_sve_finalized(vcpu) vcpu_get_flag(vcpu, VCPU_SVE_FINALIZED) #define kvm_has_mte(kvm) \ (system_supports_mte() && \ test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags)) #define kvm_supports_32bit_el0() \ (system_supports_32bit_el0() && \ !static_branch_unlikely(&arm64_mismatched_32bit_el0)) #define kvm_vm_has_ran_once(kvm) \ (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &(kvm)->arch.flags)) static inline bool __vcpu_has_feature(const struct kvm_arch *ka, int feature) { return test_bit(feature, ka->vcpu_features); } #define kvm_vcpu_has_feature(k, f) __vcpu_has_feature(&(k)->arch, (f)) #define vcpu_has_feature(v, f) __vcpu_has_feature(&(v)->kvm->arch, (f)) #define kvm_vcpu_initialized(v) vcpu_get_flag(vcpu, VCPU_INITIALIZED) int kvm_trng_call(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM extern phys_addr_t hyp_mem_base; extern phys_addr_t hyp_mem_size; void __init kvm_hyp_reserve(void); #else static inline void kvm_hyp_reserve(void) { } #endif void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); static inline u64 *__vm_id_reg(struct kvm_arch *ka, u32 reg) { switch (reg) { case sys_reg(3, 0, 0, 1, 0) ... sys_reg(3, 0, 0, 7, 7): return &ka->id_regs[IDREG_IDX(reg)]; case SYS_CTR_EL0: return &ka->ctr_el0; case SYS_MIDR_EL1: return &ka->midr_el1; case SYS_REVIDR_EL1: return &ka->revidr_el1; case SYS_AIDR_EL1: return &ka->aidr_el1; default: WARN_ON_ONCE(1); return NULL; } } #define kvm_read_vm_id_reg(kvm, reg) \ ({ u64 __val = *__vm_id_reg(&(kvm)->arch, reg); __val; }) void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); #define __expand_field_sign_unsigned(id, fld, val) \ ((u64)SYS_FIELD_VALUE(id, fld, val)) #define __expand_field_sign_signed(id, fld, val) \ ({ \ u64 __val = SYS_FIELD_VALUE(id, fld, val); \ sign_extend64(__val, id##_##fld##_WIDTH - 1); \ }) #define get_idreg_field_unsigned(kvm, id, fld) \ ({ \ u64 __val = kvm_read_vm_id_reg((kvm), SYS_##id); \ FIELD_GET(id##_##fld##_MASK, __val); \ }) #define get_idreg_field_signed(kvm, id, fld) \ ({ \ u64 __val = get_idreg_field_unsigned(kvm, id, fld); \ sign_extend64(__val, id##_##fld##_WIDTH - 1); \ }) #define get_idreg_field_enum(kvm, id, fld) \ get_idreg_field_unsigned(kvm, id, fld) #define kvm_cmp_feat_signed(kvm, id, fld, op, limit) \ (get_idreg_field_signed((kvm), id, fld) op __expand_field_sign_signed(id, fld, limit)) #define kvm_cmp_feat_unsigned(kvm, id, fld, op, limit) \ (get_idreg_field_unsigned((kvm), id, fld) op __expand_field_sign_unsigned(id, fld, limit)) #define kvm_cmp_feat(kvm, id, fld, op, limit) \ (id##_##fld##_SIGNED ? \ kvm_cmp_feat_signed(kvm, id, fld, op, limit) : \ kvm_cmp_feat_unsigned(kvm, id, fld, op, limit)) #define __kvm_has_feat(kvm, id, fld, limit) \ kvm_cmp_feat(kvm, id, fld, >=, limit) #define kvm_has_feat(kvm, ...) __kvm_has_feat(kvm, __VA_ARGS__) #define __kvm_has_feat_enum(kvm, id, fld, val) \ kvm_cmp_feat_unsigned(kvm, id, fld, ==, val) #define kvm_has_feat_enum(kvm, ...) __kvm_has_feat_enum(kvm, __VA_ARGS__) #define kvm_has_feat_range(kvm, id, fld, min, max) \ (kvm_cmp_feat(kvm, id, fld, >=, min) && \ kvm_cmp_feat(kvm, id, fld, <=, max)) /* Check for a given level of PAuth support */ #define kvm_has_pauth(k, l) \ ({ \ bool pa, pi, pa3; \ \ pa = kvm_has_feat((k), ID_AA64ISAR1_EL1, APA, l); \ pa &= kvm_has_feat((k), ID_AA64ISAR1_EL1, GPA, IMP); \ pi = kvm_has_feat((k), ID_AA64ISAR1_EL1, API, l); \ pi &= kvm_has_feat((k), ID_AA64ISAR1_EL1, GPI, IMP); \ pa3 = kvm_has_feat((k), ID_AA64ISAR2_EL1, APA3, l); \ pa3 &= kvm_has_feat((k), ID_AA64ISAR2_EL1, GPA3, IMP); \ \ (pa + pi + pa3) == 1; \ }) #define kvm_has_fpmr(k) \ (system_supports_fpmr() && \ kvm_has_feat((k), ID_AA64PFR2_EL1, FPMR, IMP)) #define kvm_has_tcr2(k) \ (kvm_has_feat((k), ID_AA64MMFR3_EL1, TCRX, IMP)) #define kvm_has_s1pie(k) \ (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1PIE, IMP)) #define kvm_has_s1poe(k) \ (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP)) #define kvm_has_ras(k) \ (kvm_has_feat((k), ID_AA64PFR0_EL1, RAS, IMP)) #define kvm_has_sctlr2(k) \ (kvm_has_feat((k), ID_AA64MMFR3_EL1, SCTLRX, IMP)) static inline bool kvm_arch_has_irq_bypass(void) { return true; } void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt); void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1); void check_feature_map(void); #endif /* __ARM64_KVM_HOST_H__ */
928 928 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_PAGE_EXT_H #define __LINUX_PAGE_EXT_H #include <linux/types.h> #include <linux/mmzone.h> #include <linux/stacktrace.h> struct pglist_data; #ifdef CONFIG_PAGE_EXTENSION /** * struct page_ext_operations - per page_ext client operations * @offset: Offset to the client's data within page_ext. Offset is returned to * the client by page_ext_init. * @size: The size of the client data within page_ext. * @need: Function that returns true if client requires page_ext. * @init: (optional) Called to initialize client once page_exts are allocated. * @need_shared_flags: True when client is using shared page_ext->flags * field. * * Each Page Extension client must define page_ext_operations in * page_ext_ops array. */ struct page_ext_operations { size_t offset; size_t size; bool (*need)(void); void (*init)(void); bool need_shared_flags; }; /* * The page_ext_flags users must set need_shared_flags to true. */ enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, #endif }; /* * Page Extension can be considered as an extended mem_map. * A page_ext page is associated with every page descriptor. The * page_ext helps us add more information about the page. * All page_ext are allocated at boot or memory hotplug event, * then the page_ext for pfn always exists. */ struct page_ext { unsigned long flags; }; extern bool early_page_ext; extern unsigned long page_ext_size; extern void pgdat_page_ext_init(struct pglist_data *pgdat); static inline bool early_page_ext_enabled(void) { return early_page_ext; } #ifdef CONFIG_SPARSEMEM static inline void page_ext_init_flatmem(void) { } extern void page_ext_init(void); static inline void page_ext_init_flatmem_late(void) { } static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) { /* * page_ext is allocated per memory section. Once we cross a * memory section, we have to fetch the new pointer. */ return next_pfn % PAGES_PER_SECTION; } #else extern void page_ext_init_flatmem(void); extern void page_ext_init_flatmem_late(void); static inline void page_ext_init(void) { } static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) { return true; } #endif extern struct page_ext *page_ext_get(const struct page *page); extern void page_ext_put(struct page_ext *page_ext); extern struct page_ext *page_ext_lookup(unsigned long pfn); static inline void *page_ext_data(struct page_ext *page_ext, struct page_ext_operations *ops) { return (void *)(page_ext) + ops->offset; } static inline struct page_ext *page_ext_next(struct page_ext *curr) { void *next = curr; next += page_ext_size; return next; } struct page_ext_iter { unsigned long index; unsigned long start_pfn; struct page_ext *page_ext; }; /** * page_ext_iter_begin() - Prepare for iterating through page extensions. * @iter: page extension iterator. * @pfn: PFN of the page we're interested in. * * Must be called with RCU read lock taken. * * Return: NULL if no page_ext exists for this page. */ static inline struct page_ext *page_ext_iter_begin(struct page_ext_iter *iter, unsigned long pfn) { iter->index = 0; iter->start_pfn = pfn; iter->page_ext = page_ext_lookup(pfn); return iter->page_ext; } /** * page_ext_iter_next() - Get next page extension * @iter: page extension iterator. * * Must be called with RCU read lock taken. * * Return: NULL if no next page_ext exists. */ static inline struct page_ext *page_ext_iter_next(struct page_ext_iter *iter) { unsigned long pfn; if (WARN_ON_ONCE(!iter->page_ext)) return NULL; iter->index++; pfn = iter->start_pfn + iter->index; if (page_ext_iter_next_fast_possible(pfn)) iter->page_ext = page_ext_next(iter->page_ext); else iter->page_ext = page_ext_lookup(pfn); return iter->page_ext; } /** * page_ext_iter_get() - Get current page extension * @iter: page extension iterator. * * Return: NULL if no page_ext exists for this iterator. */ static inline struct page_ext *page_ext_iter_get(const struct page_ext_iter *iter) { return iter->page_ext; } /** * for_each_page_ext(): iterate through page_ext objects. * @__page: the page we're interested in * @__pgcount: how many pages to iterate through * @__page_ext: struct page_ext pointer where the current page_ext * object is returned * @__iter: struct page_ext_iter object (defined in the stack) * * IMPORTANT: must be called with RCU read lock taken. */ #define for_each_page_ext(__page, __pgcount, __page_ext, __iter) \ for (__page_ext = page_ext_iter_begin(&__iter, page_to_pfn(__page));\ __page_ext && __iter.index < __pgcount; \ __page_ext = page_ext_iter_next(&__iter)) #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; static inline bool early_page_ext_enabled(void) { return false; } static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } static inline void page_ext_init(void) { } static inline void page_ext_init_flatmem_late(void) { } static inline void page_ext_init_flatmem(void) { } static inline struct page_ext *page_ext_get(const struct page *page) { return NULL; } static inline void page_ext_put(struct page_ext *page_ext) { } #endif /* CONFIG_PAGE_EXTENSION */ #endif /* __LINUX_PAGE_EXT_H */
56 1 54 1 54 2 1 2 52 39 39 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 - Linaro and Columbia University * Author: Jintack Lim <jintack.lim@linaro.org> */ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_nested.h> #include "hyp/include/hyp/adjust_pc.h" #include "trace.h" enum trap_behaviour { BEHAVE_HANDLE_LOCALLY = 0, BEHAVE_FORWARD_READ = BIT(0), BEHAVE_FORWARD_WRITE = BIT(1), BEHAVE_FORWARD_RW = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, /* Traps that take effect in Host EL0, this is rare! */ BEHAVE_FORWARD_IN_HOST_EL0 = BIT(2), }; struct trap_bits { const enum vcpu_sysreg index; const enum trap_behaviour behaviour; const u64 value; const u64 mask; }; /* Coarse Grained Trap definitions */ enum cgt_group_id { /* Indicates no coarse trap control */ __RESERVED__, /* * The first batch of IDs denote coarse trapping that are used * on their own instead of being part of a combination of * trap controls. */ CGT_HCR_TID1, CGT_HCR_TID2, CGT_HCR_TID3, CGT_HCR_IMO, CGT_HCR_FMO, CGT_HCR_TIDCP, CGT_HCR_TACR, CGT_HCR_TSW, CGT_HCR_TPC, CGT_HCR_TPU, CGT_HCR_TTLB, CGT_HCR_TVM, CGT_HCR_TDZ, CGT_HCR_TRVM, CGT_HCR_TLOR, CGT_HCR_TERR, CGT_HCR_APK, CGT_HCR_NV, CGT_HCR_NV_nNV2, CGT_HCR_NV1_nNV2, CGT_HCR_AT, CGT_HCR_nFIEN, CGT_HCR_TID4, CGT_HCR_TICAB, CGT_HCR_TOCU, CGT_HCR_ENSCXT, CGT_HCR_TTLBIS, CGT_HCR_TTLBOS, CGT_MDCR_TPMCR, CGT_MDCR_TPM, CGT_MDCR_TDE, CGT_MDCR_TDA, CGT_MDCR_TDOSA, CGT_MDCR_TDRA, CGT_MDCR_E2PB, CGT_MDCR_TPMS, CGT_MDCR_TTRF, CGT_MDCR_E2TB, CGT_MDCR_TDCC, CGT_CPTR_TAM, CGT_CPTR_TCPAC, CGT_HCRX_EnFPM, CGT_HCRX_TCR2En, CGT_HCRX_SCTLR2En, CGT_CNTHCTL_EL1TVT, CGT_CNTHCTL_EL1TVCT, CGT_ICH_HCR_TC, CGT_ICH_HCR_TALL0, CGT_ICH_HCR_TALL1, CGT_ICH_HCR_TDIR, /* * Anything after this point is a combination of coarse trap * controls, which must all be evaluated to decide what to do. */ __MULTIPLE_CONTROL_BITS__, CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__, CGT_HCR_TID2_TID4, CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB_TTLBOS, CGT_HCR_TVM_TRVM, CGT_HCR_TVM_TRVM_HCRX_TCR2En, CGT_HCR_TVM_TRVM_HCRX_SCTLR2En, CGT_HCR_TPU_TICAB, CGT_HCR_TPU_TOCU, CGT_HCR_NV1_nNV2_ENSCXT, CGT_MDCR_TPM_TPMCR, CGT_MDCR_TPM_HPMN, CGT_MDCR_TDE_TDA, CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE_TDRA, CGT_MDCR_TDCC_TDE_TDA, CGT_ICH_HCR_TC_TDIR, /* * Anything after this point requires a callback evaluating a * complex trap condition. Ugly stuff. */ __COMPLEX_CONDITIONS__, CGT_CNTHCTL_EL1PCTEN = __COMPLEX_CONDITIONS__, CGT_CNTHCTL_EL1PTEN, CGT_CNTHCTL_EL1NVPCT, CGT_CNTHCTL_EL1NVVCT, CGT_CPTR_TTA, CGT_MDCR_HPMN, /* Must be last */ __NR_CGT_GROUP_IDS__ }; static const struct trap_bits coarse_trap_bits[] = { [CGT_HCR_TID1] = { .index = HCR_EL2, .value = HCR_TID1, .mask = HCR_TID1, .behaviour = BEHAVE_FORWARD_READ, }, [CGT_HCR_TID2] = { .index = HCR_EL2, .value = HCR_TID2, .mask = HCR_TID2, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TID3] = { .index = HCR_EL2, .value = HCR_TID3, .mask = HCR_TID3, .behaviour = BEHAVE_FORWARD_READ, }, [CGT_HCR_IMO] = { .index = HCR_EL2, .value = HCR_IMO, .mask = HCR_IMO, .behaviour = BEHAVE_FORWARD_WRITE, }, [CGT_HCR_FMO] = { .index = HCR_EL2, .value = HCR_FMO, .mask = HCR_FMO, .behaviour = BEHAVE_FORWARD_WRITE, }, [CGT_HCR_TIDCP] = { .index = HCR_EL2, .value = HCR_TIDCP, .mask = HCR_TIDCP, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TACR] = { .index = HCR_EL2, .value = HCR_TACR, .mask = HCR_TACR, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TSW] = { .index = HCR_EL2, .value = HCR_TSW, .mask = HCR_TSW, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TPC] = { /* Also called TCPC when FEAT_DPB is implemented */ .index = HCR_EL2, .value = HCR_TPC, .mask = HCR_TPC, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TPU] = { .index = HCR_EL2, .value = HCR_TPU, .mask = HCR_TPU, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLB] = { .index = HCR_EL2, .value = HCR_TTLB, .mask = HCR_TTLB, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TVM] = { .index = HCR_EL2, .value = HCR_TVM, .mask = HCR_TVM, .behaviour = BEHAVE_FORWARD_WRITE, }, [CGT_HCR_TDZ] = { .index = HCR_EL2, .value = HCR_TDZ, .mask = HCR_TDZ, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TRVM] = { .index = HCR_EL2, .value = HCR_TRVM, .mask = HCR_TRVM, .behaviour = BEHAVE_FORWARD_READ, }, [CGT_HCR_TLOR] = { .index = HCR_EL2, .value = HCR_TLOR, .mask = HCR_TLOR, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TERR] = { .index = HCR_EL2, .value = HCR_TERR, .mask = HCR_TERR, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_APK] = { .index = HCR_EL2, .value = 0, .mask = HCR_APK, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV] = { .index = HCR_EL2, .value = HCR_NV, .mask = HCR_NV, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV_nNV2] = { .index = HCR_EL2, .value = HCR_NV, .mask = HCR_NV | HCR_NV2, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV1_nNV2] = { .index = HCR_EL2, .value = HCR_NV | HCR_NV1, .mask = HCR_NV | HCR_NV1 | HCR_NV2, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_AT] = { .index = HCR_EL2, .value = HCR_AT, .mask = HCR_AT, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_nFIEN] = { .index = HCR_EL2, .value = 0, .mask = HCR_FIEN, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TID4] = { .index = HCR_EL2, .value = HCR_TID4, .mask = HCR_TID4, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TICAB] = { .index = HCR_EL2, .value = HCR_TICAB, .mask = HCR_TICAB, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TOCU] = { .index = HCR_EL2, .value = HCR_TOCU, .mask = HCR_TOCU, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_ENSCXT] = { .index = HCR_EL2, .value = 0, .mask = HCR_ENSCXT, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLBIS] = { .index = HCR_EL2, .value = HCR_TTLBIS, .mask = HCR_TTLBIS, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLBOS] = { .index = HCR_EL2, .value = HCR_TTLBOS, .mask = HCR_TTLBOS, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPMCR] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMCR, .mask = MDCR_EL2_TPMCR, .behaviour = BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0, }, [CGT_MDCR_TPM] = { .index = MDCR_EL2, .value = MDCR_EL2_TPM, .mask = MDCR_EL2_TPM, .behaviour = BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0, }, [CGT_MDCR_TDE] = { .index = MDCR_EL2, .value = MDCR_EL2_TDE, .mask = MDCR_EL2_TDE, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDA, .mask = MDCR_EL2_TDA, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDOSA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDOSA, .mask = MDCR_EL2_TDOSA, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDRA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDRA, .mask = MDCR_EL2_TDRA, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_E2PB] = { .index = MDCR_EL2, .value = 0, .mask = BIT(MDCR_EL2_E2PB_SHIFT), .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPMS] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMS, .mask = MDCR_EL2_TPMS, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TTRF] = { .index = MDCR_EL2, .value = MDCR_EL2_TTRF, .mask = MDCR_EL2_TTRF, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_E2TB] = { .index = MDCR_EL2, .value = 0, .mask = BIT(MDCR_EL2_E2TB_SHIFT), .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDCC] = { .index = MDCR_EL2, .value = MDCR_EL2_TDCC, .mask = MDCR_EL2_TDCC, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CPTR_TAM] = { .index = CPTR_EL2, .value = CPTR_EL2_TAM, .mask = CPTR_EL2_TAM, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CPTR_TCPAC] = { .index = CPTR_EL2, .value = CPTR_EL2_TCPAC, .mask = CPTR_EL2_TCPAC, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCRX_EnFPM] = { .index = HCRX_EL2, .value = 0, .mask = HCRX_EL2_EnFPM, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCRX_TCR2En] = { .index = HCRX_EL2, .value = 0, .mask = HCRX_EL2_TCR2En, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCRX_SCTLR2En] = { .index = HCRX_EL2, .value = 0, .mask = HCRX_EL2_SCTLR2En, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CNTHCTL_EL1TVT] = { .index = CNTHCTL_EL2, .value = CNTHCTL_EL1TVT, .mask = CNTHCTL_EL1TVT, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CNTHCTL_EL1TVCT] = { .index = CNTHCTL_EL2, .value = CNTHCTL_EL1TVCT, .mask = CNTHCTL_EL1TVCT, .behaviour = BEHAVE_FORWARD_READ, }, [CGT_ICH_HCR_TC] = { .index = ICH_HCR_EL2, .value = ICH_HCR_EL2_TC, .mask = ICH_HCR_EL2_TC, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TALL0] = { .index = ICH_HCR_EL2, .value = ICH_HCR_EL2_TALL0, .mask = ICH_HCR_EL2_TALL0, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TALL1] = { .index = ICH_HCR_EL2, .value = ICH_HCR_EL2_TALL1, .mask = ICH_HCR_EL2_TALL1, .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TDIR] = { .index = ICH_HCR_EL2, .value = ICH_HCR_EL2_TDIR, .mask = ICH_HCR_EL2_TDIR, .behaviour = BEHAVE_FORWARD_RW, }, }; #define MCB(id, ...) \ [id - __MULTIPLE_CONTROL_BITS__] = \ (const enum cgt_group_id[]){ \ __VA_ARGS__, __RESERVED__ \ } static const enum cgt_group_id *coarse_control_combo[] = { MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4), MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS), MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS), MCB(CGT_HCR_TVM_TRVM, CGT_HCR_TVM, CGT_HCR_TRVM), MCB(CGT_HCR_TVM_TRVM_HCRX_TCR2En, CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_TCR2En), MCB(CGT_HCR_TVM_TRVM_HCRX_SCTLR2En, CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_SCTLR2En), MCB(CGT_HCR_TPU_TICAB, CGT_HCR_TPU, CGT_HCR_TICAB), MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU), MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT), MCB(CGT_MDCR_TPM_TPMCR, CGT_MDCR_TPM, CGT_MDCR_TPMCR), MCB(CGT_MDCR_TPM_HPMN, CGT_MDCR_TPM, CGT_MDCR_HPMN), MCB(CGT_MDCR_TDE_TDA, CGT_MDCR_TDE, CGT_MDCR_TDA), MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA), MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA), MCB(CGT_MDCR_TDCC_TDE_TDA, CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA), MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC, CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC), MCB(CGT_ICH_HCR_TC_TDIR, CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR), }; typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *); /* * Warning, maximum confusion ahead. * * When E2H=0, CNTHCTL_EL2[1:0] are defined as EL1PCEN:EL1PCTEN * When E2H=1, CNTHCTL_EL2[11:10] are defined as EL1PTEN:EL1PCTEN * * Note the single letter difference? Yet, the bits have the same * function despite a different layout and a different name. * * We don't try to reconcile this mess. We just use the E2H=0 bits * to generate something that is in the E2H=1 format, and live with * it. You're welcome. */ static u64 get_sanitized_cnthctl(struct kvm_vcpu *vcpu) { u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); if (!vcpu_el2_e2h_is_set(vcpu)) val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10; return val & ((CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN) << 10); } static enum trap_behaviour check_cnthctl_el1pcten(struct kvm_vcpu *vcpu) { if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCTEN << 10)) return BEHAVE_HANDLE_LOCALLY; return BEHAVE_FORWARD_RW; } static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) { if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCEN << 10)) return BEHAVE_HANDLE_LOCALLY; return BEHAVE_FORWARD_RW; } static bool is_nested_nv2_guest(struct kvm_vcpu *vcpu) { u64 val; val = __vcpu_sys_reg(vcpu, HCR_EL2); return ((val & (HCR_E2H | HCR_TGE | HCR_NV2 | HCR_NV1 | HCR_NV)) == (HCR_E2H | HCR_NV2 | HCR_NV)); } static enum trap_behaviour check_cnthctl_el1nvpct(struct kvm_vcpu *vcpu) { if (!is_nested_nv2_guest(vcpu) || !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVPCT)) return BEHAVE_HANDLE_LOCALLY; return BEHAVE_FORWARD_RW; } static enum trap_behaviour check_cnthctl_el1nvvct(struct kvm_vcpu *vcpu) { if (!is_nested_nv2_guest(vcpu) || !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVVCT)) return BEHAVE_HANDLE_LOCALLY; return BEHAVE_FORWARD_RW; } static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) { u64 val = __vcpu_sys_reg(vcpu, CPTR_EL2); if (!vcpu_el2_e2h_is_set(vcpu)) val = translate_cptr_el2_to_cpacr_el1(val); if (val & CPACR_EL1_TTA) return BEHAVE_FORWARD_RW; return BEHAVE_HANDLE_LOCALLY; } static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu) { u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); unsigned int idx; switch (sysreg) { case SYS_PMEVTYPERn_EL0(0) ... SYS_PMEVTYPERn_EL0(30): case SYS_PMEVCNTRn_EL0(0) ... SYS_PMEVCNTRn_EL0(30): idx = (sys_reg_CRm(sysreg) & 0x3) << 3 | sys_reg_Op2(sysreg); break; case SYS_PMXEVTYPER_EL0: case SYS_PMXEVCNTR_EL0: idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); break; default: /* Someone used this trap helper for something else... */ KVM_BUG_ON(1, vcpu->kvm); return BEHAVE_HANDLE_LOCALLY; } if (kvm_pmu_counter_is_hyp(vcpu, idx)) return BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0; return BEHAVE_HANDLE_LOCALLY; } #define CCC(id, fn) \ [id - __COMPLEX_CONDITIONS__] = fn static const complex_condition_check ccc[] = { CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten), CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten), CCC(CGT_CNTHCTL_EL1NVPCT, check_cnthctl_el1nvpct), CCC(CGT_CNTHCTL_EL1NVVCT, check_cnthctl_el1nvvct), CCC(CGT_CPTR_TTA, check_cptr_tta), CCC(CGT_MDCR_HPMN, check_mdcr_hpmn), }; /* * Bit assignment for the trap controls. We use a 64bit word with the * following layout for each trapped sysreg: * * [9:0] enum cgt_group_id (10 bits) * [13:10] enum fgt_group_id (4 bits) * [19:14] bit number in the FGT register (6 bits) * [20] trap polarity (1 bit) * [25:21] FG filter (5 bits) * [35:26] Main SysReg table index (10 bits) * [62:36] Unused (27 bits) * [63] RES0 - Must be zero, as lost on insertion in the xarray */ #define TC_CGT_BITS 10 #define TC_FGT_BITS 4 #define TC_FGF_BITS 5 #define TC_SRI_BITS 10 union trap_config { u64 val; struct { unsigned long cgt:TC_CGT_BITS; /* Coarse Grained Trap id */ unsigned long fgt:TC_FGT_BITS; /* Fine Grained Trap id */ unsigned long bit:6; /* Bit number */ unsigned long pol:1; /* Polarity */ unsigned long fgf:TC_FGF_BITS; /* Fine Grained Filter */ unsigned long sri:TC_SRI_BITS; /* SysReg Index */ unsigned long unused:27; /* Unused, should be zero */ unsigned long mbz:1; /* Must Be Zero */ }; }; struct encoding_to_trap_config { const u32 encoding; const u32 end; const union trap_config tc; const unsigned int line; }; /* * WARNING: using ranges is a treacherous endeavour, as sysregs that * are part of an architectural range are not necessarily contiguous * in the [Op0,Op1,CRn,CRm,Ops] space. Tread carefully. */ #define SR_RANGE_TRAP(sr_start, sr_end, trap_id) \ { \ .encoding = sr_start, \ .end = sr_end, \ .tc = { \ .cgt = trap_id, \ }, \ .line = __LINE__, \ } #define SR_TRAP(sr, trap_id) SR_RANGE_TRAP(sr, sr, trap_id) /* * Map encoding to trap bits for exception reported with EC=0x18. * These must only be evaluated when running a nested hypervisor, but * that the current context is not a hypervisor context. When the * trapped access matches one of the trap controls, the exception is * re-injected in the nested hypervisor. */ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_REVIDR_EL1, CGT_HCR_TID1), SR_TRAP(SYS_AIDR_EL1, CGT_HCR_TID1), SR_TRAP(SYS_SMIDR_EL1, CGT_HCR_TID1), SR_TRAP(SYS_CTR_EL0, CGT_HCR_TID2), SR_TRAP(SYS_CCSIDR_EL1, CGT_HCR_TID2_TID4), SR_TRAP(SYS_CCSIDR2_EL1, CGT_HCR_TID2_TID4), SR_TRAP(SYS_CLIDR_EL1, CGT_HCR_TID2_TID4), SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4), SR_RANGE_TRAP(SYS_ID_PFR0_EL1, sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3), SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0), sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0), sys_reg(3, 1, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 2, 11, 0, 0), sys_reg(3, 2, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 3, 11, 0, 0), sys_reg(3, 3, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 4, 11, 0, 0), sys_reg(3, 4, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 5, 11, 0, 0), sys_reg(3, 5, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 6, 11, 0, 0), sys_reg(3, 6, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 7, 11, 0, 0), sys_reg(3, 7, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 0, 15, 0, 0), sys_reg(3, 0, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 1, 15, 0, 0), sys_reg(3, 1, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 2, 15, 0, 0), sys_reg(3, 2, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 3, 15, 0, 0), sys_reg(3, 3, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 4, 15, 0, 0), sys_reg(3, 4, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 5, 15, 0, 0), sys_reg(3, 5, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 6, 15, 0, 0), sys_reg(3, 6, 15, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 7, 15, 0, 0), sys_reg(3, 7, 15, 15, 7), CGT_HCR_TIDCP), SR_TRAP(SYS_ACTLR_EL1, CGT_HCR_TACR), SR_TRAP(SYS_DC_ISW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CISW, CGT_HCR_TSW), SR_TRAP(SYS_DC_IGSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_IGDSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CGSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CGDSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CIGSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CIGDSW, CGT_HCR_TSW), SR_TRAP(SYS_DC_CIVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CVAP, CGT_HCR_TPC), SR_TRAP(SYS_DC_CVADP, CGT_HCR_TPC), SR_TRAP(SYS_DC_IVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CIGVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CIGDVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_IGVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_IGDVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CGVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CGDVAC, CGT_HCR_TPC), SR_TRAP(SYS_DC_CGVAP, CGT_HCR_TPC), SR_TRAP(SYS_DC_CGDVAP, CGT_HCR_TPC), SR_TRAP(SYS_DC_CGVADP, CGT_HCR_TPC), SR_TRAP(SYS_DC_CGDVADP, CGT_HCR_TPC), SR_TRAP(SYS_IC_IVAU, CGT_HCR_TPU_TOCU), SR_TRAP(SYS_IC_IALLU, CGT_HCR_TPU_TOCU), SR_TRAP(SYS_IC_IALLUIS, CGT_HCR_TPU_TICAB), SR_TRAP(SYS_DC_CVAU, CGT_HCR_TPU_TOCU), SR_TRAP(OP_TLBI_RVAE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVAAE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVALE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVAALE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VMALLE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VAE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_ASIDE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VAAE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VALE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VAALE1, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVAE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVAAE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVALE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVAALE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VMALLE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VAE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_ASIDE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VAAE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VALE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_VAALE1NXS, CGT_HCR_TTLB), SR_TRAP(OP_TLBI_RVAE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVAAE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVALE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVAALE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VMALLE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VAE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_ASIDE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VAAE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VALE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VAALE1IS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVAE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVAAE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVALE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_RVAALE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VMALLE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VAE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_ASIDE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VAAE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VALE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VAALE1ISNXS, CGT_HCR_TTLB_TTLBIS), SR_TRAP(OP_TLBI_VMALLE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VAE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_ASIDE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VAAE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VALE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VAALE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVAE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVAAE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVALE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVAALE1OS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VMALLE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VAE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_ASIDE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VAAE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VALE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_VAALE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVAE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVAAE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVALE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(OP_TLBI_RVAALE1OSNXS, CGT_HCR_TTLB_TTLBOS), SR_TRAP(SYS_SCTLR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_SCTLR2_EL1, CGT_HCR_TVM_TRVM_HCRX_SCTLR2En), SR_TRAP(SYS_TTBR0_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_TTBR1_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_TCR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_ESR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_FAR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_AFSR0_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_AFSR1_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_MAIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_AMAIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_CONTEXTIDR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_PIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_PIRE0_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_POR_EL0, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_POR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_TCR2_EL1, CGT_HCR_TVM_TRVM_HCRX_TCR2En), SR_TRAP(SYS_DC_ZVA, CGT_HCR_TDZ), SR_TRAP(SYS_DC_GVA, CGT_HCR_TDZ), SR_TRAP(SYS_DC_GZVA, CGT_HCR_TDZ), SR_TRAP(SYS_LORSA_EL1, CGT_HCR_TLOR), SR_TRAP(SYS_LOREA_EL1, CGT_HCR_TLOR), SR_TRAP(SYS_LORN_EL1, CGT_HCR_TLOR), SR_TRAP(SYS_LORC_EL1, CGT_HCR_TLOR), SR_TRAP(SYS_LORID_EL1, CGT_HCR_TLOR), SR_TRAP(SYS_ERRIDR_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERRSELR_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXADDR_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXCTLR_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXFR_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXMISC0_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXMISC1_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXMISC2_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXMISC3_EL1, CGT_HCR_TERR), SR_TRAP(SYS_ERXSTATUS_EL1, CGT_HCR_TERR), SR_TRAP(SYS_APIAKEYLO_EL1, CGT_HCR_APK), SR_TRAP(SYS_APIAKEYHI_EL1, CGT_HCR_APK), SR_TRAP(SYS_APIBKEYLO_EL1, CGT_HCR_APK), SR_TRAP(SYS_APIBKEYHI_EL1, CGT_HCR_APK), SR_TRAP(SYS_APDAKEYLO_EL1, CGT_HCR_APK), SR_TRAP(SYS_APDAKEYHI_EL1, CGT_HCR_APK), SR_TRAP(SYS_APDBKEYLO_EL1, CGT_HCR_APK), SR_TRAP(SYS_APDBKEYHI_EL1, CGT_HCR_APK), SR_TRAP(SYS_APGAKEYLO_EL1, CGT_HCR_APK), SR_TRAP(SYS_APGAKEYHI_EL1, CGT_HCR_APK), /* All _EL2 registers */ SR_TRAP(SYS_BRBCR_EL2, CGT_HCR_NV), SR_TRAP(SYS_VPIDR_EL2, CGT_HCR_NV), SR_TRAP(SYS_VMPIDR_EL2, CGT_HCR_NV), SR_TRAP(SYS_SCTLR_EL2, CGT_HCR_NV), SR_TRAP(SYS_ACTLR_EL2, CGT_HCR_NV), SR_TRAP(SYS_SCTLR2_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_HCR_EL2, SYS_HCRX_EL2, CGT_HCR_NV), SR_TRAP(SYS_SMPRIMAP_EL2, CGT_HCR_NV), SR_TRAP(SYS_SMCR_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_TTBR0_EL2, SYS_TCR2_EL2, CGT_HCR_NV), SR_TRAP(SYS_VTTBR_EL2, CGT_HCR_NV), SR_TRAP(SYS_VTCR_EL2, CGT_HCR_NV), SR_TRAP(SYS_VNCR_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_HDFGRTR_EL2, SYS_HAFGRTR_EL2, CGT_HCR_NV), /* Skip the SP_EL1 encoding... */ SR_TRAP(SYS_SPSR_EL2, CGT_HCR_NV), SR_TRAP(SYS_ELR_EL2, CGT_HCR_NV), /* Skip SPSR_irq, SPSR_abt, SPSR_und, SPSR_fiq */ SR_TRAP(SYS_AFSR0_EL2, CGT_HCR_NV), SR_TRAP(SYS_AFSR1_EL2, CGT_HCR_NV), SR_TRAP(SYS_ESR_EL2, CGT_HCR_NV), SR_TRAP(SYS_VSESR_EL2, CGT_HCR_NV), SR_TRAP(SYS_TFSR_EL2, CGT_HCR_NV), SR_TRAP(SYS_FAR_EL2, CGT_HCR_NV), SR_TRAP(SYS_HPFAR_EL2, CGT_HCR_NV), SR_TRAP(SYS_PMSCR_EL2, CGT_HCR_NV), SR_TRAP(SYS_MAIR_EL2, CGT_HCR_NV), SR_TRAP(SYS_AMAIR_EL2, CGT_HCR_NV), SR_TRAP(SYS_MPAMHCR_EL2, CGT_HCR_NV), SR_TRAP(SYS_MPAMVPMV_EL2, CGT_HCR_NV), SR_TRAP(SYS_MPAM2_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_MPAMVPM0_EL2, SYS_MPAMVPM7_EL2, CGT_HCR_NV), /* * Note that the spec. describes a group of MEC registers * whose access should not trap, therefore skip the following: * MECID_A0_EL2, MECID_A1_EL2, MECID_P0_EL2, * MECID_P1_EL2, MECIDR_EL2, VMECID_A_EL2, * VMECID_P_EL2. */ SR_RANGE_TRAP(SYS_VBAR_EL2, SYS_RMR_EL2, CGT_HCR_NV), SR_TRAP(SYS_VDISR_EL2, CGT_HCR_NV), /* ICH_AP0R<m>_EL2 */ SR_RANGE_TRAP(SYS_ICH_AP0R0_EL2, SYS_ICH_AP0R3_EL2, CGT_HCR_NV), /* ICH_AP1R<m>_EL2 */ SR_RANGE_TRAP(SYS_ICH_AP1R0_EL2, SYS_ICH_AP1R3_EL2, CGT_HCR_NV), SR_TRAP(SYS_ICC_SRE_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_ICH_HCR_EL2, SYS_ICH_EISR_EL2, CGT_HCR_NV), SR_TRAP(SYS_ICH_ELRSR_EL2, CGT_HCR_NV), SR_TRAP(SYS_ICH_VMCR_EL2, CGT_HCR_NV), /* ICH_LR<m>_EL2 */ SR_RANGE_TRAP(SYS_ICH_LR0_EL2, SYS_ICH_LR15_EL2, CGT_HCR_NV), SR_TRAP(SYS_CONTEXTIDR_EL2, CGT_HCR_NV), SR_TRAP(SYS_TPIDR_EL2, CGT_HCR_NV), SR_TRAP(SYS_SCXTNUM_EL2, CGT_HCR_NV), /* AMEVCNTVOFF0<n>_EL2, AMEVCNTVOFF1<n>_EL2 */ SR_RANGE_TRAP(SYS_AMEVCNTVOFF0n_EL2(0), SYS_AMEVCNTVOFF1n_EL2(15), CGT_HCR_NV), /* CNT*_EL2 */ SR_TRAP(SYS_CNTVOFF_EL2, CGT_HCR_NV), SR_TRAP(SYS_CNTPOFF_EL2, CGT_HCR_NV), SR_TRAP(SYS_CNTHCTL_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_CNTHP_TVAL_EL2, SYS_CNTHP_CVAL_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_CNTHV_TVAL_EL2, SYS_CNTHV_CVAL_EL2, CGT_HCR_NV), /* All _EL02, _EL12 registers up to CNTKCTL_EL12*/ SR_RANGE_TRAP(sys_reg(3, 5, 0, 0, 0), sys_reg(3, 5, 10, 15, 7), CGT_HCR_NV), SR_RANGE_TRAP(sys_reg(3, 5, 12, 0, 0), sys_reg(3, 5, 14, 1, 0), CGT_HCR_NV), SR_TRAP(SYS_CNTP_CTL_EL02, CGT_CNTHCTL_EL1NVPCT), SR_TRAP(SYS_CNTP_CVAL_EL02, CGT_CNTHCTL_EL1NVPCT), SR_TRAP(SYS_CNTV_CTL_EL02, CGT_CNTHCTL_EL1NVVCT), SR_TRAP(SYS_CNTV_CVAL_EL02, CGT_CNTHCTL_EL1NVVCT), SR_TRAP(OP_AT_S1E2R, CGT_HCR_NV), SR_TRAP(OP_AT_S1E2W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E1R, CGT_HCR_NV), SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV), SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2LE1, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVAE2, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVALE2, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE2, CGT_HCR_NV), SR_TRAP(OP_TLBI_VAE2, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE1, CGT_HCR_NV), SR_TRAP(OP_TLBI_VALE2, CGT_HCR_NV), SR_TRAP(OP_TLBI_VMALLS12E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2LE1NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVAE2NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVALE2NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE2NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VAE2NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE1NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VALE2NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VMALLS12E1NXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2LE1IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVAE2IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVALE2IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE2IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VAE2IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE1IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VALE2IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VMALLS12E1IS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2LE1ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVAE2ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVALE2ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE2ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VAE2ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE1ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VALE2ISNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VMALLS12E1ISNXS,CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE2OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VAE2OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE1OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VALE2OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VMALLS12E1OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2LE1OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVAE2OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVALE2OS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE2OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VAE2OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_ALLE1OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VALE2OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_VMALLS12E1OSNXS,CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2LE1OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVAE2OSNXS, CGT_HCR_NV), SR_TRAP(OP_TLBI_RVALE2OSNXS, CGT_HCR_NV), SR_TRAP(OP_CPP_RCTX, CGT_HCR_NV), SR_TRAP(OP_DVP_RCTX, CGT_HCR_NV), SR_TRAP(OP_CFP_RCTX, CGT_HCR_NV), SR_TRAP(SYS_SP_EL1, CGT_HCR_NV_nNV2), SR_TRAP(SYS_VBAR_EL1, CGT_HCR_NV1_nNV2), SR_TRAP(SYS_ELR_EL1, CGT_HCR_NV1_nNV2), SR_TRAP(SYS_SPSR_EL1, CGT_HCR_NV1_nNV2), SR_TRAP(SYS_SCXTNUM_EL1, CGT_HCR_NV1_nNV2_ENSCXT), SR_TRAP(SYS_SCXTNUM_EL0, CGT_HCR_ENSCXT), SR_TRAP(OP_AT_S1E1R, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1W, CGT_HCR_AT), SR_TRAP(OP_AT_S1E0R, CGT_HCR_AT), SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT), SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_PMCR_EL0, CGT_MDCR_TPM_TPMCR), SR_TRAP(SYS_PMCNTENSET_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCNTENCLR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMOVSSET_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMOVSCLR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCEID0_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCEID1_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMSWINC_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMSELR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMCCNTR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMUSERENR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMINTENSET_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMINTENCLR_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMMIR_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMCCFILTR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_MDCCSR_EL0, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_MDCCINT_EL1, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_OSDTRRX_EL1, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_OSDTRTX_EL1, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_DBGDTR_EL0, CGT_MDCR_TDCC_TDE_TDA), /* * Also covers DBGDTRRX_EL0, which has the same encoding as * SYS_DBGDTRTX_EL0... */ SR_TRAP(SYS_DBGDTRTX_EL0, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_MDSCR_EL1, CGT_MDCR_TDE_TDA), SR_TRAP(SYS_OSECCR_EL1, CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(0), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(1), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(2), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(3), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(4), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(5), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(6), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(7), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(8), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(9), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(10), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(11), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(12), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(13), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(14), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBVRn_EL1(15), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(0), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(1), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(2), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(3), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(4), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(5), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(6), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(7), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(8), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(9), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(10), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(11), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(12), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(13), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(14), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGBCRn_EL1(15), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(0), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(1), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(2), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(3), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(4), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(5), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(6), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(7), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(8), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(9), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(10), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(11), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(12), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(13), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(14), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWVRn_EL1(15), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(0), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(1), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(2), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(3), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(4), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(5), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(6), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(7), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(8), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(9), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(10), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(11), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(12), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(13), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGWCRn_EL1(14), CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGCLAIMSET_EL1, CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGCLAIMCLR_EL1, CGT_MDCR_TDE_TDA), SR_TRAP(SYS_DBGAUTHSTATUS_EL1, CGT_MDCR_TDE_TDA), SR_TRAP(SYS_OSLAR_EL1, CGT_MDCR_TDE_TDOSA), SR_TRAP(SYS_OSLSR_EL1, CGT_MDCR_TDE_TDOSA), SR_TRAP(SYS_OSDLR_EL1, CGT_MDCR_TDE_TDOSA), SR_TRAP(SYS_DBGPRCR_EL1, CGT_MDCR_TDE_TDOSA), SR_TRAP(SYS_MDRAR_EL1, CGT_MDCR_TDE_TDRA), SR_TRAP(SYS_PMBLIMITR_EL1, CGT_MDCR_E2PB), SR_TRAP(SYS_PMBPTR_EL1, CGT_MDCR_E2PB), SR_TRAP(SYS_PMBSR_EL1, CGT_MDCR_E2PB), SR_TRAP(SYS_PMSCR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSEVFR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSFCR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSICR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSIDR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSIRR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSLATFR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_PMSNEVFR_EL1, CGT_MDCR_TPMS), SR_TRAP(SYS_TRFCR_EL1, CGT_MDCR_TTRF), SR_TRAP(SYS_TRBBASER_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBLIMITR_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBMAR_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBPTR_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBSR_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_TRBTRG_EL1, CGT_MDCR_E2TB), SR_TRAP(SYS_CPACR_EL1, CGT_CPTR_TCPAC), SR_TRAP(SYS_AMUSERENR_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCFGR_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCGCR_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCNTENCLR0_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCNTENCLR1_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCNTENSET0_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCNTENSET1_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMCR_EL0, CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR0_EL0(0), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR0_EL0(1), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR0_EL0(2), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR0_EL0(3), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(0), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(1), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(2), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(3), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(4), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(5), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(6), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(7), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(8), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(9), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(10), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(11), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(12), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(13), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(14), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVCNTR1_EL0(15), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER0_EL0(0), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER0_EL0(1), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER0_EL0(2), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER0_EL0(3), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(0), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(1), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(2), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(3), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(4), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(5), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(6), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(7), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(8), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(9), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(10), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(11), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(12), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(13), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(14), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(15), CGT_CPTR_TAM), /* op0=2, op1=1, and CRn<0b1000 */ SR_RANGE_TRAP(sys_reg(2, 1, 0, 0, 0), sys_reg(2, 1, 7, 15, 7), CGT_CPTR_TTA), SR_TRAP(SYS_CNTP_TVAL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTP_CVAL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN), SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN), SR_TRAP(SYS_CNTV_TVAL_EL0, CGT_CNTHCTL_EL1TVT), SR_TRAP(SYS_CNTV_CVAL_EL0, CGT_CNTHCTL_EL1TVT), SR_TRAP(SYS_CNTV_CTL_EL0, CGT_CNTHCTL_EL1TVT), SR_TRAP(SYS_CNTVCT_EL0, CGT_CNTHCTL_EL1TVCT), SR_TRAP(SYS_CNTVCTSS_EL0, CGT_CNTHCTL_EL1TVCT), SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM), /* * IMPDEF choice: * We treat ICC_SRE_EL2.{SRE,Enable) and ICV_SRE_EL1.SRE as * RAO/WI. We therefore never consider ICC_SRE_EL2.Enable for * ICC_SRE_EL1 access, and always handle it locally. */ SR_TRAP(SYS_ICC_AP0R0_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_AP0R1_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_AP0R2_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_AP0R3_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_AP1R0_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_AP1R1_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_AP1R2_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_AP1R3_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_BPR0_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_BPR1_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_CTLR_EL1, CGT_ICH_HCR_TC), SR_TRAP(SYS_ICC_DIR_EL1, CGT_ICH_HCR_TC_TDIR), SR_TRAP(SYS_ICC_EOIR0_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_EOIR1_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_HPPIR0_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_HPPIR1_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_IAR0_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_IAR1_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_IGRPEN0_EL1, CGT_ICH_HCR_TALL0), SR_TRAP(SYS_ICC_IGRPEN1_EL1, CGT_ICH_HCR_TALL1), SR_TRAP(SYS_ICC_PMR_EL1, CGT_ICH_HCR_TC), SR_TRAP(SYS_ICC_RPR_EL1, CGT_ICH_HCR_TC), }; static DEFINE_XARRAY(sr_forward_xa); enum fg_filter_id { __NO_FGF__, HCRX_FGTnXS, /* Must be last */ __NR_FG_FILTER_IDS__ }; #define __FGT(g, b, p, f) \ { \ .fgt = g ## _GROUP, \ .bit = g ## _EL2_ ## b ## _SHIFT, \ .pol = p, \ .fgf = f, \ } #define FGT(g, b, p) __FGT(g, b, p, __NO_FGF__) /* * See the warning next to SR_RANGE_TRAP(), and apply the same * level of caution. */ #define SR_FGF_RANGE(sr, e, g, b, p, f) \ { \ .encoding = sr, \ .end = e, \ .tc = __FGT(g, b, p, f), \ .line = __LINE__, \ } #define SR_FGF(sr, g, b, p, f) SR_FGF_RANGE(sr, sr, g, b, p, f) #define SR_FGT(sr, g, b, p) SR_FGF_RANGE(sr, sr, g, b, p, __NO_FGF__) #define SR_FGT_RANGE(sr, end, g, b, p) \ SR_FGF_RANGE(sr, end, g, b, p, __NO_FGF__) static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { /* HFGRTR_EL2, HFGWTR_EL2 */ SR_FGT(SYS_AMAIR2_EL1, HFGRTR, nAMAIR2_EL1, 0), SR_FGT(SYS_MAIR2_EL1, HFGRTR, nMAIR2_EL1, 0), SR_FGT(SYS_S2POR_EL1, HFGRTR, nS2POR_EL1, 0), SR_FGT(SYS_POR_EL1, HFGRTR, nPOR_EL1, 0), SR_FGT(SYS_POR_EL0, HFGRTR, nPOR_EL0, 0), SR_FGT(SYS_PIR_EL1, HFGRTR, nPIR_EL1, 0), SR_FGT(SYS_PIRE0_EL1, HFGRTR, nPIRE0_EL1, 0), SR_FGT(SYS_RCWMASK_EL1, HFGRTR, nRCWMASK_EL1, 0), SR_FGT(SYS_TPIDR2_EL0, HFGRTR, nTPIDR2_EL0, 0), SR_FGT(SYS_SMPRI_EL1, HFGRTR, nSMPRI_EL1, 0), SR_FGT(SYS_GCSCR_EL1, HFGRTR, nGCS_EL1, 0), SR_FGT(SYS_GCSPR_EL1, HFGRTR, nGCS_EL1, 0), SR_FGT(SYS_GCSCRE0_EL1, HFGRTR, nGCS_EL0, 0), SR_FGT(SYS_GCSPR_EL0, HFGRTR, nGCS_EL0, 0), SR_FGT(SYS_ACCDATA_EL1, HFGRTR, nACCDATA_EL1, 0), SR_FGT(SYS_ERXADDR_EL1, HFGRTR, ERXADDR_EL1, 1), SR_FGT(SYS_ERXPFGCDN_EL1, HFGRTR, ERXPFGCDN_EL1, 1), SR_FGT(SYS_ERXPFGCTL_EL1, HFGRTR, ERXPFGCTL_EL1, 1), SR_FGT(SYS_ERXPFGF_EL1, HFGRTR, ERXPFGF_EL1, 1), SR_FGT(SYS_ERXMISC0_EL1, HFGRTR, ERXMISCn_EL1, 1), SR_FGT(SYS_ERXMISC1_EL1, HFGRTR, ERXMISCn_EL1, 1), SR_FGT(SYS_ERXMISC2_EL1, HFGRTR, ERXMISCn_EL1, 1), SR_FGT(SYS_ERXMISC3_EL1, HFGRTR, ERXMISCn_EL1, 1), SR_FGT(SYS_ERXSTATUS_EL1, HFGRTR, ERXSTATUS_EL1, 1), SR_FGT(SYS_ERXCTLR_EL1, HFGRTR, ERXCTLR_EL1, 1), SR_FGT(SYS_ERXFR_EL1, HFGRTR, ERXFR_EL1, 1), SR_FGT(SYS_ERRSELR_EL1, HFGRTR, ERRSELR_EL1, 1), SR_FGT(SYS_ERRIDR_EL1, HFGRTR, ERRIDR_EL1, 1), SR_FGT(SYS_ICC_IGRPEN0_EL1, HFGRTR, ICC_IGRPENn_EL1, 1), SR_FGT(SYS_ICC_IGRPEN1_EL1, HFGRTR, ICC_IGRPENn_EL1, 1), SR_FGT(SYS_VBAR_EL1, HFGRTR, VBAR_EL1, 1), SR_FGT(SYS_TTBR1_EL1, HFGRTR, TTBR1_EL1, 1), SR_FGT(SYS_TTBR0_EL1, HFGRTR, TTBR0_EL1, 1), SR_FGT(SYS_TPIDR_EL0, HFGRTR, TPIDR_EL0, 1), SR_FGT(SYS_TPIDRRO_EL0, HFGRTR, TPIDRRO_EL0, 1), SR_FGT(SYS_TPIDR_EL1, HFGRTR, TPIDR_EL1, 1), SR_FGT(SYS_TCR_EL1, HFGRTR, TCR_EL1, 1), SR_FGT(SYS_TCR2_EL1, HFGRTR, TCR_EL1, 1), SR_FGT(SYS_SCXTNUM_EL0, HFGRTR, SCXTNUM_EL0, 1), SR_FGT(SYS_SCXTNUM_EL1, HFGRTR, SCXTNUM_EL1, 1), SR_FGT(SYS_SCTLR_EL1, HFGRTR, SCTLR_EL1, 1), SR_FGT(SYS_SCTLR2_EL1, HFGRTR, SCTLR_EL1, 1), SR_FGT(SYS_REVIDR_EL1, HFGRTR, REVIDR_EL1, 1), SR_FGT(SYS_PAR_EL1, HFGRTR, PAR_EL1, 1), SR_FGT(SYS_MPIDR_EL1, HFGRTR, MPIDR_EL1, 1), SR_FGT(SYS_MIDR_EL1, HFGRTR, MIDR_EL1, 1), SR_FGT(SYS_MAIR_EL1, HFGRTR, MAIR_EL1, 1), SR_FGT(SYS_LORSA_EL1, HFGRTR, LORSA_EL1, 1), SR_FGT(SYS_LORN_EL1, HFGRTR, LORN_EL1, 1), SR_FGT(SYS_LORID_EL1, HFGRTR, LORID_EL1, 1), SR_FGT(SYS_LOREA_EL1, HFGRTR, LOREA_EL1, 1), SR_FGT(SYS_LORC_EL1, HFGRTR, LORC_EL1, 1), SR_FGT(SYS_ISR_EL1, HFGRTR, ISR_EL1, 1), SR_FGT(SYS_FAR_EL1, HFGRTR, FAR_EL1, 1), SR_FGT(SYS_ESR_EL1, HFGRTR, ESR_EL1, 1), SR_FGT(SYS_DCZID_EL0, HFGRTR, DCZID_EL0, 1), SR_FGT(SYS_CTR_EL0, HFGRTR, CTR_EL0, 1), SR_FGT(SYS_CSSELR_EL1, HFGRTR, CSSELR_EL1, 1), SR_FGT(SYS_CPACR_EL1, HFGRTR, CPACR_EL1, 1), SR_FGT(SYS_CONTEXTIDR_EL1, HFGRTR, CONTEXTIDR_EL1, 1), SR_FGT(SYS_CLIDR_EL1, HFGRTR, CLIDR_EL1, 1), SR_FGT(SYS_CCSIDR_EL1, HFGRTR, CCSIDR_EL1, 1), SR_FGT(SYS_APIBKEYLO_EL1, HFGRTR, APIBKey, 1), SR_FGT(SYS_APIBKEYHI_EL1, HFGRTR, APIBKey, 1), SR_FGT(SYS_APIAKEYLO_EL1, HFGRTR, APIAKey, 1), SR_FGT(SYS_APIAKEYHI_EL1, HFGRTR, APIAKey, 1), SR_FGT(SYS_APGAKEYLO_EL1, HFGRTR, APGAKey, 1), SR_FGT(SYS_APGAKEYHI_EL1, HFGRTR, APGAKey, 1), SR_FGT(SYS_APDBKEYLO_EL1, HFGRTR, APDBKey, 1), SR_FGT(SYS_APDBKEYHI_EL1, HFGRTR, APDBKey, 1), SR_FGT(SYS_APDAKEYLO_EL1, HFGRTR, APDAKey, 1), SR_FGT(SYS_APDAKEYHI_EL1, HFGRTR, APDAKey, 1), SR_FGT(SYS_AMAIR_EL1, HFGRTR, AMAIR_EL1, 1), SR_FGT(SYS_AIDR_EL1, HFGRTR, AIDR_EL1, 1), SR_FGT(SYS_AFSR1_EL1, HFGRTR, AFSR1_EL1, 1), SR_FGT(SYS_AFSR0_EL1, HFGRTR, AFSR0_EL1, 1), /* HFGRTR2_EL2, HFGWTR2_EL2 */ SR_FGT(SYS_ACTLRALIAS_EL1, HFGRTR2, nACTLRALIAS_EL1, 0), SR_FGT(SYS_ACTLRMASK_EL1, HFGRTR2, nACTLRMASK_EL1, 0), SR_FGT(SYS_CPACRALIAS_EL1, HFGRTR2, nCPACRALIAS_EL1, 0), SR_FGT(SYS_CPACRMASK_EL1, HFGRTR2, nCPACRMASK_EL1, 0), SR_FGT(SYS_PFAR_EL1, HFGRTR2, nPFAR_EL1, 0), SR_FGT(SYS_RCWSMASK_EL1, HFGRTR2, nRCWSMASK_EL1, 0), SR_FGT(SYS_SCTLR2ALIAS_EL1, HFGRTR2, nSCTLRALIAS2_EL1, 0), SR_FGT(SYS_SCTLR2MASK_EL1, HFGRTR2, nSCTLR2MASK_EL1, 0), SR_FGT(SYS_SCTLRALIAS_EL1, HFGRTR2, nSCTLRALIAS_EL1, 0), SR_FGT(SYS_SCTLRMASK_EL1, HFGRTR2, nSCTLRMASK_EL1, 0), SR_FGT(SYS_TCR2ALIAS_EL1, HFGRTR2, nTCR2ALIAS_EL1, 0), SR_FGT(SYS_TCR2MASK_EL1, HFGRTR2, nTCR2MASK_EL1, 0), SR_FGT(SYS_TCRALIAS_EL1, HFGRTR2, nTCRALIAS_EL1, 0), SR_FGT(SYS_TCRMASK_EL1, HFGRTR2, nTCRMASK_EL1, 0), SR_FGT(SYS_ERXGSR_EL1, HFGRTR2, nERXGSR_EL1, 0), /* HFGITR_EL2 */ SR_FGT(OP_AT_S1E1A, HFGITR, ATS1E1A, 1), SR_FGT(OP_COSP_RCTX, HFGITR, COSPRCTX, 1), SR_FGT(OP_GCSPUSHX, HFGITR, nGCSEPP, 0), SR_FGT(OP_GCSPOPX, HFGITR, nGCSEPP, 0), SR_FGT(OP_GCSPUSHM, HFGITR, nGCSPUSHM_EL1, 0), SR_FGT(OP_BRB_IALL, HFGITR, nBRBIALL, 0), SR_FGT(OP_BRB_INJ, HFGITR, nBRBINJ, 0), SR_FGT(SYS_DC_CVAC, HFGITR, DCCVAC, 1), SR_FGT(SYS_DC_CGVAC, HFGITR, DCCVAC, 1), SR_FGT(SYS_DC_CGDVAC, HFGITR, DCCVAC, 1), SR_FGT(OP_CPP_RCTX, HFGITR, CPPRCTX, 1), SR_FGT(OP_DVP_RCTX, HFGITR, DVPRCTX, 1), SR_FGT(OP_CFP_RCTX, HFGITR, CFPRCTX, 1), SR_FGT(OP_TLBI_VAALE1, HFGITR, TLBIVAALE1, 1), SR_FGT(OP_TLBI_VALE1, HFGITR, TLBIVALE1, 1), SR_FGT(OP_TLBI_VAAE1, HFGITR, TLBIVAAE1, 1), SR_FGT(OP_TLBI_ASIDE1, HFGITR, TLBIASIDE1, 1), SR_FGT(OP_TLBI_VAE1, HFGITR, TLBIVAE1, 1), SR_FGT(OP_TLBI_VMALLE1, HFGITR, TLBIVMALLE1, 1), SR_FGT(OP_TLBI_RVAALE1, HFGITR, TLBIRVAALE1, 1), SR_FGT(OP_TLBI_RVALE1, HFGITR, TLBIRVALE1, 1), SR_FGT(OP_TLBI_RVAAE1, HFGITR, TLBIRVAAE1, 1), SR_FGT(OP_TLBI_RVAE1, HFGITR, TLBIRVAE1, 1), SR_FGT(OP_TLBI_RVAALE1IS, HFGITR, TLBIRVAALE1IS, 1), SR_FGT(OP_TLBI_RVALE1IS, HFGITR, TLBIRVALE1IS, 1), SR_FGT(OP_TLBI_RVAAE1IS, HFGITR, TLBIRVAAE1IS, 1), SR_FGT(OP_TLBI_RVAE1IS, HFGITR, TLBIRVAE1IS, 1), SR_FGT(OP_TLBI_VAALE1IS, HFGITR, TLBIVAALE1IS, 1), SR_FGT(OP_TLBI_VALE1IS, HFGITR, TLBIVALE1IS, 1), SR_FGT(OP_TLBI_VAAE1IS, HFGITR, TLBIVAAE1IS, 1), SR_FGT(OP_TLBI_ASIDE1IS, HFGITR, TLBIASIDE1IS, 1), SR_FGT(OP_TLBI_VAE1IS, HFGITR, TLBIVAE1IS, 1), SR_FGT(OP_TLBI_VMALLE1IS, HFGITR, TLBIVMALLE1IS, 1), SR_FGT(OP_TLBI_RVAALE1OS, HFGITR, TLBIRVAALE1OS, 1), SR_FGT(OP_TLBI_RVALE1OS, HFGITR, TLBIRVALE1OS, 1), SR_FGT(OP_TLBI_RVAAE1OS, HFGITR, TLBIRVAAE1OS, 1), SR_FGT(OP_TLBI_RVAE1OS, HFGITR, TLBIRVAE1OS, 1), SR_FGT(OP_TLBI_VAALE1OS, HFGITR, TLBIVAALE1OS, 1), SR_FGT(OP_TLBI_VALE1OS, HFGITR, TLBIVALE1OS, 1), SR_FGT(OP_TLBI_VAAE1OS, HFGITR, TLBIVAAE1OS, 1), SR_FGT(OP_TLBI_ASIDE1OS, HFGITR, TLBIASIDE1OS, 1), SR_FGT(OP_TLBI_VAE1OS, HFGITR, TLBIVAE1OS, 1), SR_FGT(OP_TLBI_VMALLE1OS, HFGITR, TLBIVMALLE1OS, 1), /* nXS variants must be checked against HCRX_EL2.FGTnXS */ SR_FGF(OP_TLBI_VAALE1NXS, HFGITR, TLBIVAALE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VALE1NXS, HFGITR, TLBIVALE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAAE1NXS, HFGITR, TLBIVAAE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_ASIDE1NXS, HFGITR, TLBIASIDE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAE1NXS, HFGITR, TLBIVAE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VMALLE1NXS, HFGITR, TLBIVMALLE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAALE1NXS, HFGITR, TLBIRVAALE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVALE1NXS, HFGITR, TLBIRVALE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAAE1NXS, HFGITR, TLBIRVAAE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAE1NXS, HFGITR, TLBIRVAE1, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAALE1ISNXS, HFGITR, TLBIRVAALE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVALE1ISNXS, HFGITR, TLBIRVALE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAAE1ISNXS, HFGITR, TLBIRVAAE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAE1ISNXS, HFGITR, TLBIRVAE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAALE1ISNXS, HFGITR, TLBIVAALE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VALE1ISNXS, HFGITR, TLBIVALE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAAE1ISNXS, HFGITR, TLBIVAAE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_ASIDE1ISNXS, HFGITR, TLBIASIDE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAE1ISNXS, HFGITR, TLBIVAE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VMALLE1ISNXS, HFGITR, TLBIVMALLE1IS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAALE1OSNXS, HFGITR, TLBIRVAALE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVALE1OSNXS, HFGITR, TLBIRVALE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAAE1OSNXS, HFGITR, TLBIRVAAE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_RVAE1OSNXS, HFGITR, TLBIRVAE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAALE1OSNXS, HFGITR, TLBIVAALE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VALE1OSNXS, HFGITR, TLBIVALE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAAE1OSNXS, HFGITR, TLBIVAAE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_ASIDE1OSNXS, HFGITR, TLBIASIDE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VAE1OSNXS, HFGITR, TLBIVAE1OS, 1, HCRX_FGTnXS), SR_FGF(OP_TLBI_VMALLE1OSNXS, HFGITR, TLBIVMALLE1OS, 1, HCRX_FGTnXS), SR_FGT(OP_AT_S1E1WP, HFGITR, ATS1E1WP, 1), SR_FGT(OP_AT_S1E1RP, HFGITR, ATS1E1RP, 1), SR_FGT(OP_AT_S1E0W, HFGITR, ATS1E0W, 1), SR_FGT(OP_AT_S1E0R, HFGITR, ATS1E0R, 1), SR_FGT(OP_AT_S1E1W, HFGITR, ATS1E1W, 1), SR_FGT(OP_AT_S1E1R, HFGITR, ATS1E1R, 1), SR_FGT(SYS_DC_ZVA, HFGITR, DCZVA, 1), SR_FGT(SYS_DC_GVA, HFGITR, DCZVA, 1), SR_FGT(SYS_DC_GZVA, HFGITR, DCZVA, 1), SR_FGT(SYS_DC_CIVAC, HFGITR, DCCIVAC, 1), SR_FGT(SYS_DC_CIGVAC, HFGITR, DCCIVAC, 1), SR_FGT(SYS_DC_CIGDVAC, HFGITR, DCCIVAC, 1), SR_FGT(SYS_DC_CVADP, HFGITR, DCCVADP, 1), SR_FGT(SYS_DC_CGVADP, HFGITR, DCCVADP, 1), SR_FGT(SYS_DC_CGDVADP, HFGITR, DCCVADP, 1), SR_FGT(SYS_DC_CVAP, HFGITR, DCCVAP, 1), SR_FGT(SYS_DC_CGVAP, HFGITR, DCCVAP, 1), SR_FGT(SYS_DC_CGDVAP, HFGITR, DCCVAP, 1), SR_FGT(SYS_DC_CVAU, HFGITR, DCCVAU, 1), SR_FGT(SYS_DC_CISW, HFGITR, DCCISW, 1), SR_FGT(SYS_DC_CIGSW, HFGITR, DCCISW, 1), SR_FGT(SYS_DC_CIGDSW, HFGITR, DCCISW, 1), SR_FGT(SYS_DC_CSW, HFGITR, DCCSW, 1), SR_FGT(SYS_DC_CGSW, HFGITR, DCCSW, 1), SR_FGT(SYS_DC_CGDSW, HFGITR, DCCSW, 1), SR_FGT(SYS_DC_ISW, HFGITR, DCISW, 1), SR_FGT(SYS_DC_IGSW, HFGITR, DCISW, 1), SR_FGT(SYS_DC_IGDSW, HFGITR, DCISW, 1), SR_FGT(SYS_DC_IVAC, HFGITR, DCIVAC, 1), SR_FGT(SYS_DC_IGVAC, HFGITR, DCIVAC, 1), SR_FGT(SYS_DC_IGDVAC, HFGITR, DCIVAC, 1), SR_FGT(SYS_IC_IVAU, HFGITR, ICIVAU, 1), SR_FGT(SYS_IC_IALLU, HFGITR, ICIALLU, 1), SR_FGT(SYS_IC_IALLUIS, HFGITR, ICIALLUIS, 1), /* HFGITR2_EL2 */ SR_FGT(SYS_DC_CIGDVAPS, HFGITR2, nDCCIVAPS, 0), SR_FGT(SYS_DC_CIVAPS, HFGITR2, nDCCIVAPS, 0), /* HDFGRTR_EL2 */ SR_FGT(SYS_PMBIDR_EL1, HDFGRTR, PMBIDR_EL1, 1), SR_FGT(SYS_PMSNEVFR_EL1, HDFGRTR, nPMSNEVFR_EL1, 0), SR_FGT(SYS_BRBINF_EL1(0), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(1), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(2), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(3), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(4), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(5), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(6), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(7), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(8), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(9), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(10), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(11), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(12), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(13), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(14), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(15), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(16), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(17), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(18), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(19), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(20), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(21), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(22), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(23), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(24), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(25), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(26), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(27), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(28), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(29), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(30), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINF_EL1(31), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBINFINJ_EL1, HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(0), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(1), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(2), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(3), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(4), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(5), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(6), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(7), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(8), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(9), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(10), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(11), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(12), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(13), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(14), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(15), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(16), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(17), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(18), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(19), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(20), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(21), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(22), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(23), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(24), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(25), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(26), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(27), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(28), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(29), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(30), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRC_EL1(31), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBSRCINJ_EL1, HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(0), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(1), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(2), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(3), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(4), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(5), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(6), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(7), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(8), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(9), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(10), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(11), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(12), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(13), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(14), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(15), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(16), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(17), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(18), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(19), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(20), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(21), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(22), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(23), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(24), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(25), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(26), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(27), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(28), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(29), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(30), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGT_EL1(31), HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTGTINJ_EL1, HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBTS_EL1, HDFGRTR, nBRBDATA, 0), SR_FGT(SYS_BRBCR_EL1, HDFGRTR, nBRBCTL, 0), SR_FGT(SYS_BRBFCR_EL1, HDFGRTR, nBRBCTL, 0), SR_FGT(SYS_BRBIDR0_EL1, HDFGRTR, nBRBIDR, 0), SR_FGT(SYS_PMCEID0_EL0, HDFGRTR, PMCEIDn_EL0, 1), SR_FGT(SYS_PMCEID1_EL0, HDFGRTR, PMCEIDn_EL0, 1), SR_FGT(SYS_PMUSERENR_EL0, HDFGRTR, PMUSERENR_EL0, 1), SR_FGT(SYS_TRBTRG_EL1, HDFGRTR, TRBTRG_EL1, 1), SR_FGT(SYS_TRBSR_EL1, HDFGRTR, TRBSR_EL1, 1), SR_FGT(SYS_TRBPTR_EL1, HDFGRTR, TRBPTR_EL1, 1), SR_FGT(SYS_TRBMAR_EL1, HDFGRTR, TRBMAR_EL1, 1), SR_FGT(SYS_TRBLIMITR_EL1, HDFGRTR, TRBLIMITR_EL1, 1), SR_FGT(SYS_TRBIDR_EL1, HDFGRTR, TRBIDR_EL1, 1), SR_FGT(SYS_TRBBASER_EL1, HDFGRTR, TRBBASER_EL1, 1), SR_FGT(SYS_TRCVICTLR, HDFGRTR, TRCVICTLR, 1), SR_FGT(SYS_TRCSTATR, HDFGRTR, TRCSTATR, 1), SR_FGT(SYS_TRCSSCSR(0), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(1), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(2), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(3), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(4), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(5), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(6), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSSCSR(7), HDFGRTR, TRCSSCSRn, 1), SR_FGT(SYS_TRCSEQSTR, HDFGRTR, TRCSEQSTR, 1), SR_FGT(SYS_TRCPRGCTLR, HDFGRTR, TRCPRGCTLR, 1), SR_FGT(SYS_TRCOSLSR, HDFGRTR, TRCOSLSR, 1), SR_FGT(SYS_TRCIMSPEC(0), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(1), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(2), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(3), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(4), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(5), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(6), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCIMSPEC(7), HDFGRTR, TRCIMSPECn, 1), SR_FGT(SYS_TRCDEVARCH, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCDEVID, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR0, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR1, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR2, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR3, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR4, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR5, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR6, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR7, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR8, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR9, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR10, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR11, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR12, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCIDR13, HDFGRTR, TRCID, 1), SR_FGT(SYS_TRCCNTVR(0), HDFGRTR, TRCCNTVRn, 1), SR_FGT(SYS_TRCCNTVR(1), HDFGRTR, TRCCNTVRn, 1), SR_FGT(SYS_TRCCNTVR(2), HDFGRTR, TRCCNTVRn, 1), SR_FGT(SYS_TRCCNTVR(3), HDFGRTR, TRCCNTVRn, 1), SR_FGT(SYS_TRCCLAIMCLR, HDFGRTR, TRCCLAIM, 1), SR_FGT(SYS_TRCCLAIMSET, HDFGRTR, TRCCLAIM, 1), SR_FGT(SYS_TRCAUXCTLR, HDFGRTR, TRCAUXCTLR, 1), SR_FGT(SYS_TRCAUTHSTATUS, HDFGRTR, TRCAUTHSTATUS, 1), SR_FGT(SYS_TRCACATR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(8), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(9), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(10), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(11), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(12), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(13), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(14), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACATR(15), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(8), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(9), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(10), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(11), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(12), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(13), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(14), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCACVR(15), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCBBCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCCCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCCTLR0, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCCTLR1, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCIDCVR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTCTLR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTCTLR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTCTLR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTCTLR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTRLDVR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTRLDVR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTRLDVR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCNTRLDVR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCCONFIGR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCEVENTCTL0R, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCEVENTCTL1R, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCEXTINSELR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCEXTINSELR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCEXTINSELR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCEXTINSELR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCQCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(8), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(9), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(10), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(11), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(12), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(13), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(14), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(15), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(16), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(17), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(18), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(19), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(20), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(21), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(22), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(23), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(24), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(25), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(26), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(27), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(28), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(29), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(30), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSCTLR(31), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCRSR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSEQEVR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSEQEVR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSEQEVR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSEQRSTEVR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSCCR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSSPCICR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSTALLCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCSYNCPR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCTRACEIDR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCTSCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVIIECTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVIPCSSCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVISSCTLR, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCCTLR0, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCCTLR1, HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(0), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(1), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(2), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(3), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(4), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(5), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(6), HDFGRTR, TRC, 1), SR_FGT(SYS_TRCVMIDCVR(7), HDFGRTR, TRC, 1), SR_FGT(SYS_PMSLATFR_EL1, HDFGRTR, PMSLATFR_EL1, 1), SR_FGT(SYS_PMSIRR_EL1, HDFGRTR, PMSIRR_EL1, 1), SR_FGT(SYS_PMSIDR_EL1, HDFGRTR, PMSIDR_EL1, 1), SR_FGT(SYS_PMSICR_EL1, HDFGRTR, PMSICR_EL1, 1), SR_FGT(SYS_PMSFCR_EL1, HDFGRTR, PMSFCR_EL1, 1), SR_FGT(SYS_PMSEVFR_EL1, HDFGRTR, PMSEVFR_EL1, 1), SR_FGT(SYS_PMSCR_EL1, HDFGRTR, PMSCR_EL1, 1), SR_FGT(SYS_PMBSR_EL1, HDFGRTR, PMBSR_EL1, 1), SR_FGT(SYS_PMBPTR_EL1, HDFGRTR, PMBPTR_EL1, 1), SR_FGT(SYS_PMBLIMITR_EL1, HDFGRTR, PMBLIMITR_EL1, 1), SR_FGT(SYS_PMMIR_EL1, HDFGRTR, PMMIR_EL1, 1), SR_FGT(SYS_PMSELR_EL0, HDFGRTR, PMSELR_EL0, 1), SR_FGT(SYS_PMOVSCLR_EL0, HDFGRTR, PMOVS, 1), SR_FGT(SYS_PMOVSSET_EL0, HDFGRTR, PMOVS, 1), SR_FGT(SYS_PMINTENCLR_EL1, HDFGRTR, PMINTEN, 1), SR_FGT(SYS_PMINTENSET_EL1, HDFGRTR, PMINTEN, 1), SR_FGT(SYS_PMCNTENCLR_EL0, HDFGRTR, PMCNTEN, 1), SR_FGT(SYS_PMCNTENSET_EL0, HDFGRTR, PMCNTEN, 1), SR_FGT(SYS_PMCCNTR_EL0, HDFGRTR, PMCCNTR_EL0, 1), SR_FGT(SYS_PMCCFILTR_EL0, HDFGRTR, PMCCFILTR_EL0, 1), SR_FGT_RANGE(SYS_PMEVTYPERn_EL0(0), SYS_PMEVTYPERn_EL0(30), HDFGRTR, PMEVTYPERn_EL0, 1), SR_FGT_RANGE(SYS_PMEVCNTRn_EL0(0), SYS_PMEVCNTRn_EL0(30), HDFGRTR, PMEVCNTRn_EL0, 1), SR_FGT(SYS_OSDLR_EL1, HDFGRTR, OSDLR_EL1, 1), SR_FGT(SYS_OSECCR_EL1, HDFGRTR, OSECCR_EL1, 1), SR_FGT(SYS_OSLSR_EL1, HDFGRTR, OSLSR_EL1, 1), SR_FGT(SYS_DBGPRCR_EL1, HDFGRTR, DBGPRCR_EL1, 1), SR_FGT(SYS_DBGAUTHSTATUS_EL1, HDFGRTR, DBGAUTHSTATUS_EL1, 1), SR_FGT(SYS_DBGCLAIMSET_EL1, HDFGRTR, DBGCLAIM, 1), SR_FGT(SYS_DBGCLAIMCLR_EL1, HDFGRTR, DBGCLAIM, 1), SR_FGT(SYS_MDSCR_EL1, HDFGRTR, MDSCR_EL1, 1), /* * The trap bits capture *64* debug registers per bit, but the * ARM ARM only describes the encoding for the first 16, and * we don't really support more than that anyway. */ SR_FGT(SYS_DBGWVRn_EL1(0), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(1), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(2), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(3), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(4), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(5), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(6), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(7), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(8), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(9), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(10), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(11), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(12), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(13), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(14), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWVRn_EL1(15), HDFGRTR, DBGWVRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(0), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(1), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(2), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(3), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(4), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(5), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(6), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(7), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(8), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(9), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(10), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(11), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(12), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(13), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(14), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGWCRn_EL1(15), HDFGRTR, DBGWCRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(0), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(1), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(2), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(3), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(4), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(5), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(6), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(7), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(8), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(9), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(10), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(11), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(12), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(13), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(14), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBVRn_EL1(15), HDFGRTR, DBGBVRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(0), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(1), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(2), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(3), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(4), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(5), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(6), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(7), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(8), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(9), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(10), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(11), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(12), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(13), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(14), HDFGRTR, DBGBCRn_EL1, 1), SR_FGT(SYS_DBGBCRn_EL1(15), HDFGRTR, DBGBCRn_EL1, 1), /* HDFGRTR2_EL2 */ SR_FGT(SYS_MDSELR_EL1, HDFGRTR2, nMDSELR_EL1, 0), SR_FGT(SYS_MDSTEPOP_EL1, HDFGRTR2, nMDSTEPOP_EL1, 0), SR_FGT(SYS_PMCCNTSVR_EL1, HDFGRTR2, nPMSSDATA, 0), SR_FGT_RANGE(SYS_PMEVCNTSVRn_EL1(0), SYS_PMEVCNTSVRn_EL1(30), HDFGRTR2, nPMSSDATA, 0), SR_FGT(SYS_PMICNTSVR_EL1, HDFGRTR2, nPMSSDATA, 0), SR_FGT(SYS_PMECR_EL1, HDFGRTR2, nPMECR_EL1, 0), SR_FGT(SYS_PMIAR_EL1, HDFGRTR2, nPMIAR_EL1, 0), SR_FGT(SYS_PMICFILTR_EL0, HDFGRTR2, nPMICFILTR_EL0, 0), SR_FGT(SYS_PMICNTR_EL0, HDFGRTR2, nPMICNTR_EL0, 0), SR_FGT(SYS_PMSSCR_EL1, HDFGRTR2, nPMSSCR_EL1, 0), SR_FGT(SYS_PMUACR_EL1, HDFGRTR2, nPMUACR_EL1, 0), SR_FGT(SYS_SPMACCESSR_EL1, HDFGRTR2, nSPMACCESSR_EL1, 0), SR_FGT(SYS_SPMCFGR_EL1, HDFGRTR2, nSPMID, 0), SR_FGT(SYS_SPMDEVARCH_EL1, HDFGRTR2, nSPMID, 0), SR_FGT(SYS_SPMCGCRn_EL1(0), HDFGRTR2, nSPMID, 0), SR_FGT(SYS_SPMCGCRn_EL1(1), HDFGRTR2, nSPMID, 0), SR_FGT(SYS_SPMIIDR_EL1, HDFGRTR2, nSPMID, 0), SR_FGT(SYS_SPMCNTENCLR_EL0, HDFGRTR2, nSPMCNTEN, 0), SR_FGT(SYS_SPMCNTENSET_EL0, HDFGRTR2, nSPMCNTEN, 0), SR_FGT(SYS_SPMCR_EL0, HDFGRTR2, nSPMCR_EL0, 0), SR_FGT(SYS_SPMDEVAFF_EL1, HDFGRTR2, nSPMDEVAFF_EL1, 0), /* * We have up to 64 of these registers in ranges of 16, banked via * SPMSELR_EL0.BANK. We're only concerned with the accessors here, * not the architectural registers. */ SR_FGT_RANGE(SYS_SPMEVCNTRn_EL0(0), SYS_SPMEVCNTRn_EL0(15), HDFGRTR2, nSPMEVCNTRn_EL0, 0), SR_FGT_RANGE(SYS_SPMEVFILT2Rn_EL0(0), SYS_SPMEVFILT2Rn_EL0(15), HDFGRTR2, nSPMEVTYPERn_EL0, 0), SR_FGT_RANGE(SYS_SPMEVFILTRn_EL0(0), SYS_SPMEVFILTRn_EL0(15), HDFGRTR2, nSPMEVTYPERn_EL0, 0), SR_FGT_RANGE(SYS_SPMEVTYPERn_EL0(0), SYS_SPMEVTYPERn_EL0(15), HDFGRTR2, nSPMEVTYPERn_EL0, 0), SR_FGT(SYS_SPMINTENCLR_EL1, HDFGRTR2, nSPMINTEN, 0), SR_FGT(SYS_SPMINTENSET_EL1, HDFGRTR2, nSPMINTEN, 0), SR_FGT(SYS_SPMOVSCLR_EL0, HDFGRTR2, nSPMOVS, 0), SR_FGT(SYS_SPMOVSSET_EL0, HDFGRTR2, nSPMOVS, 0), SR_FGT(SYS_SPMSCR_EL1, HDFGRTR2, nSPMSCR_EL1, 0), SR_FGT(SYS_SPMSELR_EL0, HDFGRTR2, nSPMSELR_EL0, 0), SR_FGT(SYS_TRCITECR_EL1, HDFGRTR2, nTRCITECR_EL1, 0), SR_FGT(SYS_PMBMAR_EL1, HDFGRTR2, nPMBMAR_EL1, 0), SR_FGT(SYS_PMSDSFR_EL1, HDFGRTR2, nPMSDSFR_EL1, 0), SR_FGT(SYS_TRBMPAM_EL1, HDFGRTR2, nTRBMPAM_EL1, 0), /* * HDFGWTR_EL2 * * Although HDFGRTR_EL2 and HDFGWTR_EL2 registers largely * overlap in their bit assignment, there are a number of bits * that are RES0 on one side, and an actual trap bit on the * other. The policy chosen here is to describe all the * read-side mappings, and only the write-side mappings that * differ from the read side, and the trap handler will pick * the correct shadow register based on the access type. * * Same model applies to the FEAT_FGT2 registers. */ SR_FGT(SYS_TRFCR_EL1, HDFGWTR, TRFCR_EL1, 1), SR_FGT(SYS_TRCOSLAR, HDFGWTR, TRCOSLAR, 1), SR_FGT(SYS_PMCR_EL0, HDFGWTR, PMCR_EL0, 1), SR_FGT(SYS_PMSWINC_EL0, HDFGWTR, PMSWINC_EL0, 1), SR_FGT(SYS_OSLAR_EL1, HDFGWTR, OSLAR_EL1, 1), /* HDFGWTR2_EL2 */ SR_FGT(SYS_PMZR_EL0, HDFGWTR2, nPMZR_EL0, 0), SR_FGT(SYS_SPMZR_EL0, HDFGWTR2, nSPMEVCNTRn_EL0, 0), /* * HAFGRTR_EL2 */ SR_FGT(SYS_AMEVTYPER1_EL0(15), HAFGRTR, AMEVTYPER115_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(14), HAFGRTR, AMEVTYPER114_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(13), HAFGRTR, AMEVTYPER113_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(12), HAFGRTR, AMEVTYPER112_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(11), HAFGRTR, AMEVTYPER111_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(10), HAFGRTR, AMEVTYPER110_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(9), HAFGRTR, AMEVTYPER19_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(8), HAFGRTR, AMEVTYPER18_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(7), HAFGRTR, AMEVTYPER17_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(6), HAFGRTR, AMEVTYPER16_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(5), HAFGRTR, AMEVTYPER15_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(4), HAFGRTR, AMEVTYPER14_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(3), HAFGRTR, AMEVTYPER13_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(2), HAFGRTR, AMEVTYPER12_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(1), HAFGRTR, AMEVTYPER11_EL0, 1), SR_FGT(SYS_AMEVTYPER1_EL0(0), HAFGRTR, AMEVTYPER10_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(15), HAFGRTR, AMEVCNTR115_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(14), HAFGRTR, AMEVCNTR114_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(13), HAFGRTR, AMEVCNTR113_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(12), HAFGRTR, AMEVCNTR112_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(11), HAFGRTR, AMEVCNTR111_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(10), HAFGRTR, AMEVCNTR110_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(9), HAFGRTR, AMEVCNTR19_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(8), HAFGRTR, AMEVCNTR18_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(7), HAFGRTR, AMEVCNTR17_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(6), HAFGRTR, AMEVCNTR16_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(5), HAFGRTR, AMEVCNTR15_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(4), HAFGRTR, AMEVCNTR14_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(3), HAFGRTR, AMEVCNTR13_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(2), HAFGRTR, AMEVCNTR12_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(1), HAFGRTR, AMEVCNTR11_EL0, 1), SR_FGT(SYS_AMEVCNTR1_EL0(0), HAFGRTR, AMEVCNTR10_EL0, 1), SR_FGT(SYS_AMCNTENCLR1_EL0, HAFGRTR, AMCNTEN1, 1), SR_FGT(SYS_AMCNTENSET1_EL0, HAFGRTR, AMCNTEN1, 1), SR_FGT(SYS_AMCNTENCLR0_EL0, HAFGRTR, AMCNTEN0, 1), SR_FGT(SYS_AMCNTENSET0_EL0, HAFGRTR, AMCNTEN0, 1), SR_FGT(SYS_AMEVCNTR0_EL0(3), HAFGRTR, AMEVCNTR03_EL0, 1), SR_FGT(SYS_AMEVCNTR0_EL0(2), HAFGRTR, AMEVCNTR02_EL0, 1), SR_FGT(SYS_AMEVCNTR0_EL0(1), HAFGRTR, AMEVCNTR01_EL0, 1), SR_FGT(SYS_AMEVCNTR0_EL0(0), HAFGRTR, AMEVCNTR00_EL0, 1), }; /* * Additional FGTs that do not fire with ESR_EL2.EC==0x18. This table * isn't used for exception routing, but only as a promise that the * trap is handled somewhere else. */ static const union trap_config non_0x18_fgt[] __initconst = { FGT(HFGITR, PSBCSYNC, 1), FGT(HFGITR, nGCSSTR_EL1, 0), FGT(HFGITR, SVC_EL1, 1), FGT(HFGITR, SVC_EL0, 1), FGT(HFGITR, ERET, 1), FGT(HFGITR2, TSBCSYNC, 1), }; static union trap_config get_trap_config(u32 sysreg) { return (union trap_config) { .val = xa_to_value(xa_load(&sr_forward_xa, sysreg)), }; } static __init void print_nv_trap_error(const struct encoding_to_trap_config *tc, const char *type, int err) { kvm_err("%s line %d encoding range " "(%d, %d, %d, %d, %d) - (%d, %d, %d, %d, %d) (err=%d)\n", type, tc->line, sys_reg_Op0(tc->encoding), sys_reg_Op1(tc->encoding), sys_reg_CRn(tc->encoding), sys_reg_CRm(tc->encoding), sys_reg_Op2(tc->encoding), sys_reg_Op0(tc->end), sys_reg_Op1(tc->end), sys_reg_CRn(tc->end), sys_reg_CRm(tc->end), sys_reg_Op2(tc->end), err); } static u32 encoding_next(u32 encoding) { u8 op0, op1, crn, crm, op2; op0 = sys_reg_Op0(encoding); op1 = sys_reg_Op1(encoding); crn = sys_reg_CRn(encoding); crm = sys_reg_CRm(encoding); op2 = sys_reg_Op2(encoding); if (op2 < Op2_mask) return sys_reg(op0, op1, crn, crm, op2 + 1); if (crm < CRm_mask) return sys_reg(op0, op1, crn, crm + 1, 0); if (crn < CRn_mask) return sys_reg(op0, op1, crn + 1, 0, 0); if (op1 < Op1_mask) return sys_reg(op0, op1 + 1, 0, 0, 0); return sys_reg(op0 + 1, 0, 0, 0, 0); } #define FGT_MASKS(__n, __m) \ struct fgt_masks __n = { .str = #__m, .res0 = __m, } FGT_MASKS(hfgrtr_masks, HFGRTR_EL2_RES0); FGT_MASKS(hfgwtr_masks, HFGWTR_EL2_RES0); FGT_MASKS(hfgitr_masks, HFGITR_EL2_RES0); FGT_MASKS(hdfgrtr_masks, HDFGRTR_EL2_RES0); FGT_MASKS(hdfgwtr_masks, HDFGWTR_EL2_RES0); FGT_MASKS(hafgrtr_masks, HAFGRTR_EL2_RES0); FGT_MASKS(hfgrtr2_masks, HFGRTR2_EL2_RES0); FGT_MASKS(hfgwtr2_masks, HFGWTR2_EL2_RES0); FGT_MASKS(hfgitr2_masks, HFGITR2_EL2_RES0); FGT_MASKS(hdfgrtr2_masks, HDFGRTR2_EL2_RES0); FGT_MASKS(hdfgwtr2_masks, HDFGWTR2_EL2_RES0); static __init bool aggregate_fgt(union trap_config tc) { struct fgt_masks *rmasks, *wmasks; switch (tc.fgt) { case HFGRTR_GROUP: rmasks = &hfgrtr_masks; wmasks = &hfgwtr_masks; break; case HDFGRTR_GROUP: rmasks = &hdfgrtr_masks; wmasks = &hdfgwtr_masks; break; case HAFGRTR_GROUP: rmasks = &hafgrtr_masks; wmasks = NULL; break; case HFGITR_GROUP: rmasks = &hfgitr_masks; wmasks = NULL; break; case HFGRTR2_GROUP: rmasks = &hfgrtr2_masks; wmasks = &hfgwtr2_masks; break; case HDFGRTR2_GROUP: rmasks = &hdfgrtr2_masks; wmasks = &hdfgwtr2_masks; break; case HFGITR2_GROUP: rmasks = &hfgitr2_masks; wmasks = NULL; break; } /* * A bit can be reserved in either the R or W register, but * not both. */ if ((BIT(tc.bit) & rmasks->res0) && (!wmasks || (BIT(tc.bit) & wmasks->res0))) return false; if (tc.pol) rmasks->mask |= BIT(tc.bit) & ~rmasks->res0; else rmasks->nmask |= BIT(tc.bit) & ~rmasks->res0; if (wmasks) { if (tc.pol) wmasks->mask |= BIT(tc.bit) & ~wmasks->res0; else wmasks->nmask |= BIT(tc.bit) & ~wmasks->res0; } return true; } static __init int check_fgt_masks(struct fgt_masks *masks) { unsigned long duplicate = masks->mask & masks->nmask; u64 res0 = masks->res0; int ret = 0; if (duplicate) { int i; for_each_set_bit(i, &duplicate, 64) { kvm_err("%s[%d] bit has both polarities\n", masks->str, i); } ret = -EINVAL; } masks->res0 = ~(masks->mask | masks->nmask); if (masks->res0 != res0) kvm_info("Implicit %s = %016llx, expecting %016llx\n", masks->str, masks->res0, res0); return ret; } static __init int check_all_fgt_masks(int ret) { static struct fgt_masks * const masks[] __initconst = { &hfgrtr_masks, &hfgwtr_masks, &hfgitr_masks, &hdfgrtr_masks, &hdfgwtr_masks, &hafgrtr_masks, &hfgrtr2_masks, &hfgwtr2_masks, &hfgitr2_masks, &hdfgrtr2_masks, &hdfgwtr2_masks, }; int err = 0; for (int i = 0; i < ARRAY_SIZE(masks); i++) err |= check_fgt_masks(masks[i]); return ret ?: err; } #define for_each_encoding_in(__x, __s, __e) \ for (u32 __x = __s; __x <= __e; __x = encoding_next(__x)) int __init populate_nv_trap_config(void) { int ret = 0; BUILD_BUG_ON(sizeof(union trap_config) != sizeof(void *)); BUILD_BUG_ON(__NR_CGT_GROUP_IDS__ > BIT(TC_CGT_BITS)); BUILD_BUG_ON(__NR_FGT_GROUP_IDS__ > BIT(TC_FGT_BITS)); BUILD_BUG_ON(__NR_FG_FILTER_IDS__ > BIT(TC_FGF_BITS)); BUILD_BUG_ON(__HCRX_EL2_MASK & __HCRX_EL2_nMASK); for (int i = 0; i < ARRAY_SIZE(encoding_to_cgt); i++) { const struct encoding_to_trap_config *cgt = &encoding_to_cgt[i]; void *prev; if (cgt->tc.val & BIT(63)) { kvm_err("CGT[%d] has MBZ bit set\n", i); ret = -EINVAL; } for_each_encoding_in(enc, cgt->encoding, cgt->end) { prev = xa_store(&sr_forward_xa, enc, xa_mk_value(cgt->tc.val), GFP_KERNEL); if (prev && !xa_is_err(prev)) { ret = -EINVAL; print_nv_trap_error(cgt, "Duplicate CGT", ret); } if (xa_is_err(prev)) { ret = xa_err(prev); print_nv_trap_error(cgt, "Failed CGT insertion", ret); } } } if (__HCRX_EL2_RES0 != HCRX_EL2_RES0) kvm_info("Sanitised HCR_EL2_RES0 = %016llx, expecting %016llx\n", __HCRX_EL2_RES0, HCRX_EL2_RES0); kvm_info("nv: %ld coarse grained trap handlers\n", ARRAY_SIZE(encoding_to_cgt)); if (!cpus_have_final_cap(ARM64_HAS_FGT)) goto check_mcb; for (int i = 0; i < ARRAY_SIZE(encoding_to_fgt); i++) { const struct encoding_to_trap_config *fgt = &encoding_to_fgt[i]; union trap_config tc; void *prev; if (fgt->tc.fgt >= __NR_FGT_GROUP_IDS__) { ret = -EINVAL; print_nv_trap_error(fgt, "Invalid FGT", ret); } for_each_encoding_in(enc, fgt->encoding, fgt->end) { tc = get_trap_config(enc); if (tc.fgt) { ret = -EINVAL; print_nv_trap_error(fgt, "Duplicate FGT", ret); } tc.val |= fgt->tc.val; prev = xa_store(&sr_forward_xa, enc, xa_mk_value(tc.val), GFP_KERNEL); if (xa_is_err(prev)) { ret = xa_err(prev); print_nv_trap_error(fgt, "Failed FGT insertion", ret); } if (!aggregate_fgt(tc)) { ret = -EINVAL; print_nv_trap_error(fgt, "FGT bit is reserved", ret); } } } for (int i = 0; i < ARRAY_SIZE(non_0x18_fgt); i++) { if (!aggregate_fgt(non_0x18_fgt[i])) { ret = -EINVAL; kvm_err("non_0x18_fgt[%d] is reserved\n", i); } } ret = check_all_fgt_masks(ret); kvm_info("nv: %ld fine grained trap handlers\n", ARRAY_SIZE(encoding_to_fgt)); check_mcb: for (int id = __MULTIPLE_CONTROL_BITS__; id < __COMPLEX_CONDITIONS__; id++) { const enum cgt_group_id *cgids; cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__]; for (int i = 0; cgids[i] != __RESERVED__; i++) { if (cgids[i] >= __MULTIPLE_CONTROL_BITS__ && cgids[i] < __COMPLEX_CONDITIONS__) { kvm_err("Recursive MCB %d/%d\n", id, cgids[i]); ret = -EINVAL; } } } if (ret) xa_destroy(&sr_forward_xa); return ret; } int __init populate_sysreg_config(const struct sys_reg_desc *sr, unsigned int idx) { union trap_config tc; u32 encoding; void *ret; /* * 0 is a valid value for the index, but not for the storage. * We'll store (idx+1), so check against an offset'd limit. */ if (idx >= (BIT(TC_SRI_BITS) - 1)) { kvm_err("sysreg %s (%d) out of range\n", sr->name, idx); return -EINVAL; } encoding = sys_reg(sr->Op0, sr->Op1, sr->CRn, sr->CRm, sr->Op2); tc = get_trap_config(encoding); if (tc.sri) { kvm_err("sysreg %s (%d) duplicate entry (%d)\n", sr->name, idx - 1, tc.sri); return -EINVAL; } tc.sri = idx + 1; ret = xa_store(&sr_forward_xa, encoding, xa_mk_value(tc.val), GFP_KERNEL); return xa_err(ret); } static enum trap_behaviour get_behaviour(struct kvm_vcpu *vcpu, const struct trap_bits *tb) { enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY; u64 val; val = __vcpu_sys_reg(vcpu, tb->index); if ((val & tb->mask) == tb->value) b |= tb->behaviour; return b; } static enum trap_behaviour __compute_trap_behaviour(struct kvm_vcpu *vcpu, const enum cgt_group_id id, enum trap_behaviour b) { switch (id) { const enum cgt_group_id *cgids; case __RESERVED__ ... __MULTIPLE_CONTROL_BITS__ - 1: if (likely(id != __RESERVED__)) b |= get_behaviour(vcpu, &coarse_trap_bits[id]); break; case __MULTIPLE_CONTROL_BITS__ ... __COMPLEX_CONDITIONS__ - 1: /* Yes, this is recursive. Don't do anything stupid. */ cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__]; for (int i = 0; cgids[i] != __RESERVED__; i++) b |= __compute_trap_behaviour(vcpu, cgids[i], b); break; default: if (ARRAY_SIZE(ccc)) b |= ccc[id - __COMPLEX_CONDITIONS__](vcpu); break; } return b; } static enum trap_behaviour compute_trap_behaviour(struct kvm_vcpu *vcpu, const union trap_config tc) { enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY; return __compute_trap_behaviour(vcpu, tc.cgt, b); } static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr) { struct kvm_sysreg_masks *masks; /* Only handle the VNCR-backed regs for now */ if (sr < __VNCR_START__) return 0; masks = kvm->arch.sysreg_masks; return masks->mask[sr - __VNCR_START__].res0; } static bool check_fgt_bit(struct kvm_vcpu *vcpu, enum vcpu_sysreg sr, const union trap_config tc) { struct kvm *kvm = vcpu->kvm; u64 val; /* * KVM doesn't know about any FGTs that apply to the host, and hopefully * that'll remain the case. */ if (is_hyp_ctxt(vcpu)) return false; val = __vcpu_sys_reg(vcpu, sr); if (tc.pol) return (val & BIT(tc.bit)); /* * FGTs with negative polarities are an absolute nightmare, as * we need to evaluate the bit in the light of the feature * that defines it. WTF were they thinking? * * So let's check if the bit has been earmarked as RES0, as * this indicates an unimplemented feature. */ if (val & BIT(tc.bit)) return false; return !(kvm_get_sysreg_res0(kvm, sr) & BIT(tc.bit)); } bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) { enum vcpu_sysreg fgtreg; union trap_config tc; enum trap_behaviour b; bool is_read; u32 sysreg; u64 esr; esr = kvm_vcpu_get_esr(vcpu); sysreg = esr_sys64_to_sysreg(esr); is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; tc = get_trap_config(sysreg); /* * A value of 0 for the whole entry means that we know nothing * for this sysreg, and that it cannot be re-injected into the * nested hypervisor. In this situation, let's cut it short. */ if (!tc.val) goto local; /* * If a sysreg can be trapped using a FGT, first check whether we * trap for the purpose of forbidding the feature. In that case, * inject an UNDEF. */ if (tc.fgt != __NO_FGT_GROUP__ && (vcpu->kvm->arch.fgu[tc.fgt] & BIT(tc.bit))) { kvm_inject_undefined(vcpu); return true; } /* * If we're not nesting, immediately return to the caller, with the * sysreg index, should we have it. */ if (!vcpu_has_nv(vcpu)) goto local; /* * There are a few traps that take effect InHost, but are constrained * to EL0. Don't bother with computing the trap behaviour if the vCPU * isn't in EL0. */ if (is_hyp_ctxt(vcpu) && !vcpu_is_host_el0(vcpu)) goto local; switch ((enum fgt_group_id)tc.fgt) { case __NO_FGT_GROUP__: break; case HFGRTR_GROUP: fgtreg = is_read ? HFGRTR_EL2 : HFGWTR_EL2; break; case HDFGRTR_GROUP: fgtreg = is_read ? HDFGRTR_EL2 : HDFGWTR_EL2; break; case HAFGRTR_GROUP: fgtreg = HAFGRTR_EL2; break; case HFGITR_GROUP: fgtreg = HFGITR_EL2; switch (tc.fgf) { u64 tmp; case __NO_FGF__: break; case HCRX_FGTnXS: tmp = __vcpu_sys_reg(vcpu, HCRX_EL2); if (tmp & HCRX_EL2_FGTnXS) tc.fgt = __NO_FGT_GROUP__; } break; case HFGRTR2_GROUP: fgtreg = is_read ? HFGRTR2_EL2 : HFGWTR2_EL2; break; case HDFGRTR2_GROUP: fgtreg = is_read ? HDFGRTR2_EL2 : HDFGWTR2_EL2; break; case HFGITR2_GROUP: fgtreg = HFGITR2_EL2; break; default: /* Something is really wrong, bail out */ WARN_ONCE(1, "Bad FGT group (encoding %08x, config %016llx)\n", sysreg, tc.val); goto local; } if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, fgtreg, tc)) goto inject; b = compute_trap_behaviour(vcpu, tc); if (!(b & BEHAVE_FORWARD_IN_HOST_EL0) && vcpu_is_host_el0(vcpu)) goto local; if (((b & BEHAVE_FORWARD_READ) && is_read) || ((b & BEHAVE_FORWARD_WRITE) && !is_read)) goto inject; local: if (!tc.sri) { struct sys_reg_params params; params = esr_sys64_to_params(esr); /* * Check for the IMPDEF range, as per DDI0487 J.a, * D18.3.2 Reserved encodings for IMPLEMENTATION * DEFINED registers. */ if (!(params.Op0 == 3 && (params.CRn & 0b1011) == 0b1011)) print_sys_reg_msg(&params, "Unsupported guest access at: %lx\n", *vcpu_pc(vcpu)); kvm_inject_undefined(vcpu); return true; } *sr_index = tc.sri - 1; return false; inject: trace_kvm_forward_sysreg_trap(vcpu, sysreg, is_read); kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); return true; } static bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit) { if (is_nested_ctxt(vcpu) && (__vcpu_sys_reg(vcpu, reg) & control_bit)) { kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); return true; } return false; } static bool forward_hcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) { return __forward_traps(vcpu, HCR_EL2, control_bit); } bool forward_smc_trap(struct kvm_vcpu *vcpu) { return forward_hcr_traps(vcpu, HCR_TSC); } static bool forward_mdcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) { return __forward_traps(vcpu, MDCR_EL2, control_bit); } bool forward_debug_exception(struct kvm_vcpu *vcpu) { return forward_mdcr_traps(vcpu, MDCR_EL2_TDE); } static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr) { u64 mode = spsr & PSR_MODE_MASK; /* * Possible causes for an Illegal Exception Return from EL2: * - trying to return to EL3 * - trying to return to an illegal M value * - trying to return to a 32bit EL * - trying to return to EL1 with HCR_EL2.TGE set */ if (mode == PSR_MODE_EL3t || mode == PSR_MODE_EL3h || mode == 0b00001 || (mode & BIT(1)) || (spsr & PSR_MODE32_BIT) || (vcpu_el2_tge_is_set(vcpu) && (mode == PSR_MODE_EL1t || mode == PSR_MODE_EL1h))) { /* * The guest is playing with our nerves. Preserve EL, SP, * masks, flags from the existing PSTATE, and set IL. * The HW will then generate an Illegal State Exception * immediately after ERET. */ spsr = *vcpu_cpsr(vcpu); spsr &= (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT | PSR_MODE_MASK | PSR_MODE32_BIT); spsr |= PSR_IL_BIT; } return spsr; } void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) { u64 spsr, elr, esr; spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2); spsr = kvm_check_illegal_exception_return(vcpu, spsr); /* Check for an ERETAx */ esr = kvm_vcpu_get_esr(vcpu); if (esr_iss_is_eretax(esr) && !kvm_auth_eretax(vcpu, &elr)) { /* * Oh no, ERETAx failed to authenticate. * * If we have FPACCOMBINE and we don't have a pending * Illegal Execution State exception (which has priority * over FPAC), deliver an exception right away. * * Otherwise, let the mangled ELR value trickle down the * ERET handling, and the guest will have a little surprise. */ if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE) && !(spsr & PSR_IL_BIT)) { esr &= ESR_ELx_ERET_ISS_ERETA; esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_FPAC); kvm_inject_nested_sync(vcpu, esr); return; } } preempt_disable(); vcpu_set_flag(vcpu, IN_NESTED_ERET); kvm_arch_vcpu_put(vcpu); if (!esr_iss_is_eretax(esr)) elr = __vcpu_sys_reg(vcpu, ELR_EL2); trace_kvm_nested_eret(vcpu, elr, spsr); *vcpu_pc(vcpu) = elr; *vcpu_cpsr(vcpu) = spsr; kvm_arch_vcpu_load(vcpu, smp_processor_id()); vcpu_clear_flag(vcpu, IN_NESTED_ERET); preempt_enable(); if (kvm_vcpu_has_pmu(vcpu)) kvm_pmu_nested_transition(vcpu); } static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2, enum exception_type type) { trace_kvm_inject_nested_exception(vcpu, esr_el2, type); switch (type) { case except_type_sync: kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC); vcpu_write_sys_reg(vcpu, esr_el2, ESR_EL2); break; case except_type_irq: kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_IRQ); break; case except_type_serror: kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SERR); break; default: WARN_ONCE(1, "Unsupported EL2 exception injection %d\n", type); } } /* * Emulate taking an exception to EL2. * See ARM ARM J8.1.2 AArch64.TakeException() */ static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2, enum exception_type type) { u64 pstate, mode; bool direct_inject; if (!vcpu_has_nv(vcpu)) { kvm_err("Unexpected call to %s for the non-nesting configuration\n", __func__); return -EINVAL; } /* * As for ERET, we can avoid doing too much on the injection path by * checking that we either took the exception from a VHE host * userspace or from vEL2. In these cases, there is no change in * translation regime (or anything else), so let's do as little as * possible. */ pstate = *vcpu_cpsr(vcpu); mode = pstate & (PSR_MODE_MASK | PSR_MODE32_BIT); direct_inject = (mode == PSR_MODE_EL0t && vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)); direct_inject |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t); if (direct_inject) { kvm_inject_el2_exception(vcpu, esr_el2, type); return 1; } preempt_disable(); /* * We may have an exception or PC update in the EL0/EL1 context. * Commit it before entering EL2. */ __kvm_adjust_pc(vcpu); kvm_arch_vcpu_put(vcpu); kvm_inject_el2_exception(vcpu, esr_el2, type); /* * A hard requirement is that a switch between EL1 and EL2 * contexts has to happen between a put/load, so that we can * pick the correct timer and interrupt configuration, among * other things. * * Make sure the exception actually took place before we load * the new context. */ __kvm_adjust_pc(vcpu); kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); if (kvm_vcpu_has_pmu(vcpu)) kvm_pmu_nested_transition(vcpu); return 1; } int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2) { return kvm_inject_nested(vcpu, esr_el2, except_type_sync); } int kvm_inject_nested_irq(struct kvm_vcpu *vcpu) { /* * Do not inject an irq if the: * - Current exception level is EL2, and * - virtual HCR_EL2.TGE == 0 * - virtual HCR_EL2.IMO == 0 * * See Table D1-17 "Physical interrupt target and masking when EL3 is * not implemented and EL2 is implemented" in ARM DDI 0487C.a. */ if (vcpu_is_el2(vcpu) && !vcpu_el2_tge_is_set(vcpu) && !(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_IMO)) return 1; /* esr_el2 value doesn't matter for exits due to irqs. */ return kvm_inject_nested(vcpu, 0, except_type_irq); } int kvm_inject_nested_sea(struct kvm_vcpu *vcpu, bool iabt, u64 addr) { u64 esr = FIELD_PREP(ESR_ELx_EC_MASK, iabt ? ESR_ELx_EC_IABT_LOW : ESR_ELx_EC_DABT_LOW); esr |= ESR_ELx_FSC_EXTABT | ESR_ELx_IL; vcpu_write_sys_reg(vcpu, FAR_EL2, addr); if (__vcpu_sys_reg(vcpu, SCTLR2_EL2) & SCTLR2_EL1_EASE) return kvm_inject_nested(vcpu, esr, except_type_serror); return kvm_inject_nested_sync(vcpu, esr); } int kvm_inject_nested_serror(struct kvm_vcpu *vcpu, u64 esr) { /* * Hardware sets up the EC field when propagating ESR as a result of * vSError injection. Manually populate EC for an emulated SError * exception. */ esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SERROR); return kvm_inject_nested(vcpu, esr, except_type_serror); }
54 54 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOPRIO_H #define IOPRIO_H #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/iocontext.h> #include <uapi/linux/ioprio.h> /* * Default IO priority. */ #define IOPRIO_DEFAULT IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0) /* * Check that a priority value has a valid class. */ static inline bool ioprio_valid(unsigned short ioprio) { unsigned short class = IOPRIO_PRIO_CLASS(ioprio); return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE; } /* * if process has set io priority explicitly, use that. if not, convert * the cpu scheduler nice value to an io priority */ static inline int task_nice_ioprio(struct task_struct *task) { return (task_nice(task) + 20) / 5; } /* * This is for the case where the task hasn't asked for a specific IO class. * Check for idle and rt task process, and return appropriate IO class. */ static inline int task_nice_ioclass(struct task_struct *task) { if (task->policy == SCHED_IDLE) return IOPRIO_CLASS_IDLE; else if (rt_or_dl_task_policy(task)) return IOPRIO_CLASS_RT; else return IOPRIO_CLASS_BE; } #ifdef CONFIG_BLOCK /* * If the task has set an I/O priority, use that. Otherwise, return * the default I/O priority. * * Expected to be called for current task or with task_lock() held to keep * io_context stable. */ static inline int __get_task_ioprio(struct task_struct *p) { struct io_context *ioc = p->io_context; int prio; if (!ioc) return IOPRIO_DEFAULT; if (p != current) lockdep_assert_held(&p->alloc_lock); prio = ioc->ioprio; if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), task_nice_ioprio(p)); return prio; } #else static inline int __get_task_ioprio(struct task_struct *p) { return IOPRIO_DEFAULT; } #endif /* CONFIG_BLOCK */ static inline int get_current_ioprio(void) { return __get_task_ioprio(current); } extern int set_task_ioprio(struct task_struct *task, int ioprio); #ifdef CONFIG_BLOCK extern int ioprio_check_cap(int ioprio); #else static inline int ioprio_check_cap(int ioprio) { return -ENOTBLK; } #endif /* CONFIG_BLOCK */ #endif
95 9 47 37 22 29 19 20 21 8 20 23 23 24 23 4 24 4 15 15 15 15 3 9 1 1 1 1 29 6 23 2 2 77 16 16 16 16 37 36 15 15 13 10 2 35 8 37 1 37 37 16 35 37 37 37 37 16 32 37 37 16 185 185 150 135 15 37 37 37 37 1 1 21 13 1 15 15 11 15 7 9 3 12 12 37 37 37 36 37 21 21 21 23 23 23 20 18 18 22 21 21 20 21 21 21 21 7 7 17 37 37 37 37 40 3 4 34 2 1 1 6 4 4 2 4 2 22 19 19 19 19 17 17 17 4 1 1 2 1 2 2 1 42 42 2 1 1 1 9 1 4 2 2 14 14 1 1 1 1 10 6 4 6 4 3 2 7 5 1 2 1 1 1 2 1 1 19 19 40 40 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2015 Linaro Ltd. * Author: Shannon Zhao <shannon.zhao@linaro.org> */ #include <linux/cpu.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/list.h> #include <linux/perf_event.h> #include <linux/perf/arm_pmu.h> #include <linux/uaccess.h> #include <asm/kvm_emulate.h> #include <kvm/arm_pmu.h> #include <kvm/arm_vgic.h> #define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0) static LIST_HEAD(arm_pmus); static DEFINE_MUTEX(arm_pmus_lock); static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc); static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc); static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc); bool kvm_supports_guest_pmuv3(void) { guard(mutex)(&arm_pmus_lock); return !list_empty(&arm_pmus); } static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc) { return container_of(pmc, struct kvm_vcpu, arch.pmu.pmc[pmc->idx]); } static struct kvm_pmc *kvm_vcpu_idx_to_pmc(struct kvm_vcpu *vcpu, int cnt_idx) { return &vcpu->arch.pmu.pmc[cnt_idx]; } static u32 __kvm_pmu_event_mask(unsigned int pmuver) { switch (pmuver) { case ID_AA64DFR0_EL1_PMUVer_IMP: return GENMASK(9, 0); case ID_AA64DFR0_EL1_PMUVer_V3P1: case ID_AA64DFR0_EL1_PMUVer_V3P4: case ID_AA64DFR0_EL1_PMUVer_V3P5: case ID_AA64DFR0_EL1_PMUVer_V3P7: return GENMASK(15, 0); default: /* Shouldn't be here, just for sanity */ WARN_ONCE(1, "Unknown PMU version %d\n", pmuver); return 0; } } static u32 kvm_pmu_event_mask(struct kvm *kvm) { u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1); u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, dfr0); return __kvm_pmu_event_mask(pmuver); } u64 kvm_pmu_evtyper_mask(struct kvm *kvm) { u64 mask = ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMU_EXCLUDE_EL0 | kvm_pmu_event_mask(kvm); if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL2, IMP)) mask |= ARMV8_PMU_INCLUDE_EL2; if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP)) mask |= ARMV8_PMU_EXCLUDE_NS_EL0 | ARMV8_PMU_EXCLUDE_NS_EL1 | ARMV8_PMU_EXCLUDE_EL3; return mask; } /** * kvm_pmc_is_64bit - determine if counter is 64bit * @pmc: counter context */ static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); return (pmc->idx == ARMV8_PMU_CYCLE_IDX || kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)); } static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); u64 val = kvm_vcpu_read_pmcr(vcpu); if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) return __vcpu_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HLP; return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); } static bool kvm_pmu_counter_can_chain(struct kvm_pmc *pmc) { return (!(pmc->idx & 1) && (pmc->idx + 1) < ARMV8_PMU_CYCLE_IDX && !kvm_pmc_has_64bit_overflow(pmc)); } static u32 counter_index_to_reg(u64 idx) { return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + idx; } static u32 counter_index_to_evtreg(u64 idx) { return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx; } static u64 kvm_pmc_read_evtreg(const struct kvm_pmc *pmc) { return __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), counter_index_to_evtreg(pmc->idx)); } static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); u64 counter, reg, enabled, running; reg = counter_index_to_reg(pmc->idx); counter = __vcpu_sys_reg(vcpu, reg); /* * The real counter value is equal to the value of counter register plus * the value perf event counts. */ if (pmc->perf_event) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); if (!kvm_pmc_is_64bit(pmc)) counter = lower_32_bits(counter); return counter; } /** * kvm_pmu_get_counter_value - get PMU counter value * @vcpu: The vcpu pointer * @select_idx: The counter index */ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) { return kvm_pmu_get_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); } static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); u64 reg; kvm_pmu_release_perf_event(pmc); reg = counter_index_to_reg(pmc->idx); if (vcpu_mode_is_32bit(vcpu) && pmc->idx != ARMV8_PMU_CYCLE_IDX && !force) { /* * Even with PMUv3p5, AArch32 cannot write to the top * 32bit of the counters. The only possible course of * action is to use PMCR.P, which will reset them to * 0 (the only use of the 'force' parameter). */ val = __vcpu_sys_reg(vcpu, reg) & GENMASK(63, 32); val |= lower_32_bits(val); } __vcpu_assign_sys_reg(vcpu, reg, val); /* Recreate the perf event to reflect the updated sample_period */ kvm_pmu_create_perf_event(pmc); } /** * kvm_pmu_set_counter_value - set PMU counter value * @vcpu: The vcpu pointer * @select_idx: The counter index * @val: The counter value */ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) { kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, select_idx), val, false); } /** * kvm_pmu_set_counter_value_user - set PMU counter value from user * @vcpu: The vcpu pointer * @select_idx: The counter index * @val: The counter value */ void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) { kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); __vcpu_assign_sys_reg(vcpu, counter_index_to_reg(select_idx), val); kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); } /** * kvm_pmu_release_perf_event - remove the perf event * @pmc: The PMU counter pointer */ static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc) { if (pmc->perf_event) { perf_event_disable(pmc->perf_event); perf_event_release_kernel(pmc->perf_event); pmc->perf_event = NULL; } } /** * kvm_pmu_stop_counter - stop PMU counter * @pmc: The PMU counter pointer * * If this counter has been configured to monitor some event, release it here. */ static void kvm_pmu_stop_counter(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); u64 reg, val; if (!pmc->perf_event) return; val = kvm_pmu_get_pmc_value(pmc); reg = counter_index_to_reg(pmc->idx); __vcpu_assign_sys_reg(vcpu, reg, val); kvm_pmu_release_perf_event(pmc); } /** * kvm_pmu_vcpu_init - assign pmu counter idx for cpu * @vcpu: The vcpu pointer * */ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) { int i; struct kvm_pmu *pmu = &vcpu->arch.pmu; for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) pmu->pmc[i].idx = i; } /** * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu * @vcpu: The vcpu pointer * */ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) { int i; for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, i)); irq_work_sync(&vcpu->arch.pmu.overflow_work); } static u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu) { unsigned int hpmn, n; if (!vcpu_has_nv(vcpu)) return 0; hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); n = vcpu->kvm->arch.nr_pmu_counters; /* * Programming HPMN to a value greater than PMCR_EL0.N is * CONSTRAINED UNPREDICTABLE. Make the implementation choice that an * UNKNOWN number of counters (in our case, zero) are reserved for EL2. */ if (hpmn >= n) return 0; /* * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't * implemented. Since KVM's ability to emulate HPMN=0 does not directly * depend on hardware (all PMU registers are trapped), make the * implementation choice that all counters are included in the second * range reserved for EL2/EL3. */ return GENMASK(n - 1, hpmn); } bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) { return kvm_pmu_hyp_counter_mask(vcpu) & BIT(idx); } u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) { u64 mask = kvm_pmu_implemented_counter_mask(vcpu); if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu)) return mask; return mask & ~kvm_pmu_hyp_counter_mask(vcpu); } u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) { u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); if (val == 0) return BIT(ARMV8_PMU_CYCLE_IDX); else return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); } static void kvm_pmc_enable_perf_event(struct kvm_pmc *pmc) { if (!pmc->perf_event) { kvm_pmu_create_perf_event(pmc); return; } perf_event_enable(pmc->perf_event); if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE) kvm_debug("fail to enable perf event\n"); } static void kvm_pmc_disable_perf_event(struct kvm_pmc *pmc) { if (pmc->perf_event) perf_event_disable(pmc->perf_event); } void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) { int i; if (!val) return; for (i = 0; i < KVM_ARMV8_PMU_MAX_COUNTERS; i++) { struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); if (!(val & BIT(i))) continue; if (kvm_pmu_counter_is_enabled(pmc)) kvm_pmc_enable_perf_event(pmc); else kvm_pmc_disable_perf_event(pmc); } kvm_vcpu_pmu_restore_guest(vcpu); } /* * Returns the PMU overflow state, which is true if there exists an event * counter where the values of the global enable control, PMOVSSET_EL0[n], and * PMINTENSET_EL1[n] are all 1. */ static bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) { u64 reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); /* * PMCR_EL0.E is the global enable control for event counters available * to EL0 and EL1. */ if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) reg &= kvm_pmu_hyp_counter_mask(vcpu); /* * Otherwise, MDCR_EL2.HPME is the global enable control for event * counters reserved for EL2. */ if (!(vcpu_read_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HPME)) reg &= ~kvm_pmu_hyp_counter_mask(vcpu); return reg; } static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = &vcpu->arch.pmu; bool overflow; overflow = kvm_pmu_overflow_status(vcpu); if (pmu->irq_level == overflow) return; pmu->irq_level = overflow; if (likely(irqchip_in_kernel(vcpu->kvm))) { int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, pmu->irq_num, overflow, pmu); WARN_ON(ret); } } bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_sync_regs *sregs = &vcpu->run->s.regs; bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU; if (likely(irqchip_in_kernel(vcpu->kvm))) return false; return pmu->irq_level != run_level; } /* * Reflect the PMU overflow interrupt output level into the kvm_run structure */ void kvm_pmu_update_run(struct kvm_vcpu *vcpu) { struct kvm_sync_regs *regs = &vcpu->run->s.regs; /* Populate the timer bitmap for user space */ regs->device_irq_level &= ~KVM_ARM_DEV_PMU; if (vcpu->arch.pmu.irq_level) regs->device_irq_level |= KVM_ARM_DEV_PMU; } /** * kvm_pmu_flush_hwstate - flush pmu state to cpu * @vcpu: The vcpu pointer * * Check if the PMU has overflowed while we were running in the host, and inject * an interrupt if that was the case. */ void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) { kvm_pmu_update_state(vcpu); } /** * kvm_pmu_sync_hwstate - sync pmu state from cpu * @vcpu: The vcpu pointer * * Check if the PMU has overflowed while we were running in the guest, and * inject an interrupt if that was the case. */ void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) { kvm_pmu_update_state(vcpu); } /* * When perf interrupt is an NMI, we cannot safely notify the vcpu corresponding * to the event. * This is why we need a callback to do it once outside of the NMI context. */ static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work) { struct kvm_vcpu *vcpu; vcpu = container_of(work, struct kvm_vcpu, arch.pmu.overflow_work); kvm_vcpu_kick(vcpu); } /* * Perform an increment on any of the counters described in @mask, * generating the overflow if required, and propagate it as a chained * event if possible. */ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, unsigned long mask, u32 event) { int i; if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) return; /* Weed out disabled counters */ mask &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); for_each_set_bit(i, &mask, ARMV8_PMU_CYCLE_IDX) { struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); u64 type, reg; /* Filter on event type */ type = __vcpu_sys_reg(vcpu, counter_index_to_evtreg(i)); type &= kvm_pmu_event_mask(vcpu->kvm); if (type != event) continue; /* Increment this counter */ reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1; if (!kvm_pmc_is_64bit(pmc)) reg = lower_32_bits(reg); __vcpu_assign_sys_reg(vcpu, counter_index_to_reg(i), reg); /* No overflow? move on */ if (kvm_pmc_has_64bit_overflow(pmc) ? reg : lower_32_bits(reg)) continue; /* Mark overflow */ __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(i)); if (kvm_pmu_counter_can_chain(pmc)) kvm_pmu_counter_increment(vcpu, BIT(i + 1), ARMV8_PMUV3_PERFCTR_CHAIN); } } /* Compute the sample period for a given counter value */ static u64 compute_period(struct kvm_pmc *pmc, u64 counter) { u64 val; if (kvm_pmc_is_64bit(pmc) && kvm_pmc_has_64bit_overflow(pmc)) val = (-counter) & GENMASK(63, 0); else val = (-counter) & GENMASK(31, 0); return val; } /* * When the perf event overflows, set the overflow status and inform the vcpu. */ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, struct perf_sample_data *data, struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu); struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); int idx = pmc->idx; u64 period; cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE); /* * Reset the sample period to the architectural limit, * i.e. the point where the counter overflows. */ period = compute_period(pmc, local64_read(&perf_event->count)); local64_set(&perf_event->hw.period_left, 0); perf_event->attr.sample_period = period; perf_event->hw.sample_period = period; __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(idx)); if (kvm_pmu_counter_can_chain(pmc)) kvm_pmu_counter_increment(vcpu, BIT(idx + 1), ARMV8_PMUV3_PERFCTR_CHAIN); if (kvm_pmu_overflow_status(vcpu)) { kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); if (!in_nmi()) kvm_vcpu_kick(vcpu); else irq_work_queue(&vcpu->arch.pmu.overflow_work); } cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD); } /** * kvm_pmu_software_increment - do software increment * @vcpu: The vcpu pointer * @val: the value guest writes to PMSWINC register */ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) { kvm_pmu_counter_increment(vcpu, val, ARMV8_PMUV3_PERFCTR_SW_INCR); } /** * kvm_pmu_handle_pmcr - handle PMCR register * @vcpu: The vcpu pointer * @val: the value guest writes to PMCR register */ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) { int i; /* Fixup PMCR_EL0 to reconcile the PMU version and the LP bit */ if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) val &= ~ARMV8_PMU_PMCR_LP; /* Request a reload of the PMU to enable/disable affected counters */ if ((__vcpu_sys_reg(vcpu, PMCR_EL0) ^ val) & ARMV8_PMU_PMCR_E) kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); /* The reset bits don't indicate any state, and shouldn't be saved. */ __vcpu_assign_sys_reg(vcpu, PMCR_EL0, (val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P))); if (val & ARMV8_PMU_PMCR_C) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu) & ~BIT(ARMV8_PMU_CYCLE_IDX); if (!vcpu_is_el2(vcpu)) mask &= ~kvm_pmu_hyp_counter_mask(vcpu); for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); } } static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); unsigned int mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); if (!(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx))) return false; if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) return mdcr & MDCR_EL2_HPME; return kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E; } static bool kvm_pmc_counts_at_el0(struct kvm_pmc *pmc) { u64 evtreg = kvm_pmc_read_evtreg(pmc); bool nsu = evtreg & ARMV8_PMU_EXCLUDE_NS_EL0; bool u = evtreg & ARMV8_PMU_EXCLUDE_EL0; return u == nsu; } static bool kvm_pmc_counts_at_el1(struct kvm_pmc *pmc) { u64 evtreg = kvm_pmc_read_evtreg(pmc); bool nsk = evtreg & ARMV8_PMU_EXCLUDE_NS_EL1; bool p = evtreg & ARMV8_PMU_EXCLUDE_EL1; return p == nsk; } static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); u64 mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); if (!kvm_pmu_counter_is_hyp(vcpu, pmc->idx) && (mdcr & MDCR_EL2_HPMD)) return false; return kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2; } static int kvm_map_pmu_event(struct kvm *kvm, unsigned int eventsel) { struct arm_pmu *pmu = kvm->arch.arm_pmu; /* * The CPU PMU likely isn't PMUv3; let the driver provide a mapping * for the guest's PMUv3 event ID. */ if (unlikely(pmu->map_pmuv3_event)) return pmu->map_pmuv3_event(eventsel); return eventsel; } /** * kvm_pmu_create_perf_event - create a perf event for a counter * @pmc: Counter context */ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; struct perf_event *event; struct perf_event_attr attr; int eventsel; u64 evtreg; evtreg = kvm_pmc_read_evtreg(pmc); kvm_pmu_stop_counter(pmc); if (pmc->idx == ARMV8_PMU_CYCLE_IDX) eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; else eventsel = evtreg & kvm_pmu_event_mask(vcpu->kvm); /* * Neither SW increment nor chained events need to be backed * by a perf event. */ if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR || eventsel == ARMV8_PMUV3_PERFCTR_CHAIN) return; /* * If we have a filter in place and that the event isn't allowed, do * not install a perf event either. */ if (vcpu->kvm->arch.pmu_filter && !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) return; /* * Don't create an event if we're running on hardware that requires * PMUv3 event translation and we couldn't find a valid mapping. */ eventsel = kvm_map_pmu_event(vcpu->kvm, eventsel); if (eventsel < 0) return; memset(&attr, 0, sizeof(struct perf_event_attr)); attr.type = arm_pmu->pmu.type; attr.size = sizeof(attr); attr.pinned = 1; attr.disabled = !kvm_pmu_counter_is_enabled(pmc); attr.exclude_user = !kvm_pmc_counts_at_el0(pmc); attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ attr.config = eventsel; /* * Filter events at EL1 (i.e. vEL2) when in a hyp context based on the * guest's EL2 filter. */ if (unlikely(is_hyp_ctxt(vcpu))) attr.exclude_kernel = !kvm_pmc_counts_at_el2(pmc); else attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc); /* * If counting with a 64bit counter, advertise it to the perf * code, carefully dealing with the initial sample period * which also depends on the overflow. */ if (kvm_pmc_is_64bit(pmc)) attr.config1 |= PERF_ATTR_CFG1_COUNTER_64BIT; attr.sample_period = compute_period(pmc, kvm_pmu_get_pmc_value(pmc)); event = perf_event_create_kernel_counter(&attr, -1, current, kvm_pmu_perf_overflow, pmc); if (IS_ERR(event)) { pr_err_once("kvm: pmu event creation failed %ld\n", PTR_ERR(event)); return; } pmc->perf_event = event; } /** * kvm_pmu_set_counter_event_type - set selected counter to monitor some event * @vcpu: The vcpu pointer * @data: The data guest writes to PMXEVTYPER_EL0 * @select_idx: The number of selected counter * * When OS accesses PMXEVTYPER_EL0, that means it wants to set a PMC to count an * event with given hardware event number. Here we call perf_event API to * emulate this action and create a kernel perf event for it. */ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx) { struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx); u64 reg; reg = counter_index_to_evtreg(pmc->idx); __vcpu_assign_sys_reg(vcpu, reg, (data & kvm_pmu_evtyper_mask(vcpu->kvm))); kvm_pmu_create_perf_event(pmc); } void kvm_host_pmu_init(struct arm_pmu *pmu) { struct arm_pmu_entry *entry; /* * Check the sanitised PMU version for the system, as KVM does not * support implementations where PMUv3 exists on a subset of CPUs. */ if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit())) return; guard(mutex)(&arm_pmus_lock); entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return; entry->arm_pmu = pmu; list_add_tail(&entry->entry, &arm_pmus); } static struct arm_pmu *kvm_pmu_probe_armpmu(void) { struct arm_pmu_entry *entry; struct arm_pmu *pmu; int cpu; guard(mutex)(&arm_pmus_lock); /* * It is safe to use a stale cpu to iterate the list of PMUs so long as * the same value is used for the entirety of the loop. Given this, and * the fact that no percpu data is used for the lookup there is no need * to disable preemption. * * It is still necessary to get a valid cpu, though, to probe for the * default PMU instance as userspace is not required to specify a PMU * type. In order to uphold the preexisting behavior KVM selects the * PMU instance for the core during vcpu init. A dependent use * case would be a user with disdain of all things big.LITTLE that * affines the VMM to a particular cluster of cores. * * In any case, userspace should just do the sane thing and use the UAPI * to select a PMU type directly. But, be wary of the baggage being * carried here. */ cpu = raw_smp_processor_id(); list_for_each_entry(entry, &arm_pmus, entry) { pmu = entry->arm_pmu; if (cpumask_test_cpu(cpu, &pmu->supported_cpus)) return pmu; } return NULL; } static u64 __compute_pmceid(struct arm_pmu *pmu, bool pmceid1) { u32 hi[2], lo[2]; bitmap_to_arr32(lo, pmu->pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); bitmap_to_arr32(hi, pmu->pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); return ((u64)hi[pmceid1] << 32) | lo[pmceid1]; } static u64 compute_pmceid0(struct arm_pmu *pmu) { u64 val = __compute_pmceid(pmu, 0); /* always support SW_INCR */ val |= BIT(ARMV8_PMUV3_PERFCTR_SW_INCR); /* always support CHAIN */ val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); return val; } static u64 compute_pmceid1(struct arm_pmu *pmu) { u64 val = __compute_pmceid(pmu, 1); /* * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled * as RAZ */ val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) | BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) | BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32)); return val; } u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) { struct arm_pmu *cpu_pmu = vcpu->kvm->arch.arm_pmu; unsigned long *bmap = vcpu->kvm->arch.pmu_filter; u64 val, mask = 0; int base, i, nr_events; if (!pmceid1) { val = compute_pmceid0(cpu_pmu); base = 0; } else { val = compute_pmceid1(cpu_pmu); base = 32; } if (!bmap) return val; nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1; for (i = 0; i < 32; i += 8) { u64 byte; byte = bitmap_get_value8(bmap, base + i); mask |= byte << i; if (nr_events >= (0x4000 + base + 32)) { byte = bitmap_get_value8(bmap, 0x4000 + base + i); mask |= byte << (32 + i); } } return val & mask; } void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) { u64 mask = kvm_pmu_implemented_counter_mask(vcpu); __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, mask); __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, mask); __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, mask); kvm_pmu_reprogram_counter_mask(vcpu, mask); } int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) { if (!vcpu->arch.pmu.created) return -EINVAL; /* * A valid interrupt configuration for the PMU is either to have a * properly configured interrupt number and using an in-kernel * irqchip, or to not have an in-kernel GIC and not set an IRQ. */ if (irqchip_in_kernel(vcpu->kvm)) { int irq = vcpu->arch.pmu.irq_num; /* * If we are using an in-kernel vgic, at this point we know * the vgic will be initialized, so we can check the PMU irq * number against the dimensions of the vgic and make sure * it's valid. */ if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq)) return -EINVAL; } else if (kvm_arm_pmu_irq_initialized(vcpu)) { return -EINVAL; } return 0; } static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) { if (irqchip_in_kernel(vcpu->kvm)) { int ret; /* * If using the PMU with an in-kernel virtual GIC * implementation, we require the GIC to be already * initialized when initializing the PMU. */ if (!vgic_initialized(vcpu->kvm)) return -ENODEV; if (!kvm_arm_pmu_irq_initialized(vcpu)) return -ENXIO; ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num, &vcpu->arch.pmu); if (ret) return ret; } init_irq_work(&vcpu->arch.pmu.overflow_work, kvm_pmu_perf_overflow_notify_vcpu); vcpu->arch.pmu.created = true; return 0; } /* * For one VM the interrupt type must be same for each vcpu. * As a PPI, the interrupt number is the same for all vcpus, * while as an SPI it must be a separate number per vcpu. */ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) { unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { if (!kvm_arm_pmu_irq_initialized(vcpu)) continue; if (irq_is_ppi(irq)) { if (vcpu->arch.pmu.irq_num != irq) return false; } else { if (vcpu->arch.pmu.irq_num == irq) return false; } } return true; } /** * kvm_arm_pmu_get_max_counters - Return the max number of PMU counters. * @kvm: The kvm pointer */ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) { struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; /* * PMUv3 requires that all event counters are capable of counting any * event, though the same may not be true of non-PMUv3 hardware. */ if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) return 1; /* * The arm_pmu->cntr_mask considers the fixed counter(s) as well. * Ignore those and return only the general-purpose counters. */ return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); } static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr) { kvm->arch.nr_pmu_counters = nr; /* Reset MDCR_EL2.HPMN behind the vcpus' back... */ if (test_bit(KVM_ARM_VCPU_HAS_EL2, kvm->arch.vcpu_features)) { struct kvm_vcpu *vcpu; unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { u64 val = __vcpu_sys_reg(vcpu, MDCR_EL2); val &= ~MDCR_EL2_HPMN; val |= FIELD_PREP(MDCR_EL2_HPMN, kvm->arch.nr_pmu_counters); __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val); } } } static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) { lockdep_assert_held(&kvm->arch.config_lock); kvm->arch.arm_pmu = arm_pmu; kvm_arm_set_nr_counters(kvm, kvm_arm_pmu_get_max_counters(kvm)); } /** * kvm_arm_set_default_pmu - No PMU set, get the default one. * @kvm: The kvm pointer * * The observant among you will notice that the supported_cpus * mask does not get updated for the default PMU even though it * is quite possible the selected instance supports only a * subset of cores in the system. This is intentional, and * upholds the preexisting behavior on heterogeneous systems * where vCPUs can be scheduled on any core but the guest * counters could stop working. */ int kvm_arm_set_default_pmu(struct kvm *kvm) { struct arm_pmu *arm_pmu = kvm_pmu_probe_armpmu(); if (!arm_pmu) return -ENODEV; kvm_arm_set_pmu(kvm, arm_pmu); return 0; } static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) { struct kvm *kvm = vcpu->kvm; struct arm_pmu_entry *entry; struct arm_pmu *arm_pmu; int ret = -ENXIO; lockdep_assert_held(&kvm->arch.config_lock); mutex_lock(&arm_pmus_lock); list_for_each_entry(entry, &arm_pmus, entry) { arm_pmu = entry->arm_pmu; if (arm_pmu->pmu.type == pmu_id) { if (kvm_vm_has_ran_once(kvm) || (kvm->arch.pmu_filter && kvm->arch.arm_pmu != arm_pmu)) { ret = -EBUSY; break; } kvm_arm_set_pmu(kvm, arm_pmu); cpumask_copy(kvm->arch.supported_cpus, &arm_pmu->supported_cpus); ret = 0; break; } } mutex_unlock(&arm_pmus_lock); return ret; } static int kvm_arm_pmu_v3_set_nr_counters(struct kvm_vcpu *vcpu, unsigned int n) { struct kvm *kvm = vcpu->kvm; if (!kvm->arch.arm_pmu) return -EINVAL; if (n > kvm_arm_pmu_get_max_counters(kvm)) return -EINVAL; kvm_arm_set_nr_counters(kvm, n); return 0; } int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { struct kvm *kvm = vcpu->kvm; lockdep_assert_held(&kvm->arch.config_lock); if (!kvm_vcpu_has_pmu(vcpu)) return -ENODEV; if (vcpu->arch.pmu.created) return -EBUSY; switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { int __user *uaddr = (int __user *)(long)attr->addr; int irq; if (!irqchip_in_kernel(kvm)) return -EINVAL; if (get_user(irq, uaddr)) return -EFAULT; /* The PMU overflow interrupt can be a PPI or a valid SPI. */ if (!(irq_is_ppi(irq) || irq_is_spi(irq))) return -EINVAL; if (!pmu_irq_is_valid(kvm, irq)) return -EINVAL; if (kvm_arm_pmu_irq_initialized(vcpu)) return -EBUSY; kvm_debug("Set kvm ARM PMU irq: %d\n", irq); vcpu->arch.pmu.irq_num = irq; return 0; } case KVM_ARM_VCPU_PMU_V3_FILTER: { u8 pmuver = kvm_arm_pmu_get_pmuver_limit(); struct kvm_pmu_event_filter __user *uaddr; struct kvm_pmu_event_filter filter; int nr_events; /* * Allow userspace to specify an event filter for the entire * event range supported by PMUVer of the hardware, rather * than the guest's PMUVer for KVM backward compatibility. */ nr_events = __kvm_pmu_event_mask(pmuver) + 1; uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr; if (copy_from_user(&filter, uaddr, sizeof(filter))) return -EFAULT; if (((u32)filter.base_event + filter.nevents) > nr_events || (filter.action != KVM_PMU_EVENT_ALLOW && filter.action != KVM_PMU_EVENT_DENY)) return -EINVAL; if (kvm_vm_has_ran_once(kvm)) return -EBUSY; if (!kvm->arch.pmu_filter) { kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT); if (!kvm->arch.pmu_filter) return -ENOMEM; /* * The default depends on the first applied filter. * If it allows events, the default is to deny. * Conversely, if the first filter denies a set of * events, the default is to allow. */ if (filter.action == KVM_PMU_EVENT_ALLOW) bitmap_zero(kvm->arch.pmu_filter, nr_events); else bitmap_fill(kvm->arch.pmu_filter, nr_events); } if (filter.action == KVM_PMU_EVENT_ALLOW) bitmap_set(kvm->arch.pmu_filter, filter.base_event, filter.nevents); else bitmap_clear(kvm->arch.pmu_filter, filter.base_event, filter.nevents); return 0; } case KVM_ARM_VCPU_PMU_V3_SET_PMU: { int __user *uaddr = (int __user *)(long)attr->addr; int pmu_id; if (get_user(pmu_id, uaddr)) return -EFAULT; return kvm_arm_pmu_v3_set_pmu(vcpu, pmu_id); } case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: { unsigned int __user *uaddr = (unsigned int __user *)(long)attr->addr; unsigned int n; if (get_user(n, uaddr)) return -EFAULT; return kvm_arm_pmu_v3_set_nr_counters(vcpu, n); } case KVM_ARM_VCPU_PMU_V3_INIT: return kvm_arm_pmu_v3_init(vcpu); } return -ENXIO; } int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { int __user *uaddr = (int __user *)(long)attr->addr; int irq; if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; if (!kvm_vcpu_has_pmu(vcpu)) return -ENODEV; if (!kvm_arm_pmu_irq_initialized(vcpu)) return -ENXIO; irq = vcpu->arch.pmu.irq_num; return put_user(irq, uaddr); } } return -ENXIO; } int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: case KVM_ARM_VCPU_PMU_V3_INIT: case KVM_ARM_VCPU_PMU_V3_FILTER: case KVM_ARM_VCPU_PMU_V3_SET_PMU: case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: if (kvm_vcpu_has_pmu(vcpu)) return 0; } return -ENXIO; } u8 kvm_arm_pmu_get_pmuver_limit(void) { unsigned int pmuver; pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1)); /* * Spoof a barebones PMUv3 implementation if the system supports IMPDEF * traps of the PMUv3 sysregs */ if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) return ID_AA64DFR0_EL1_PMUVer_IMP; /* * Otherwise, treat IMPLEMENTATION DEFINED functionality as * unimplemented */ if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) return 0; return min(pmuver, ID_AA64DFR0_EL1_PMUVer_V3P5); } /** * kvm_vcpu_read_pmcr - Read PMCR_EL0 register for the vCPU * @vcpu: The vcpu pointer */ u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) { u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); u64 n = vcpu->kvm->arch.nr_pmu_counters; if (vcpu_has_nv(vcpu) && !vcpu_is_el2(vcpu)) n = FIELD_GET(MDCR_EL2_HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); return u64_replace_bits(pmcr, n, ARMV8_PMU_PMCR_N); } void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) { bool reprogrammed = false; unsigned long mask; int i; mask = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); for_each_set_bit(i, &mask, 32) { struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); /* * We only need to reconfigure events where the filter is * different at EL1 vs. EL2, as we're multiplexing the true EL1 * event filter bit for nested. */ if (kvm_pmc_counts_at_el1(pmc) == kvm_pmc_counts_at_el2(pmc)) continue; kvm_pmu_create_perf_event(pmc); reprogrammed = true; } if (reprogrammed) kvm_vcpu_pmu_restore_guest(vcpu); }
77 78 78 2 2 78 76 78 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 // SPDX-License-Identifier: GPL-2.0-or-later #define pr_fmt(fmt) "ref_tracker: " fmt #include <linux/export.h> #include <linux/list_sort.h> #include <linux/ref_tracker.h> #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/stackdepot.h> #define REF_TRACKER_STACK_ENTRIES 16 #define STACK_BUF_SIZE 1024 struct ref_tracker { struct list_head head; /* anchor into dir->list or dir->quarantine */ bool dead; depot_stack_handle_t alloc_stack_handle; depot_stack_handle_t free_stack_handle; }; struct ref_tracker_dir_stats { int total; int count; struct { depot_stack_handle_t stack_handle; unsigned int count; } stacks[]; }; static struct ref_tracker_dir_stats * ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) { struct ref_tracker_dir_stats *stats; struct ref_tracker *tracker; stats = kmalloc(struct_size(stats, stacks, limit), GFP_NOWAIT | __GFP_NOWARN); if (!stats) return ERR_PTR(-ENOMEM); stats->total = 0; stats->count = 0; list_for_each_entry(tracker, &dir->list, head) { depot_stack_handle_t stack = tracker->alloc_stack_handle; int i; ++stats->total; for (i = 0; i < stats->count; ++i) if (stats->stacks[i].stack_handle == stack) break; if (i >= limit) continue; if (i >= stats->count) { stats->stacks[i].stack_handle = stack; stats->stacks[i].count = 0; ++stats->count; } ++stats->stacks[i].count; } return stats; } struct ostream { char *buf; int size, used; }; #define pr_ostream(stream, fmt, args...) \ ({ \ struct ostream *_s = (stream); \ \ if (!_s->buf) { \ pr_err(fmt, ##args); \ } else { \ int ret, len = _s->size - _s->used; \ ret = snprintf(_s->buf + _s->used, len, pr_fmt(fmt), ##args); \ _s->used += min(ret, len); \ } \ }) static void __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, unsigned int display_limit, struct ostream *s) { struct ref_tracker_dir_stats *stats; unsigned int i = 0, skipped; depot_stack_handle_t stack; char *sbuf; lockdep_assert_held(&dir->lock); if (list_empty(&dir->list)) return; stats = ref_tracker_get_stats(dir, display_limit); if (IS_ERR(stats)) { pr_ostream(s, "%s@%pK: couldn't get stats, error %pe\n", dir->name, dir, stats); return; } sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN); for (i = 0, skipped = stats->total; i < stats->count; ++i) { stack = stats->stacks[i].stack_handle; if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4)) sbuf[0] = 0; pr_ostream(s, "%s@%pK has %d/%d users at\n%s\n", dir->name, dir, stats->stacks[i].count, stats->total, sbuf); skipped -= stats->stacks[i].count; } if (skipped) pr_ostream(s, "%s@%pK skipped reports about %d/%d users.\n", dir->name, dir, skipped, stats->total); kfree(sbuf); kfree(stats); } void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir, unsigned int display_limit) { struct ostream os = {}; __ref_tracker_dir_pr_ostream(dir, display_limit, &os); } EXPORT_SYMBOL(ref_tracker_dir_print_locked); void ref_tracker_dir_print(struct ref_tracker_dir *dir, unsigned int display_limit) { unsigned long flags; spin_lock_irqsave(&dir->lock, flags); ref_tracker_dir_print_locked(dir, display_limit); spin_unlock_irqrestore(&dir->lock, flags); } EXPORT_SYMBOL(ref_tracker_dir_print); int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size) { struct ostream os = { .buf = buf, .size = size }; unsigned long flags; spin_lock_irqsave(&dir->lock, flags); __ref_tracker_dir_pr_ostream(dir, 16, &os); spin_unlock_irqrestore(&dir->lock, flags); return os.used; } EXPORT_SYMBOL(ref_tracker_dir_snprint); void ref_tracker_dir_exit(struct ref_tracker_dir *dir) { struct ref_tracker *tracker, *n; unsigned long flags; bool leak = false; dir->dead = true; spin_lock_irqsave(&dir->lock, flags); list_for_each_entry_safe(tracker, n, &dir->quarantine, head) { list_del(&tracker->head); kfree(tracker); dir->quarantine_avail++; } if (!list_empty(&dir->list)) { ref_tracker_dir_print_locked(dir, 16); leak = true; list_for_each_entry_safe(tracker, n, &dir->list, head) { list_del(&tracker->head); kfree(tracker); } } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(leak); WARN_ON_ONCE(refcount_read(&dir->untracked) != 1); WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1); } EXPORT_SYMBOL(ref_tracker_dir_exit); int ref_tracker_alloc(struct ref_tracker_dir *dir, struct ref_tracker **trackerp, gfp_t gfp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; struct ref_tracker *tracker; unsigned int nr_entries; gfp_t gfp_mask = gfp | __GFP_NOWARN; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_inc(&dir->no_tracker); return 0; } if (gfp & __GFP_DIRECT_RECLAIM) gfp_mask |= __GFP_NOFAIL; *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask); if (unlikely(!tracker)) { pr_err_once("memory allocation failure, unreliable refcount tracker.\n"); refcount_inc(&dir->untracked); return -ENOMEM; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); tracker->alloc_stack_handle = stack_depot_save(entries, nr_entries, gfp); spin_lock_irqsave(&dir->lock, flags); list_add(&tracker->head, &dir->list); spin_unlock_irqrestore(&dir->lock, flags); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_alloc); int ref_tracker_free(struct ref_tracker_dir *dir, struct ref_tracker **trackerp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; depot_stack_handle_t stack_handle; struct ref_tracker *tracker; unsigned int nr_entries; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_dec(&dir->no_tracker); return 0; } tracker = *trackerp; if (!tracker) { refcount_dec(&dir->untracked); return -EEXIST; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); stack_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT | __GFP_NOWARN); spin_lock_irqsave(&dir->lock, flags); if (tracker->dead) { pr_err("reference already released.\n"); if (tracker->alloc_stack_handle) { pr_err("allocated in:\n"); stack_depot_print(tracker->alloc_stack_handle); } if (tracker->free_stack_handle) { pr_err("freed in:\n"); stack_depot_print(tracker->free_stack_handle); } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(1); return -EINVAL; } tracker->dead = true; tracker->free_stack_handle = stack_handle; list_move_tail(&tracker->head, &dir->quarantine); if (!dir->quarantine_avail) { tracker = list_first_entry(&dir->quarantine, struct ref_tracker, head); list_del(&tracker->head); } else { dir->quarantine_avail--; tracker = NULL; } spin_unlock_irqrestore(&dir->lock, flags); kfree(tracker); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_free);
176 176 176 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM writeback #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WRITEBACK_H #include <linux/tracepoint.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #define show_inode_state(state) \ __print_flags(state, "|", \ {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ {I_NEW, "I_NEW"}, \ {I_WILL_FREE, "I_WILL_FREE"}, \ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ {I_REFERENCED, "I_REFERENCED"}, \ {I_LINKABLE, "I_LINKABLE"}, \ {I_WB_SWITCH, "I_WB_SWITCH"}, \ {I_OVL_INUSE, "I_OVL_INUSE"}, \ {I_CREATING, "I_CREATING"}, \ {I_DONTCACHE, "I_DONTCACHE"}, \ {I_SYNC_QUEUED, "I_SYNC_QUEUED"}, \ {I_PINNING_NETFS_WB, "I_PINNING_NETFS_WB"}, \ {I_LRU_ISOLATING, "I_LRU_ISOLATING"} \ ) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") WB_WORK_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_folio_template, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(pgoff_t, index) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0; __entry->index = folio->index; ), TP_printk("bdi %s: ino=%lu index=%lu", __entry->name, (unsigned long)__entry->ino, __entry->index ) ); DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DEFINE_EVENT(writeback_folio_template, folio_wait_writeback, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, flags) ), TP_fast_assign( struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->flags = flags; ), TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return 1; } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CREATE_TRACE_POINTS */ #ifdef CONFIG_CGROUP_WRITEBACK TRACE_EVENT(inode_foreign_history, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned int history), TP_ARGS(inode, wbc, history), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, cgroup_ino) __field(unsigned int, history) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); __entry->history = history; ), TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->cgroup_ino, __entry->history ) ); TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb), TP_ARGS(inode, old_wb, new_wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, old_cgroup_ino) __field(ino_t, new_cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->ino = inode->i_ino; __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->old_cgroup_ino, (unsigned long)__entry->new_cgroup_ino ) ); TRACE_EVENT(track_foreign_dirty, TP_PROTO(struct folio *folio, struct bdi_writeback *wb), TP_ARGS(folio, wb), TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) __field(ino_t, ino) __field(unsigned int, memcg_id) __field(ino_t, cgroup_ino) __field(ino_t, page_cgroup_ino) ), TP_fast_assign( struct address_space *mapping = folio_mapping(folio); struct inode *inode = mapping ? mapping->host : NULL; strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->bdi_id = wb->bdi->id; __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, (unsigned long)__entry->ino, __entry->memcg_id, (unsigned long)__entry->cgroup_ino, (unsigned long)__entry->page_cgroup_ino ) ); TRACE_EVENT(flush_foreign, TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id, unsigned int frn_memcg_id), TP_ARGS(wb, frn_bdi_id, frn_memcg_id), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->frn_bdi_id = frn_bdi_id; __entry->frn_memcg_id = frn_memcg_id; ), TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, (unsigned long)__entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) ); #endif DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(int, sync_mode) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, __entry->sync_mode, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DECLARE_EVENT_CLASS(writeback_work_class, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_pages) __field(dev_t, sb_dev) __field(int, sync_mode) __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); DEFINE_WRITEBACK_WORK_EVENT(writeback_written); DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), TP_ARGS(pages_written), TP_STRUCT__entry( __field(long, pages) ), TP_fast_assign( __entry->pages = pages_written; ), TP_printk("%ld", __entry->pages) ); DECLARE_EVENT_CLASS(writeback_class, TP_PROTO(struct bdi_writeback *wb), TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, TP_PROTO(struct backing_dev_info *bdi), TP_ARGS(bdi), TP_STRUCT__entry( __array(char, name, 32) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); ), TP_printk("bdi %s", __entry->name ) ); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), TP_ARGS(wbc, bdi), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) __field(int, for_kupdate) __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " "start=0x%lx end=0x%lx cgroup_ino=%lu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, __entry->sync_mode, __entry->for_kupdate, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, __entry->range_start, __entry->range_end, (unsigned long)__entry->cgroup_ino ) ) #define DEFINE_WBC_EVENT(name) \ DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before, int moved), TP_ARGS(wb, work, dirtied_before, moved), TP_STRUCT__entry( __array(char, name, 32) __field(unsigned long, older) __field(long, age) __field(int, moved) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->older = dirtied_before; __entry->age = (jiffies - dirtied_before) * 1000 / HZ; __entry->moved = moved; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", __entry->name, __entry->older, /* dirtied_before in jiffies */ __entry->age, /* dirtied_before in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(global_dirty_state, TP_PROTO(unsigned long background_thresh, unsigned long dirty_thresh ), TP_ARGS(background_thresh, dirty_thresh ), TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) __field(unsigned long, nr_dirtied) __field(unsigned long, nr_written) ), TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; __entry->dirty_limit = global_wb_domain.dirty_limit; ), TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, __entry->nr_dirtied, __entry->nr_written ) ); #define KBps(x) ((x) << (PAGE_SHIFT - 10)) TRACE_EVENT(bdi_dirty_ratelimit, TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) __field(unsigned long, write_bw) __field(unsigned long, avg_write_bw) __field(unsigned long, dirty_rate) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = KBps(wb->balanced_dirty_ratelimit); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, struct dirty_throttle_control *dtc, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, unsigned long period, long pause, unsigned long start_time), TP_ARGS(wb, dtc, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) __field(unsigned long, wb_setpoint) __field(unsigned long, wb_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned int, dirtied) __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) __field(unsigned long, period) __field( long, think) __field(ino_t, cgroup_ino) ), TP_fast_assign( unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->limit = dtc->limit; __entry->setpoint = (dtc->limit + freerun) / 2; __entry->dirty = dtc->dirty; __entry->wb_setpoint = __entry->setpoint * dtc->wb_thresh / (dtc->thresh + 1); __entry->wb_dirty = dtc->wb_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; __entry->think = current->dirty_paused_when == 0 ? 0 : (long)(jiffies - current->dirty_paused_when) * 1000/HZ; __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " "wb_setpoint=%lu wb_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", __entry->bdi, __entry->limit, __entry->setpoint, __entry->dirty, __entry->wb_setpoint, __entry->wb_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(writeback_sb_inodes_requeue, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, (unsigned long)__entry->cgroup_ino ) ); DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write ), TP_ARGS(inode, wbc, nr_to_write), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DECLARE_EVENT_CLASS(writeback_inode_template, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, state ) __field( __u16, mode ) __field(unsigned long, dirtied_when ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* * Inode writeback list tracking. */ DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 // SPDX-License-Identifier: GPL-2.0 /* * This file contains the procedures for the handling of select and poll * * Created for Linux based loosely upon Mathius Lattner's minix * patches by Peter MacDonald. Heavily edited by Linus. * * 4 February 1994 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS * flag set in its personality we do *not* modify the given timeout * parameter to reflect time remaining. * * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ #include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/freezer.h> #include <net/busy_poll.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> /* * Estimate expected accuracy in ns from a timeval. * * After quite a bit of churning around, we've settled on * a simple thing of taking 0.1% of the timeout as the * slack, with a cap of 100 msec. * "nice" tasks get a 0.5% slack instead. * * Consider this comment an open invitation to come up with even * better solutions.. */ #define MAX_SLACK (100 * NSEC_PER_MSEC) static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; if (tv->tv_sec < 0) return 0; if (task_nice(current) > 0) divfactor = divfactor / 5; if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) return MAX_SLACK; slack = tv->tv_nsec / divfactor; slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); if (slack > MAX_SLACK) return MAX_SLACK; return slack; } u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; u64 slack = current->timer_slack_ns; if (slack == 0) return 0; ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < slack) return slack; return ret; } struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[]; }; #define POLL_TABLE_FULL(table) \ ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to * follow, but it should be free of race-conditions, and it's practical. If you * understand what I'm doing here, then you understand how the linux * sleep/wakeup mechanism works. * * Two very simple procedures, poll_wait() and poll_freewait() make all the * work. poll_wait() is an inline-function defined in <linux/poll.h>, * as all select/poll functions have to call it to add an entry to the * poll table. */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p); void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); pwq->polling_task = current; pwq->triggered = 0; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) { remove_wait_queue(entry->wait_address, &entry->wait); fput(entry->filp); } void poll_freewait(struct poll_wqueues *pwq) { struct poll_table_page * p = pwq->table; int i; for (i = 0; i < pwq->inline_index; i++) free_poll_entry(pwq->inline_entries + i); while (p) { struct poll_table_entry * entry; struct poll_table_page *old; entry = p->entry; do { entry--; free_poll_entry(entry); } while (entry > p->entries); old = p; p = p->next; free_page((unsigned long) old); } } EXPORT_SYMBOL(poll_freewait); static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { struct poll_table_page *table = p->table; if (p->inline_index < N_INLINE_POLL_ENTRIES) return p->inline_entries + p->inline_index++; if (!table || POLL_TABLE_FULL(table)) { struct poll_table_page *new_table; new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; return NULL; } new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; table = new_table; } return table->entry++; } static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); /* * Although this function is called under waitqueue lock, LOCK * doesn't imply write barrier and the users expect write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); pwq->triggered = 1; /* * Perform the default wake up operation using a dummy * waitqueue. * * TODO: This is hacky but there currently is no interface to * pass in @sync. @sync is scheduled to be removed and once * that happens, wake_up_process() can be used directly. */ return default_wake_function(&dummy_wait, mode, sync, key); } static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; entry = container_of(wait, struct poll_table_entry, wait); if (key && !(key_to_poll(key) & entry->key)) return 0; return __pollwake(wait, mode, sync, key); } /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; entry->filp = get_file(filp); entry->wait_address = wait_address; entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack) { int rc = -EINTR; set_current_state(state); if (!pwq->triggered) rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * Prepare for the next iteration. * * The following smp_store_mb() serves two purposes. First, it's * the counterpart rmb of the wmb in pollwake() such that data * written before wake up is always visible after wake up. * Second, the full barrier guarantees that triggered clearing * doesn't pass event check of the next iteration. Note that * this problem doesn't exist for the first iteration as * add_wait_queue() has full barrier semantics. */ smp_store_mb(pwq->triggered, 0); return rc; } /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * * Note, we do not use a timespec for the user space value here, That * way we can use the function for timeval and compat interfaces as well. * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. */ int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { ktime_get_ts64(to); *to = timespec64_add_safe(*to, ts); } return 0; } enum poll_time_type { PT_TIMEVAL = 0, PT_OLD_TIMEVAL = 1, PT_TIMESPEC = 2, PT_OLD_TIMESPEC = 3, }; static int poll_select_finish(struct timespec64 *end_time, void __user *p, enum poll_time_type pt_type, int ret) { struct timespec64 rts; restore_saved_sigmask_unless(ret == -ERESTARTNOHAND); if (!p) return ret; if (current->personality & STICKY_TIMEOUTS) goto sticky; /* No update for zero timeout */ if (!end_time->tv_sec && !end_time->tv_nsec) return ret; ktime_get_ts64(&rts); rts = timespec64_sub(*end_time, rts); if (rts.tv_sec < 0) rts.tv_sec = rts.tv_nsec = 0; switch (pt_type) { case PT_TIMEVAL: { struct __kernel_old_timeval rtv; if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_OLD_TIMEVAL: { struct old_timeval32 rtv; rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_TIMESPEC: if (!put_timespec64(&rts, p)) return ret; break; case PT_OLD_TIMESPEC: if (!put_old_timespec32(&rts, p)) return ret; break; default: BUG(); } /* * If an application puts its timeval in read-only memory, we * don't want the Linux-specific update to the timeval to * cause a fault after the select has completed * successfully. However, because we're not updating the * timeval, we can't restart the system call. */ sticky: if (ret == -ERESTARTNOHAND) ret = -EINTR; return ret; } /* * Scalable version of the fd_set. */ typedef struct { unsigned long *in, *out, *ex; unsigned long *res_in, *res_out, *res_ex; } fd_set_bits; /* * How many longwords for "nr" bits? */ #define FDS_BITPERLONG (8*sizeof(long)) #define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) #define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) /* * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. */ static inline int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { nr = FDS_BYTES(nr); if (ufdset) return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; memset(fdset, 0, nr); return 0; } static inline unsigned long __must_check set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { if (ufdset) return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); return 0; } static inline void zero_fd_set(unsigned long nr, unsigned long *fdset) { memset(fdset, 0, FDS_BYTES(nr)); } #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) #define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n)) static int max_select_fd(unsigned long n, fd_set_bits *fds) { unsigned long *open_fds; unsigned long set; int max; struct fdtable *fdt; /* handle last in-complete long-word first */ set = ~(~0UL << (n & (BITS_PER_LONG-1))); n /= BITS_PER_LONG; fdt = files_fdtable(current->files); open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); if (set) { if (!(set & ~*open_fds)) goto get_max; return -EBADF; } } while (n) { open_fds--; n--; set = BITS(fds, n); if (!set) continue; if (set & ~*open_fds) return -EBADF; if (max) continue; get_max: do { max++; set >>= 1; } while (set); max += n * BITS_PER_LONG; } return max; } #define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ EPOLLNVAL) #define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ EPOLLNVAL) #define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline __poll_t select_poll_one(int fd, poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, __poll_t ll_flag) { CLASS(fd, f)(fd); if (fd_empty(f)) return EPOLLNVAL; wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) wait->_key |= POLLOUT_SET; return vfs_poll(fd_file(f), wait); } static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; rcu_read_lock(); retval = max_select_fd(n, fds); rcu_read_unlock(); if (retval < 0) return retval; n = retval; poll_initwait(&table); wait = &table.pt; if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { wait->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; __poll_t mask; in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; if (all_bits == 0) { i += BITS_PER_LONG; continue; } for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { if (i >= n) break; if (!(bit & all_bits)) continue; mask = select_poll_one(i, wait, in, out, bit, busy_flag); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; wait->_qproc = NULL; } /* got something, stop busy polling */ if (retval) { can_busy_loop = false; busy_flag = 0; /* * only remember a returned * POLL_BUSY_LOOP if we asked for it */ } else if (busy_flag & mask) can_busy_loop = true; } if (res_in) *rinp = res_in; if (res_out) *routp = res_out; if (res_ex) *rexp = res_ex; cond_resched(); } wait->_qproc = NULL; if (retval || timed_out || signal_pending(current)) break; if (table.error) { retval = table.error; break; } /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } poll_freewait(&table); return retval; } /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int ret, max_fds; size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; if (unlikely(n < 0)) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; if (size > (SIZE_MAX / 6)) goto out_nofds; alloc_size = 6 * size; bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; fds.res_in = bits + 3*size; fds.res_out = bits + 4*size; fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kvfree(bits); out_nofds: return ret; } static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct __kernel_old_timeval tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret); } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp) { return kern_select(n, inp, outp, exp, tvp); } static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, void __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } /* * Most architectures can't handle 7-argument syscalls. So we provide a * 6-argument version where the sixth argument is a pointer to a structure * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. */ struct sigset_argpack { sigset_t __user *p; size_t size; }; static inline int get_sigset_argpack(struct sigset_argpack *to, struct sigset_argpack __user *from) { // the path is hot enough for overhead of copy_from_user() to matter if (from) { if (can_do_masked_user_access()) from = masked_user_access_begin(from); else if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_read_access_end(); return -EFAULT; } SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_OLD_TIMESPEC); } #endif #ifdef __ARCH_WANT_SYS_OLD_SELECT struct sel_arg_struct { unsigned long n; fd_set __user *inp, *outp, *exp; struct __kernel_old_timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) { struct sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp); } #endif struct poll_list { struct poll_list *next; unsigned int len; struct pollfd entries[] __counted_by(len); }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) /* * Fish for pollable events on the pollfd->fd file descriptor. We're only * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. The * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, __poll_t busy_flag) { int fd = pollfd->fd; __poll_t mask, filter; if (unlikely(fd < 0)) return 0; CLASS(fd, f)(fd); if (fd_empty(f)) return EPOLLNVAL; /* userland u16 ->events contains POLL... bitmap */ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; pwait->_key = filter | busy_flag; mask = vfs_poll(fd_file(f), pwait); if (mask & busy_flag) *can_busy_poll = true; return mask & filter; /* Mask out unneeded events. */ } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { pt->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { __poll_t mask; /* * Fish for events. If we found one, record it * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. */ mask = do_pollfd(pfd, pt, &can_busy_loop, busy_flag); pfd->revents = mangle_poll(mask); if (mask) { count++; pt->_qproc = NULL; /* found something, stop busy polling */ busy_flag = 0; can_busy_loop = false; } } } /* * All waiters have already been registered, so don't provide * a poll_table->_qproc to them on the next loop iteration. */ pt->_qproc = NULL; if (!count) { count = wait->error; if (signal_pending(current)) count = -ERESTARTNOHAND; } if (count || timed_out) break; /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } return count; } #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; unsigned int todo = nfds; unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); for (;;) { walk->next = NULL; walk->len = len; if (!len) break; if (copy_from_user(walk->entries, ufds + nfds-todo, sizeof(struct pollfd) * walk->len)) goto out_fds; if (walk->len >= todo) break; todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), GFP_KERNEL); if (!walk) { err = -ENOMEM; goto out_fds; } } poll_initwait(&table); fdcount = do_poll(head, &table, end_time); poll_freewait(&table); if (!user_write_access_begin(ufds, nfds * sizeof(*ufds))) goto out_fds; for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); } user_write_access_end(); err = fdcount; out_fds: walk = head->next; while (walk) { struct poll_list *pos = walk; walk = walk->next; kfree(pos); } return err; Efault: user_write_access_end(); err = -EFAULT; goto out_fds; } static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { end_time.tv_sec = restart_block->poll.tv_sec; end_time.tv_nsec = restart_block->poll.tv_nsec; to = &end_time; } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) ret = set_restart_fn(restart_block, do_restart_poll); return ret; } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { to = &end_time; poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) { struct restart_block *restart_block; restart_block = &current->restart_block; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { restart_block->poll.tv_sec = end_time.tv_sec; restart_block->poll.tv_nsec = end_time.tv_nsec; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; ret = set_restart_fn(restart_block, do_restart_poll); } return ret; } SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif #ifdef CONFIG_COMPAT #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to * 64-bit unsigned longs. */ static int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (ufdset) { return compat_get_bitmap(fdset, ufdset, nr); } else { zero_fd_set(nr, fdset); return 0; } } static int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (!ufdset) return 0; return compat_put_bitmap(ufdset, fdset, nr); } /* * This is a virtual copy of sys_select from fs/select.c and probably * should be compared to it from time to time */ /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int size, max_fds, ret = -EINVAL; struct fdtable *fdt; long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { bits = kmalloc_array(6, size, GFP_KERNEL); ret = -ENOMEM; if (!bits) goto out_nofds; } fds.in = (unsigned long *) bits; fds.out = (unsigned long *) (bits + size); fds.ex = (unsigned long *) (bits + 2*size); fds.res_in = (unsigned long *) (bits + 3*size); fds.res_out = (unsigned long *) (bits + 4*size); fds.res_ex = (unsigned long *) (bits + 5*size); if ((ret = compat_get_fd_set(n, inp, fds.in)) || (ret = compat_get_fd_set(n, outp, fds.out)) || (ret = compat_get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (compat_set_fd_set(n, inp, fds.res_in) || compat_set_fd_set(n, outp, fds.res_out) || compat_set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kfree(bits); out_nofds: return ret; } static int do_compat_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timeval32 __user *tvp) { struct timespec64 end_time, *to = NULL; struct old_timeval32 tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_OLD_TIMEVAL, ret); } COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timeval32 __user *, tvp) { return do_compat_select(n, inp, outp, exp, tvp); } struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; compat_uptr_t outp; compat_uptr_t exp; compat_uptr_t tvp; }; COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, void __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } struct compat_sigset_argpack { compat_uptr_t p; compat_size_t size; }; static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, struct compat_sigset_argpack __user *from) { if (from) { if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_read_access_end(); return -EFAULT; } COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_OLD_TIMESPEC); } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif /* New compat syscall for 64 bit time_t*/ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #endif
296 296 295 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 // SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/cpumask.h> #include <linux/export.h> #include <linux/memblock.h> #include <linux/numa.h> /* These are not inline because of header tangles. */ #ifdef CONFIG_CPUMASK_OFFSTACK /** * alloc_cpumask_var_node - allocate a struct cpumask on a given node * @mask: pointer to cpumask_var_t where the cpumask is returned * @flags: GFP_ flags * @node: memory node from which to allocate or %NUMA_NO_NODE * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop returning a constant 1 (in <linux/cpumask.h>). * * Return: TRUE if memory allocation succeeded, FALSE otherwise. * * In addition, mask will be NULL if this fails. Note that gcc is * usually smart enough to know that mask can never be NULL if * CONFIG_CPUMASK_OFFSTACK=n, so does code elimination in that case * too. */ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { *mask = kmalloc_node(cpumask_size(), flags, node); #ifdef CONFIG_DEBUG_PER_CPU_MAPS if (!*mask) { printk(KERN_ERR "=> alloc_cpumask_var: failed!\n"); dump_stack(); } #endif return *mask != NULL; } EXPORT_SYMBOL(alloc_cpumask_var_node); /** * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena. * @mask: pointer to cpumask_var_t where the cpumask is returned * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop (in <linux/cpumask.h>). * Either returns an allocated (zero-filled) cpumask, or causes the * system to panic. */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES); } /** * free_cpumask_var - frees memory allocated for a struct cpumask. * @mask: cpumask to free * * This is safe on a NULL mask. */ void free_cpumask_var(cpumask_var_t mask) { kfree(mask); } EXPORT_SYMBOL(free_cpumask_var); /** * free_bootmem_cpumask_var - frees result of alloc_bootmem_cpumask_var * @mask: cpumask to free */ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { memblock_free(mask, cpumask_size()); } #endif /** * cpumask_local_spread - select the i'th cpu based on NUMA distances * @i: index number * @node: local numa_node * * Return: online CPU according to a numa aware policy; local cpus are returned * first, followed by non-local ones, then it wraps around. * * For those who wants to enumerate all CPUs based on their NUMA distances, * i.e. call this function in a loop, like: * * for (i = 0; i < num_online_cpus(); i++) { * cpu = cpumask_local_spread(i, node); * do_something(cpu); * } * * There's a better alternative based on for_each()-like iterators: * * for_each_numa_hop_mask(mask, node) { * for_each_cpu_andnot(cpu, mask, prev) * do_something(cpu); * prev = mask; * } * * It's simpler and more verbose than above. Complexity of iterator-based * enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while * cpumask_local_spread() when called for each cpu is * O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)). */ unsigned int cpumask_local_spread(unsigned int i, int node) { unsigned int cpu; /* Wrap: we always want a cpu. */ i %= num_online_cpus(); cpu = sched_numa_find_nth_cpu(cpu_online_mask, i, node); WARN_ON(cpu >= nr_cpu_ids); return cpu; } EXPORT_SYMBOL(cpumask_local_spread); static DEFINE_PER_CPU(int, distribute_cpu_mask_prev); /** * cpumask_any_and_distribute - Return an arbitrary cpu within src1p & src2p. * @src1p: first &cpumask for intersection * @src2p: second &cpumask for intersection * * Iterated calls using the same srcp1 and srcp2 will be distributed within * their intersection. * * Return: >= nr_cpu_ids if the intersection is empty. */ unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { unsigned int next, prev; /* NOTE: our first selection will skip 0. */ prev = __this_cpu_read(distribute_cpu_mask_prev); next = cpumask_next_and_wrap(prev, src1p, src2p); if (next < nr_cpu_ids) __this_cpu_write(distribute_cpu_mask_prev, next); return next; } EXPORT_SYMBOL(cpumask_any_and_distribute); /** * cpumask_any_distribute - Return an arbitrary cpu from srcp * @srcp: &cpumask for selection * * Return: >= nr_cpu_ids if the intersection is empty. */ unsigned int cpumask_any_distribute(const struct cpumask *srcp) { unsigned int next, prev; /* NOTE: our first selection will skip 0. */ prev = __this_cpu_read(distribute_cpu_mask_prev); next = cpumask_next_wrap(prev, srcp); if (next < nr_cpu_ids) __this_cpu_write(distribute_cpu_mask_prev, next); return next; } EXPORT_SYMBOL(cpumask_any_distribute);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 /* SPDX-License-Identifier: GPL-2.0 */ /* * Common values and helper functions for the ChaCha and XChaCha stream ciphers. * * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's * security. Here they share the same key size, tfm context, and setkey * function; only their IV size and encrypt/decrypt function differ. * * The ChaCha paper specifies 20, 12, and 8-round variants. In general, it is * recommended to use the 20-round variant ChaCha20. However, the other * variants can be needed in some performance-sensitive scenarios. The generic * ChaCha code currently allows only the 20 and 12-round variants. */ #ifndef _CRYPTO_CHACHA_H #define _CRYPTO_CHACHA_H #include <linux/unaligned.h> #include <linux/string.h> #include <linux/types.h> /* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */ #define CHACHA_IV_SIZE 16 #define CHACHA_KEY_SIZE 32 #define CHACHA_BLOCK_SIZE 64 #define CHACHAPOLY_IV_SIZE 12 #define CHACHA_KEY_WORDS 8 #define CHACHA_STATE_WORDS 16 #define HCHACHA_OUT_WORDS 8 /* 192-bit nonce, then 64-bit stream position */ #define XCHACHA_IV_SIZE 32 struct chacha_state { u32 x[CHACHA_STATE_WORDS]; }; void chacha_block_generic(struct chacha_state *state, u8 out[CHACHA_BLOCK_SIZE], int nrounds); static inline void chacha20_block(struct chacha_state *state, u8 out[CHACHA_BLOCK_SIZE]) { chacha_block_generic(state, out, 20); } void hchacha_block_arch(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds); void hchacha_block_generic(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds); static inline void hchacha_block(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds) { if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) hchacha_block_arch(state, out, nrounds); else hchacha_block_generic(state, out, nrounds); } enum chacha_constants { /* expand 32-byte k */ CHACHA_CONSTANT_EXPA = 0x61707865U, CHACHA_CONSTANT_ND_3 = 0x3320646eU, CHACHA_CONSTANT_2_BY = 0x79622d32U, CHACHA_CONSTANT_TE_K = 0x6b206574U }; static inline void chacha_init_consts(struct chacha_state *state) { state->x[0] = CHACHA_CONSTANT_EXPA; state->x[1] = CHACHA_CONSTANT_ND_3; state->x[2] = CHACHA_CONSTANT_2_BY; state->x[3] = CHACHA_CONSTANT_TE_K; } static inline void chacha_init(struct chacha_state *state, const u32 key[CHACHA_KEY_WORDS], const u8 iv[CHACHA_IV_SIZE]) { chacha_init_consts(state); state->x[4] = key[0]; state->x[5] = key[1]; state->x[6] = key[2]; state->x[7] = key[3]; state->x[8] = key[4]; state->x[9] = key[5]; state->x[10] = key[6]; state->x[11] = key[7]; state->x[12] = get_unaligned_le32(iv + 0); state->x[13] = get_unaligned_le32(iv + 4); state->x[14] = get_unaligned_le32(iv + 8); state->x[15] = get_unaligned_le32(iv + 12); } void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds); void chacha_crypt_generic(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds); static inline void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) chacha_crypt_arch(state, dst, src, bytes, nrounds); else chacha_crypt_generic(state, dst, src, bytes, nrounds); } static inline void chacha20_crypt(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes) { chacha_crypt(state, dst, src, bytes, 20); } static inline void chacha_zeroize_state(struct chacha_state *state) { memzero_explicit(state, sizeof(*state)); } #if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA) bool chacha_is_arch_optimized(void); #else static inline bool chacha_is_arch_optimized(void) { return false; } #endif #endif /* _CRYPTO_CHACHA_H */
287 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_ENTRYKVM_H #define __LINUX_ENTRYKVM_H #include <linux/static_call_types.h> #include <linux/resume_user_mode.h> #include <linux/syscalls.h> #include <linux/seccomp.h> #include <linux/sched.h> #include <linux/tick.h> /* Transfer to guest mode work */ #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK #ifndef ARCH_XFER_TO_GUEST_MODE_WORK # define ARCH_XFER_TO_GUEST_MODE_WORK (0) #endif #define XFER_TO_GUEST_MODE_WORK \ (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \ _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \ ARCH_XFER_TO_GUEST_MODE_WORK) struct kvm_vcpu; /** * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest * mode work handling function. * @vcpu: Pointer to current's VCPU data * @ti_work: Cached TIF flags gathered in xfer_to_guest_mode_handle_work() * * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be * replaced by architecture specific code. */ static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, unsigned long ti_work); #ifndef arch_xfer_to_guest_mode_work static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, unsigned long ti_work) { return 0; } #endif /** * xfer_to_guest_mode_handle_work - Check and handle pending work which needs * to be handled before going to guest mode * @vcpu: Pointer to current's VCPU data * * Returns: 0 or an error code */ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu); /** * xfer_to_guest_mode_prepare - Perform last minute preparation work that * need to be handled while IRQs are disabled * upon entering to guest. * * Has to be invoked with interrupts disabled before the last call * to xfer_to_guest_mode_work_pending(). */ static inline void xfer_to_guest_mode_prepare(void) { lockdep_assert_irqs_disabled(); tick_nohz_user_enter_prepare(); } /** * __xfer_to_guest_mode_work_pending - Check if work is pending * * Returns: True if work pending, False otherwise. * * Bare variant of xfer_to_guest_mode_work_pending(). Can be called from * interrupt enabled code for racy quick checks with care. */ static inline bool __xfer_to_guest_mode_work_pending(void) { unsigned long ti_work = read_thread_flags(); return !!(ti_work & XFER_TO_GUEST_MODE_WORK); } /** * xfer_to_guest_mode_work_pending - Check if work is pending which needs to be * handled before returning to guest mode * * Returns: True if work pending, False otherwise. * * Has to be invoked with interrupts disabled before the transition to * guest mode. */ static inline bool xfer_to_guest_mode_work_pending(void) { lockdep_assert_irqs_disabled(); return __xfer_to_guest_mode_work_pending(); } #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ #endif
6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 // SPDX-License-Identifier: GPL-1.0+ /* * originally based on the dummy device. * * Copyright 1999, Thomas Davis, tadavis@lbl.gov. * Based on dummy.c, and eql.c devices. * * bonding.c: an Ethernet Bonding driver * * This is useful to talk to a Cisco EtherChannel compatible equipment: * Cisco 5500 * Sun Trunking (Solaris) * Alteon AceDirector Trunks * Linux Bonding * and probably many L2 switches ... * * How it works: * ifconfig bond0 ipaddress netmask up * will setup a network device, with an ip address. No mac address * will be assigned at this time. The hw mac address will come from * the first slave bonded to the channel. All slaves will then use * this hw mac address. * * ifconfig bond0 down * will release all slaves, marking them as down. * * ifenslave bond0 eth0 * will attach eth0 to bond0 as a slave. eth0 hw mac address will either * a: be used as initial mac address * b: if a hw mac address already is there, eth0's hw mac address * will then be set from bond0. * */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/filter.h> #include <linux/interrupt.h> #include <linux/ptrace.h> #include <linux/ioport.h> #include <linux/in.h> #include <net/ip.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/init.h> #include <linux/timer.h> #include <linux/socket.h> #include <linux/ctype.h> #include <linux/inet.h> #include <linux/bitops.h> #include <linux/io.h> #include <asm/dma.h> #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/rtnetlink.h> #include <linux/smp.h> #include <linux/if_ether.h> #include <net/arp.h> #include <linux/mii.h> #include <linux/ethtool.h> #include <linux/if_vlan.h> #include <linux/if_bonding.h> #include <linux/phy.h> #include <linux/jiffies.h> #include <linux/preempt.h> #include <net/route.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/pkt_sched.h> #include <linux/rculist.h> #include <net/flow_dissector.h> #include <net/xfrm.h> #include <net/bonding.h> #include <net/bond_3ad.h> #include <net/bond_alb.h> #if IS_ENABLED(CONFIG_TLS_DEVICE) #include <net/tls.h> #endif #include <net/ip6_route.h> #include <net/netdev_lock.h> #include <net/xdp.h> #include "bonding_priv.h" /*---------------------------- Module parameters ----------------------------*/ /* monitor all links that often (in milliseconds). <=0 disables monitoring */ static int max_bonds = BOND_DEFAULT_MAX_BONDS; static int tx_queues = BOND_DEFAULT_TX_QUEUES; static int num_peer_notif = 1; static int miimon; static int updelay; static int downdelay; static int use_carrier = 1; static char *mode; static char *primary; static char *primary_reselect; static char *lacp_rate; static int min_links; static char *ad_select; static char *xmit_hash_policy; static int arp_interval; static char *arp_ip_target[BOND_MAX_ARP_TARGETS]; static char *arp_validate; static char *arp_all_targets; static char *fail_over_mac; static int all_slaves_active; static struct bond_params bonding_defaults; static int resend_igmp = BOND_DEFAULT_RESEND_IGMP; static int packets_per_slave = 1; static int lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL; module_param(max_bonds, int, 0); MODULE_PARM_DESC(max_bonds, "Max number of bonded devices"); module_param(tx_queues, int, 0); MODULE_PARM_DESC(tx_queues, "Max number of transmit queues (default = 16)"); module_param_named(num_grat_arp, num_peer_notif, int, 0644); MODULE_PARM_DESC(num_grat_arp, "Number of peer notifications to send on " "failover event (alias of num_unsol_na)"); module_param_named(num_unsol_na, num_peer_notif, int, 0644); MODULE_PARM_DESC(num_unsol_na, "Number of peer notifications to send on " "failover event (alias of num_grat_arp)"); module_param(miimon, int, 0); MODULE_PARM_DESC(miimon, "Link check interval in milliseconds"); module_param(updelay, int, 0); MODULE_PARM_DESC(updelay, "Delay before considering link up, in milliseconds"); module_param(downdelay, int, 0); MODULE_PARM_DESC(downdelay, "Delay before considering link down, " "in milliseconds"); module_param(use_carrier, int, 0); MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; " "0 for off, 1 for on (default)"); module_param(mode, charp, 0); MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, " "1 for active-backup, 2 for balance-xor, " "3 for broadcast, 4 for 802.3ad, 5 for balance-tlb, " "6 for balance-alb"); module_param(primary, charp, 0); MODULE_PARM_DESC(primary, "Primary network device to use"); module_param(primary_reselect, charp, 0); MODULE_PARM_DESC(primary_reselect, "Reselect primary slave " "once it comes up; " "0 for always (default), " "1 for only if speed of primary is " "better, " "2 for only on active slave " "failure"); module_param(lacp_rate, charp, 0); MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner; " "0 for slow, 1 for fast"); module_param(ad_select, charp, 0); MODULE_PARM_DESC(ad_select, "802.3ad aggregation selection logic; " "0 for stable (default), 1 for bandwidth, " "2 for count"); module_param(min_links, int, 0); MODULE_PARM_DESC(min_links, "Minimum number of available links before turning on carrier"); module_param(xmit_hash_policy, charp, 0); MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; " "0 for layer 2 (default), 1 for layer 3+4, " "2 for layer 2+3, 3 for encap layer 2+3, " "4 for encap layer 3+4, 5 for vlan+srcmac"); module_param(arp_interval, int, 0); MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds"); module_param_array(arp_ip_target, charp, NULL, 0); MODULE_PARM_DESC(arp_ip_target, "arp targets in n.n.n.n form"); module_param(arp_validate, charp, 0); MODULE_PARM_DESC(arp_validate, "validate src/dst of ARP probes; " "0 for none (default), 1 for active, " "2 for backup, 3 for all"); module_param(arp_all_targets, charp, 0); MODULE_PARM_DESC(arp_all_targets, "fail on any/all arp targets timeout; 0 for any (default), 1 for all"); module_param(fail_over_mac, charp, 0); MODULE_PARM_DESC(fail_over_mac, "For active-backup, do not set all slaves to " "the same MAC; 0 for none (default), " "1 for active, 2 for follow"); module_param(all_slaves_active, int, 0); MODULE_PARM_DESC(all_slaves_active, "Keep all frames received on an interface " "by setting active flag for all slaves; " "0 for never (default), 1 for always."); module_param(resend_igmp, int, 0); MODULE_PARM_DESC(resend_igmp, "Number of IGMP membership reports to send on " "link failure"); module_param(packets_per_slave, int, 0); MODULE_PARM_DESC(packets_per_slave, "Packets to send per slave in balance-rr " "mode; 0 for a random slave, 1 packet per " "slave (default), >1 packets per slave."); module_param(lp_interval, uint, 0); MODULE_PARM_DESC(lp_interval, "The number of seconds between instances where " "the bonding driver sends learning packets to " "each slaves peer switch. The default is 1."); /*----------------------------- Global variables ----------------------------*/ #ifdef CONFIG_NET_POLL_CONTROLLER atomic_t netpoll_block_tx = ATOMIC_INIT(0); #endif unsigned int bond_net_id __read_mostly; static const struct flow_dissector_key flow_keys_bonding_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_TIPC, .offset = offsetof(struct flow_keys, addrs.tipckey), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, { .key_id = FLOW_DISSECTOR_KEY_ICMP, .offset = offsetof(struct flow_keys, icmp), }, { .key_id = FLOW_DISSECTOR_KEY_VLAN, .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, .offset = offsetof(struct flow_keys, tags), }, { .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct flow_keys, keyid), }, }; static struct flow_dissector flow_keys_bonding __read_mostly; /*-------------------------- Forward declarations ---------------------------*/ static int bond_init(struct net_device *bond_dev); static void bond_uninit(struct net_device *bond_dev); static void bond_get_stats(struct net_device *bond_dev, struct rtnl_link_stats64 *stats); static void bond_slave_arr_handler(struct work_struct *work); static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act, int mod); static void bond_netdev_notify_work(struct work_struct *work); /*---------------------------- General routines -----------------------------*/ const char *bond_mode_name(int mode) { static const char *names[] = { [BOND_MODE_ROUNDROBIN] = "load balancing (round-robin)", [BOND_MODE_ACTIVEBACKUP] = "fault-tolerance (active-backup)", [BOND_MODE_XOR] = "load balancing (xor)", [BOND_MODE_BROADCAST] = "fault-tolerance (broadcast)", [BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation", [BOND_MODE_TLB] = "transmit load balancing", [BOND_MODE_ALB] = "adaptive load balancing", }; if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_ALB) return "unknown"; return names[mode]; } /** * bond_dev_queue_xmit - Prepare skb for xmit. * * @bond: bond device that got this skb for tx. * @skb: hw accel VLAN tagged skb to transmit * @slave_dev: slave that is supposed to xmit this skbuff */ netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev) { skb->dev = slave_dev; BUILD_BUG_ON(sizeof(skb->queue_mapping) != sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); if (unlikely(netpoll_tx_running(bond->dev))) return bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb); return dev_queue_xmit(skb); } static bool bond_sk_check(struct bonding *bond) { switch (BOND_MODE(bond)) { case BOND_MODE_8023AD: case BOND_MODE_XOR: if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34) return true; fallthrough; default: return false; } } bool bond_xdp_check(struct bonding *bond, int mode) { switch (mode) { case BOND_MODE_ROUNDROBIN: case BOND_MODE_ACTIVEBACKUP: return true; case BOND_MODE_8023AD: case BOND_MODE_XOR: /* vlan+srcmac is not supported with XDP as in most cases the 802.1q * payload is not in the packet due to hardware offload. */ if (bond->params.xmit_policy != BOND_XMIT_POLICY_VLAN_SRCMAC) return true; fallthrough; default: return false; } } /*---------------------------------- VLAN -----------------------------------*/ /* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid, * We don't protect the slave list iteration with a lock because: * a. This operation is performed in IOCTL context, * b. The operation is protected by the RTNL semaphore in the 8021q code, * c. Holding a lock with BH disabled while directly calling a base driver * entry point is generally a BAD idea. * * The design of synchronization/protection for this operation in the 8021q * module is good for one or more VLAN devices over a single physical device * and cannot be extended for a teaming solution like bonding, so there is a * potential race condition here where a net device from the vlan group might * be referenced (either by a base driver or the 8021q code) while it is being * removed from the system. However, it turns out we're not making matters * worse, and if it works for regular VLAN usage it will work here too. */ /** * bond_vlan_rx_add_vid - Propagates adding an id to slaves * @bond_dev: bonding net device that got called * @proto: network protocol ID * @vid: vlan id being added */ static int bond_vlan_rx_add_vid(struct net_device *bond_dev, __be16 proto, u16 vid) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *rollback_slave; struct list_head *iter; int res; bond_for_each_slave(bond, slave, iter) { res = vlan_vid_add(slave->dev, proto, vid); if (res) goto unwind; } return 0; unwind: /* unwind to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { if (rollback_slave == slave) break; vlan_vid_del(rollback_slave->dev, proto, vid); } return res; } /** * bond_vlan_rx_kill_vid - Propagates deleting an id to slaves * @bond_dev: bonding net device that got called * @proto: network protocol ID * @vid: vlan id being removed */ static int bond_vlan_rx_kill_vid(struct net_device *bond_dev, __be16 proto, u16 vid) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; bond_for_each_slave(bond, slave, iter) vlan_vid_del(slave->dev, proto, vid); if (bond_is_lb(bond)) bond_alb_clear_vlan(bond, vid); return 0; } /*---------------------------------- XFRM -----------------------------------*/ #ifdef CONFIG_XFRM_OFFLOAD /** * bond_ipsec_dev - Get active device for IPsec offload * @xs: pointer to transformer state struct * * Context: caller must hold rcu_read_lock. * * Return: the device for ipsec offload, or NULL if not exist. **/ static struct net_device *bond_ipsec_dev(struct xfrm_state *xs) { struct net_device *bond_dev = xs->xso.dev; struct bonding *bond; struct slave *slave; bond = netdev_priv(bond_dev); if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) return NULL; slave = rcu_dereference(bond->curr_active_slave); if (!slave) return NULL; if (!xs->xso.real_dev) return NULL; if (xs->xso.real_dev != slave->dev) pr_warn_ratelimited("%s: (slave %s): not same with IPsec offload real dev %s\n", bond_dev->name, slave->dev->name, xs->xso.real_dev->name); return slave->dev; } /** * bond_ipsec_add_sa - program device with a security association * @bond_dev: pointer to the bond net device * @xs: pointer to transformer state struct * @extack: extack point to fill failure reason **/ static int bond_ipsec_add_sa(struct net_device *bond_dev, struct xfrm_state *xs, struct netlink_ext_ack *extack) { struct net_device *real_dev; netdevice_tracker tracker; struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave; int err; if (!bond_dev) return -EINVAL; rcu_read_lock(); bond = netdev_priv(bond_dev); slave = rcu_dereference(bond->curr_active_slave); real_dev = slave ? slave->dev : NULL; netdev_hold(real_dev, &tracker, GFP_ATOMIC); rcu_read_unlock(); if (!real_dev) { err = -ENODEV; goto out; } if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_add || netif_is_bond_master(real_dev)) { NL_SET_ERR_MSG_MOD(extack, "Slave does not support ipsec offload"); err = -EINVAL; goto out; } ipsec = kmalloc(sizeof(*ipsec), GFP_KERNEL); if (!ipsec) { err = -ENOMEM; goto out; } err = real_dev->xfrmdev_ops->xdo_dev_state_add(real_dev, xs, extack); if (!err) { xs->xso.real_dev = real_dev; ipsec->xs = xs; INIT_LIST_HEAD(&ipsec->list); mutex_lock(&bond->ipsec_lock); list_add(&ipsec->list, &bond->ipsec_list); mutex_unlock(&bond->ipsec_lock); } else { kfree(ipsec); } out: netdev_put(real_dev, &tracker); return err; } static void bond_ipsec_add_sa_all(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct net_device *real_dev; struct bond_ipsec *ipsec; struct slave *slave; slave = rtnl_dereference(bond->curr_active_slave); real_dev = slave ? slave->dev : NULL; if (!real_dev) return; mutex_lock(&bond->ipsec_lock); if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_add || netif_is_bond_master(real_dev)) { if (!list_empty(&bond->ipsec_list)) slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_add\n", __func__); goto out; } list_for_each_entry(ipsec, &bond->ipsec_list, list) { /* If new state is added before ipsec_lock acquired */ if (ipsec->xs->xso.real_dev == real_dev) continue; if (real_dev->xfrmdev_ops->xdo_dev_state_add(real_dev, ipsec->xs, NULL)) { slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); continue; } spin_lock_bh(&ipsec->xs->lock); /* xs might have been killed by the user during the migration * to the new dev, but bond_ipsec_del_sa() should have done * nothing, as xso.real_dev is NULL. * Delete it from the device we just added it to. The pending * bond_ipsec_free_sa() call will do the rest of the cleanup. */ if (ipsec->xs->km.state == XFRM_STATE_DEAD && real_dev->xfrmdev_ops->xdo_dev_state_delete) real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs); ipsec->xs->xso.real_dev = real_dev; spin_unlock_bh(&ipsec->xs->lock); } out: mutex_unlock(&bond->ipsec_lock); } /** * bond_ipsec_del_sa - clear out this specific SA * @bond_dev: pointer to the bond net device * @xs: pointer to transformer state struct **/ static void bond_ipsec_del_sa(struct net_device *bond_dev, struct xfrm_state *xs) { struct net_device *real_dev; if (!bond_dev || !xs->xso.real_dev) return; real_dev = xs->xso.real_dev; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_delete || netif_is_bond_master(real_dev)) { slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__); return; } real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, xs); } static void bond_ipsec_del_sa_all(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct net_device *real_dev; struct bond_ipsec *ipsec; struct slave *slave; slave = rtnl_dereference(bond->curr_active_slave); real_dev = slave ? slave->dev : NULL; if (!real_dev) return; mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { if (!ipsec->xs->xso.real_dev) continue; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_delete || netif_is_bond_master(real_dev)) { slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__); continue; } spin_lock_bh(&ipsec->xs->lock); ipsec->xs->xso.real_dev = NULL; /* Don't double delete states killed by the user. */ if (ipsec->xs->km.state != XFRM_STATE_DEAD) real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs); spin_unlock_bh(&ipsec->xs->lock); if (real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, ipsec->xs); } mutex_unlock(&bond->ipsec_lock); } static void bond_ipsec_free_sa(struct net_device *bond_dev, struct xfrm_state *xs) { struct net_device *real_dev; struct bond_ipsec *ipsec; struct bonding *bond; if (!bond_dev) return; bond = netdev_priv(bond_dev); mutex_lock(&bond->ipsec_lock); if (!xs->xso.real_dev) goto out; real_dev = xs->xso.real_dev; xs->xso.real_dev = NULL; if (real_dev->xfrmdev_ops && real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, xs); out: list_for_each_entry(ipsec, &bond->ipsec_list, list) { if (ipsec->xs == xs) { list_del(&ipsec->list); kfree(ipsec); break; } } mutex_unlock(&bond->ipsec_lock); } /** * bond_ipsec_offload_ok - can this packet use the xfrm hw offload * @skb: current data packet * @xs: pointer to transformer state struct **/ static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs) { struct net_device *real_dev; rcu_read_lock(); real_dev = bond_ipsec_dev(xs); if (!real_dev || netif_is_bond_master(real_dev)) { rcu_read_unlock(); return false; } rcu_read_unlock(); return true; } /** * bond_advance_esn_state - ESN support for IPSec HW offload * @xs: pointer to transformer state struct **/ static void bond_advance_esn_state(struct xfrm_state *xs) { struct net_device *real_dev; rcu_read_lock(); real_dev = bond_ipsec_dev(xs); if (!real_dev) goto out; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_advance_esn) { pr_warn_ratelimited("%s: %s doesn't support xdo_dev_state_advance_esn\n", __func__, real_dev->name); goto out; } real_dev->xfrmdev_ops->xdo_dev_state_advance_esn(xs); out: rcu_read_unlock(); } /** * bond_xfrm_update_stats - Update xfrm state * @xs: pointer to transformer state struct **/ static void bond_xfrm_update_stats(struct xfrm_state *xs) { struct net_device *real_dev; rcu_read_lock(); real_dev = bond_ipsec_dev(xs); if (!real_dev) goto out; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_update_stats) { pr_warn_ratelimited("%s: %s doesn't support xdo_dev_state_update_stats\n", __func__, real_dev->name); goto out; } real_dev->xfrmdev_ops->xdo_dev_state_update_stats(xs); out: rcu_read_unlock(); } static const struct xfrmdev_ops bond_xfrmdev_ops = { .xdo_dev_state_add = bond_ipsec_add_sa, .xdo_dev_state_delete = bond_ipsec_del_sa, .xdo_dev_state_free = bond_ipsec_free_sa, .xdo_dev_offload_ok = bond_ipsec_offload_ok, .xdo_dev_state_advance_esn = bond_advance_esn_state, .xdo_dev_state_update_stats = bond_xfrm_update_stats, }; #endif /* CONFIG_XFRM_OFFLOAD */ /*------------------------------- Link status -------------------------------*/ /* Set the carrier state for the master according to the state of its * slaves. If any slaves are up, the master is up. In 802.3ad mode, * do special 802.3ad magic. * * Returns zero if carrier state does not change, nonzero if it does. */ int bond_set_carrier(struct bonding *bond) { struct list_head *iter; struct slave *slave; if (!bond_has_slaves(bond)) goto down; if (BOND_MODE(bond) == BOND_MODE_8023AD) return bond_3ad_set_carrier(bond); bond_for_each_slave(bond, slave, iter) { if (slave->link == BOND_LINK_UP) { if (!netif_carrier_ok(bond->dev)) { netif_carrier_on(bond->dev); return 1; } return 0; } } down: if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); return 1; } return 0; } /* Get link speed and duplex from the slave's base driver * using ethtool. If for some reason the call fails or the * values are invalid, set speed and duplex to -1, * and return. Return 1 if speed or duplex settings are * UNKNOWN; 0 otherwise. */ static int bond_update_speed_duplex(struct slave *slave) { struct net_device *slave_dev = slave->dev; struct ethtool_link_ksettings ecmd; int res; slave->speed = SPEED_UNKNOWN; slave->duplex = DUPLEX_UNKNOWN; res = __ethtool_get_link_ksettings(slave_dev, &ecmd); if (res < 0) return 1; if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) return 1; switch (ecmd.base.duplex) { case DUPLEX_FULL: case DUPLEX_HALF: break; default: return 1; } slave->speed = ecmd.base.speed; slave->duplex = ecmd.base.duplex; return 0; } const char *bond_slave_link_status(s8 link) { switch (link) { case BOND_LINK_UP: return "up"; case BOND_LINK_FAIL: return "going down"; case BOND_LINK_DOWN: return "down"; case BOND_LINK_BACK: return "going back"; default: return "unknown"; } } /* if <dev> supports MII link status reporting, check its link status. * * We either do MII/ETHTOOL ioctls, or check netif_carrier_ok(), * depending upon the setting of the use_carrier parameter. * * Return either BMSR_LSTATUS, meaning that the link is up (or we * can't tell and just pretend it is), or 0, meaning that the link is * down. * * If reporting is non-zero, instead of faking link up, return -1 if * both ETHTOOL and MII ioctls fail (meaning the device does not * support them). If use_carrier is set, return whatever it says. * It'd be nice if there was a good way to tell if a driver supports * netif_carrier, but there really isn't. */ static int bond_check_dev_link(struct bonding *bond, struct net_device *slave_dev, int reporting) { const struct net_device_ops *slave_ops = slave_dev->netdev_ops; struct mii_ioctl_data *mii; struct ifreq ifr; int ret; if (!reporting && !netif_running(slave_dev)) return 0; if (bond->params.use_carrier) return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0; /* Try to get link status using Ethtool first. */ if (slave_dev->ethtool_ops->get_link) { netdev_lock_ops(slave_dev); ret = slave_dev->ethtool_ops->get_link(slave_dev); netdev_unlock_ops(slave_dev); return ret ? BMSR_LSTATUS : 0; } /* Ethtool can't be used, fallback to MII ioctls. */ if (slave_ops->ndo_eth_ioctl) { /* TODO: set pointer to correct ioctl on a per team member * bases to make this more efficient. that is, once * we determine the correct ioctl, we will always * call it and not the others for that team * member. */ /* We cannot assume that SIOCGMIIPHY will also read a * register; not all network drivers (e.g., e100) * support that. */ /* Yes, the mii is overlaid on the ifreq.ifr_ifru */ strscpy_pad(ifr.ifr_name, slave_dev->name, IFNAMSIZ); mii = if_mii(&ifr); if (dev_eth_ioctl(slave_dev, &ifr, SIOCGMIIPHY) == 0) { mii->reg_num = MII_BMSR; if (dev_eth_ioctl(slave_dev, &ifr, SIOCGMIIREG) == 0) return mii->val_out & BMSR_LSTATUS; } } /* If reporting, report that either there's no ndo_eth_ioctl, * or both SIOCGMIIREG and get_link failed (meaning that we * cannot report link status). If not reporting, pretend * we're ok. */ return reporting ? -1 : BMSR_LSTATUS; } /*----------------------------- Multicast list ------------------------------*/ /* Push the promiscuity flag down to appropriate slaves */ static int bond_set_promiscuity(struct bonding *bond, int inc) { struct list_head *iter; int err = 0; if (bond_uses_primary(bond)) { struct slave *curr_active = rtnl_dereference(bond->curr_active_slave); if (curr_active) err = dev_set_promiscuity(curr_active->dev, inc); } else { struct slave *slave; bond_for_each_slave(bond, slave, iter) { err = dev_set_promiscuity(slave->dev, inc); if (err) return err; } } return err; } /* Push the allmulti flag down to all slaves */ static int bond_set_allmulti(struct bonding *bond, int inc) { struct list_head *iter; int err = 0; if (bond_uses_primary(bond)) { struct slave *curr_active = rtnl_dereference(bond->curr_active_slave); if (curr_active) err = dev_set_allmulti(curr_active->dev, inc); } else { struct slave *slave; bond_for_each_slave(bond, slave, iter) { err = dev_set_allmulti(slave->dev, inc); if (err) return err; } } return err; } /* Retrieve the list of registered multicast addresses for the bonding * device and retransmit an IGMP JOIN request to the current active * slave. */ static void bond_resend_igmp_join_requests_delayed(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, mcast_work.work); if (!rtnl_trylock()) { queue_delayed_work(bond->wq, &bond->mcast_work, 1); return; } call_netdevice_notifiers(NETDEV_RESEND_IGMP, bond->dev); if (bond->igmp_retrans > 1) { bond->igmp_retrans--; queue_delayed_work(bond->wq, &bond->mcast_work, HZ/5); } rtnl_unlock(); } /* Flush bond's hardware addresses from slave */ static void bond_hw_addr_flush(struct net_device *bond_dev, struct net_device *slave_dev) { struct bonding *bond = netdev_priv(bond_dev); dev_uc_unsync(slave_dev, bond_dev); dev_mc_unsync(slave_dev, bond_dev); if (BOND_MODE(bond) == BOND_MODE_8023AD) dev_mc_del(slave_dev, lacpdu_mcast_addr); } /*--------------------------- Active slave change ---------------------------*/ /* Update the hardware address list and promisc/allmulti for the new and * old active slaves (if any). Modes that are not using primary keep all * slaves up date at all times; only the modes that use primary need to call * this function to swap these settings during a failover. */ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active, struct slave *old_active) { if (old_active) { if (bond->dev->flags & IFF_PROMISC) dev_set_promiscuity(old_active->dev, -1); if (bond->dev->flags & IFF_ALLMULTI) dev_set_allmulti(old_active->dev, -1); if (bond->dev->flags & IFF_UP) bond_hw_addr_flush(bond->dev, old_active->dev); bond_slave_ns_maddrs_add(bond, old_active); } if (new_active) { /* FIXME: Signal errors upstream. */ if (bond->dev->flags & IFF_PROMISC) dev_set_promiscuity(new_active->dev, 1); if (bond->dev->flags & IFF_ALLMULTI) dev_set_allmulti(new_active->dev, 1); if (bond->dev->flags & IFF_UP) { netif_addr_lock_bh(bond->dev); dev_uc_sync(new_active->dev, bond->dev); dev_mc_sync(new_active->dev, bond->dev); netif_addr_unlock_bh(bond->dev); } bond_slave_ns_maddrs_del(bond, new_active); } } /** * bond_set_dev_addr - clone slave's address to bond * @bond_dev: bond net device * @slave_dev: slave net device * * Should be called with RTNL held. */ static int bond_set_dev_addr(struct net_device *bond_dev, struct net_device *slave_dev) { int err; slave_dbg(bond_dev, slave_dev, "bond_dev=%p slave_dev=%p slave_dev->addr_len=%d\n", bond_dev, slave_dev, slave_dev->addr_len); err = dev_pre_changeaddr_notify(bond_dev, slave_dev->dev_addr, NULL); if (err) return err; __dev_addr_set(bond_dev, slave_dev->dev_addr, slave_dev->addr_len); bond_dev->addr_assign_type = NET_ADDR_STOLEN; call_netdevice_notifiers(NETDEV_CHANGEADDR, bond_dev); return 0; } static struct slave *bond_get_old_active(struct bonding *bond, struct slave *new_active) { struct slave *slave; struct list_head *iter; bond_for_each_slave(bond, slave, iter) { if (slave == new_active) continue; if (ether_addr_equal(bond->dev->dev_addr, slave->dev->dev_addr)) return slave; } return NULL; } /* bond_do_fail_over_mac * * Perform special MAC address swapping for fail_over_mac settings * * Called with RTNL */ static void bond_do_fail_over_mac(struct bonding *bond, struct slave *new_active, struct slave *old_active) { u8 tmp_mac[MAX_ADDR_LEN]; struct sockaddr_storage ss; int rv; switch (bond->params.fail_over_mac) { case BOND_FOM_ACTIVE: if (new_active) { rv = bond_set_dev_addr(bond->dev, new_active->dev); if (rv) slave_err(bond->dev, new_active->dev, "Error %d setting bond MAC from slave\n", -rv); } break; case BOND_FOM_FOLLOW: /* if new_active && old_active, swap them * if just old_active, do nothing (going to no active slave) * if just new_active, set new_active to bond's MAC */ if (!new_active) return; if (!old_active) old_active = bond_get_old_active(bond, new_active); if (old_active) { bond_hw_addr_copy(tmp_mac, new_active->dev->dev_addr, new_active->dev->addr_len); bond_hw_addr_copy(ss.__data, old_active->dev->dev_addr, old_active->dev->addr_len); ss.ss_family = new_active->dev->type; } else { bond_hw_addr_copy(ss.__data, bond->dev->dev_addr, bond->dev->addr_len); ss.ss_family = bond->dev->type; } rv = dev_set_mac_address(new_active->dev, &ss, NULL); if (rv) { slave_err(bond->dev, new_active->dev, "Error %d setting MAC of new active slave\n", -rv); goto out; } if (!old_active) goto out; bond_hw_addr_copy(ss.__data, tmp_mac, new_active->dev->addr_len); ss.ss_family = old_active->dev->type; rv = dev_set_mac_address(old_active->dev, &ss, NULL); if (rv) slave_err(bond->dev, old_active->dev, "Error %d setting MAC of old active slave\n", -rv); out: break; default: netdev_err(bond->dev, "bond_do_fail_over_mac impossible: bad policy %d\n", bond->params.fail_over_mac); break; } } /** * bond_choose_primary_or_current - select the primary or high priority slave * @bond: our bonding struct * * - Check if there is a primary link. If the primary link was set and is up, * go on and do link reselection. * * - If primary link is not set or down, find the highest priority link. * If the highest priority link is not current slave, set it as primary * link and do link reselection. */ static struct slave *bond_choose_primary_or_current(struct bonding *bond) { struct slave *prim = rtnl_dereference(bond->primary_slave); struct slave *curr = rtnl_dereference(bond->curr_active_slave); struct slave *slave, *hprio = NULL; struct list_head *iter; if (!prim || prim->link != BOND_LINK_UP) { bond_for_each_slave(bond, slave, iter) { if (slave->link == BOND_LINK_UP) { hprio = hprio ?: slave; if (slave->prio > hprio->prio) hprio = slave; } } if (hprio && hprio != curr) { prim = hprio; goto link_reselect; } if (!curr || curr->link != BOND_LINK_UP) return NULL; return curr; } if (bond->force_primary) { bond->force_primary = false; return prim; } link_reselect: if (!curr || curr->link != BOND_LINK_UP) return prim; /* At this point, prim and curr are both up */ switch (bond->params.primary_reselect) { case BOND_PRI_RESELECT_ALWAYS: return prim; case BOND_PRI_RESELECT_BETTER: if (prim->speed < curr->speed) return curr; if (prim->speed == curr->speed && prim->duplex <= curr->duplex) return curr; return prim; case BOND_PRI_RESELECT_FAILURE: return curr; default: netdev_err(bond->dev, "impossible primary_reselect %d\n", bond->params.primary_reselect); return curr; } } /** * bond_find_best_slave - select the best available slave to be the active one * @bond: our bonding struct */ static struct slave *bond_find_best_slave(struct bonding *bond) { struct slave *slave, *bestslave = NULL; struct list_head *iter; int mintime = bond->params.updelay; slave = bond_choose_primary_or_current(bond); if (slave) return slave; bond_for_each_slave(bond, slave, iter) { if (slave->link == BOND_LINK_UP) return slave; if (slave->link == BOND_LINK_BACK && bond_slave_is_up(slave) && slave->delay < mintime) { mintime = slave->delay; bestslave = slave; } } return bestslave; } /* must be called in RCU critical section or with RTNL held */ static bool bond_should_notify_peers(struct bonding *bond) { struct slave *slave = rcu_dereference_rtnl(bond->curr_active_slave); if (!slave || !bond->send_peer_notif || bond->send_peer_notif % max(1, bond->params.peer_notif_delay) != 0 || !netif_carrier_ok(bond->dev) || test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state)) return false; netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n", slave ? slave->dev->name : "NULL"); return true; } /** * bond_change_active_slave - change the active slave into the specified one * @bond: our bonding struct * @new_active: the new slave to make the active one * * Set the new slave to the bond's settings and unset them on the old * curr_active_slave. * Setting include flags, mc-list, promiscuity, allmulti, etc. * * If @new's link state is %BOND_LINK_BACK we'll set it to %BOND_LINK_UP, * because it is apparently the best available slave we have, even though its * updelay hasn't timed out yet. * * Caller must hold RTNL. */ void bond_change_active_slave(struct bonding *bond, struct slave *new_active) { struct slave *old_active; ASSERT_RTNL(); old_active = rtnl_dereference(bond->curr_active_slave); if (old_active == new_active) return; #ifdef CONFIG_XFRM_OFFLOAD bond_ipsec_del_sa_all(bond); #endif /* CONFIG_XFRM_OFFLOAD */ if (new_active) { new_active->last_link_up = jiffies; if (new_active->link == BOND_LINK_BACK) { if (bond_uses_primary(bond)) { slave_info(bond->dev, new_active->dev, "making interface the new active one %d ms earlier\n", (bond->params.updelay - new_active->delay) * bond->params.miimon); } new_active->delay = 0; bond_set_slave_link_state(new_active, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_handle_link_change(new_active, BOND_LINK_UP); if (bond_is_lb(bond)) bond_alb_handle_link_change(bond, new_active, BOND_LINK_UP); } else { if (bond_uses_primary(bond)) slave_info(bond->dev, new_active->dev, "making interface the new active one\n"); } } if (bond_uses_primary(bond)) bond_hw_addr_swap(bond, new_active, old_active); if (bond_is_lb(bond)) { bond_alb_handle_active_change(bond, new_active); if (old_active) bond_set_slave_inactive_flags(old_active, BOND_SLAVE_NOTIFY_NOW); if (new_active) bond_set_slave_active_flags(new_active, BOND_SLAVE_NOTIFY_NOW); } else { rcu_assign_pointer(bond->curr_active_slave, new_active); } if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) { if (old_active) bond_set_slave_inactive_flags(old_active, BOND_SLAVE_NOTIFY_NOW); if (new_active) { bool should_notify_peers = false; bond_set_slave_active_flags(new_active, BOND_SLAVE_NOTIFY_NOW); if (bond->params.fail_over_mac) bond_do_fail_over_mac(bond, new_active, old_active); if (netif_running(bond->dev)) { bond->send_peer_notif = bond->params.num_peer_notif * max(1, bond->params.peer_notif_delay); should_notify_peers = bond_should_notify_peers(bond); } call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, bond->dev); if (should_notify_peers) { bond->send_peer_notif--; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); } } } #ifdef CONFIG_XFRM_OFFLOAD bond_ipsec_add_sa_all(bond); #endif /* CONFIG_XFRM_OFFLOAD */ /* resend IGMP joins since active slave has changed or * all were sent on curr_active_slave. * resend only if bond is brought up with the affected * bonding modes and the retransmission is enabled */ if (netif_running(bond->dev) && (bond->params.resend_igmp > 0) && ((bond_uses_primary(bond) && new_active) || BOND_MODE(bond) == BOND_MODE_ROUNDROBIN)) { bond->igmp_retrans = bond->params.resend_igmp; queue_delayed_work(bond->wq, &bond->mcast_work, 1); } } /** * bond_select_active_slave - select a new active slave, if needed * @bond: our bonding struct * * This functions should be called when one of the following occurs: * - The old curr_active_slave has been released or lost its link. * - The primary_slave has got its link back. * - A slave has got its link back and there's no old curr_active_slave. * * Caller must hold RTNL. */ void bond_select_active_slave(struct bonding *bond) { struct slave *best_slave; int rv; ASSERT_RTNL(); best_slave = bond_find_best_slave(bond); if (best_slave != rtnl_dereference(bond->curr_active_slave)) { bond_change_active_slave(bond, best_slave); rv = bond_set_carrier(bond); if (!rv) return; if (netif_carrier_ok(bond->dev)) netdev_info(bond->dev, "active interface up!\n"); else netdev_info(bond->dev, "now running without any active interface!\n"); } } #ifdef CONFIG_NET_POLL_CONTROLLER static inline int slave_enable_netpoll(struct slave *slave) { struct netpoll *np; int err = 0; np = kzalloc(sizeof(*np), GFP_KERNEL); err = -ENOMEM; if (!np) goto out; err = __netpoll_setup(np, slave->dev); if (err) { kfree(np); goto out; } slave->np = np; out: return err; } static inline void slave_disable_netpoll(struct slave *slave) { struct netpoll *np = slave->np; if (!np) return; slave->np = NULL; __netpoll_free(np); } static void bond_poll_controller(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave = NULL; struct list_head *iter; struct ad_info ad_info; if (BOND_MODE(bond) == BOND_MODE_8023AD) if (bond_3ad_get_active_agg_info(bond, &ad_info)) return; bond_for_each_slave_rcu(bond, slave, iter) { if (!bond_slave_is_up(slave)) continue; if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct aggregator *agg = SLAVE_AD_INFO(slave)->port.aggregator; if (agg && agg->aggregator_identifier != ad_info.aggregator_id) continue; } netpoll_poll_dev(slave->dev); } } static void bond_netpoll_cleanup(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; bond_for_each_slave(bond, slave, iter) if (bond_slave_is_up(slave)) slave_disable_netpoll(slave); } static int bond_netpoll_setup(struct net_device *dev) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; struct slave *slave; int err = 0; bond_for_each_slave(bond, slave, iter) { err = slave_enable_netpoll(slave); if (err) { bond_netpoll_cleanup(dev); break; } } return err; } #else static inline int slave_enable_netpoll(struct slave *slave) { return 0; } static inline void slave_disable_netpoll(struct slave *slave) { } static void bond_netpoll_cleanup(struct net_device *bond_dev) { } #endif /*---------------------------------- IOCTL ----------------------------------*/ static netdev_features_t bond_fix_features(struct net_device *dev, netdev_features_t features) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; netdev_features_t mask; struct slave *slave; mask = features; features = netdev_base_features(features); bond_for_each_slave(bond, slave, iter) { features = netdev_increment_features(features, slave->dev->features, mask); } features = netdev_add_tso_features(features, mask); return features; } #define BOND_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_ENCAP_ALL | \ NETIF_F_HIGHDMA | NETIF_F_LRO) #define BOND_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_PARTIAL) #define BOND_MPLS_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_GSO_SOFTWARE) #define BOND_GSO_PARTIAL_FEATURES (NETIF_F_GSO_ESP) static void bond_compute_features(struct bonding *bond) { netdev_features_t gso_partial_features = BOND_GSO_PARTIAL_FEATURES; unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; netdev_features_t vlan_features = BOND_VLAN_FEATURES; netdev_features_t enc_features = BOND_ENC_FEATURES; #ifdef CONFIG_XFRM_OFFLOAD netdev_features_t xfrm_features = BOND_XFRM_FEATURES; #endif /* CONFIG_XFRM_OFFLOAD */ netdev_features_t mpls_features = BOND_MPLS_FEATURES; struct net_device *bond_dev = bond->dev; struct list_head *iter; struct slave *slave; unsigned short max_hard_header_len = ETH_HLEN; unsigned int tso_max_size = TSO_MAX_SIZE; u16 tso_max_segs = TSO_MAX_SEGS; if (!bond_has_slaves(bond)) goto done; vlan_features = netdev_base_features(vlan_features); mpls_features = netdev_base_features(mpls_features); bond_for_each_slave(bond, slave, iter) { vlan_features = netdev_increment_features(vlan_features, slave->dev->vlan_features, BOND_VLAN_FEATURES); enc_features = netdev_increment_features(enc_features, slave->dev->hw_enc_features, BOND_ENC_FEATURES); #ifdef CONFIG_XFRM_OFFLOAD xfrm_features = netdev_increment_features(xfrm_features, slave->dev->hw_enc_features, BOND_XFRM_FEATURES); #endif /* CONFIG_XFRM_OFFLOAD */ gso_partial_features = netdev_increment_features(gso_partial_features, slave->dev->gso_partial_features, BOND_GSO_PARTIAL_FEATURES); mpls_features = netdev_increment_features(mpls_features, slave->dev->mpls_features, BOND_MPLS_FEATURES); dst_release_flag &= slave->dev->priv_flags; if (slave->dev->hard_header_len > max_hard_header_len) max_hard_header_len = slave->dev->hard_header_len; tso_max_size = min(tso_max_size, slave->dev->tso_max_size); tso_max_segs = min(tso_max_segs, slave->dev->tso_max_segs); } bond_dev->hard_header_len = max_hard_header_len; done: bond_dev->gso_partial_features = gso_partial_features; bond_dev->vlan_features = vlan_features; bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; #ifdef CONFIG_XFRM_OFFLOAD bond_dev->hw_enc_features |= xfrm_features; #endif /* CONFIG_XFRM_OFFLOAD */ bond_dev->mpls_features = mpls_features; netif_set_tso_max_segs(bond_dev, tso_max_segs); netif_set_tso_max_size(bond_dev, tso_max_size); bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE; netdev_change_features(bond_dev); } static void bond_setup_by_slave(struct net_device *bond_dev, struct net_device *slave_dev) { bool was_up = !!(bond_dev->flags & IFF_UP); dev_close(bond_dev); bond_dev->header_ops = slave_dev->header_ops; bond_dev->type = slave_dev->type; bond_dev->hard_header_len = slave_dev->hard_header_len; bond_dev->needed_headroom = slave_dev->needed_headroom; bond_dev->addr_len = slave_dev->addr_len; memcpy(bond_dev->broadcast, slave_dev->broadcast, slave_dev->addr_len); if (slave_dev->flags & IFF_POINTOPOINT) { bond_dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST); bond_dev->flags |= (IFF_POINTOPOINT | IFF_NOARP); } if (was_up) dev_open(bond_dev, NULL); } /* On bonding slaves other than the currently active slave, suppress * duplicates except for alb non-mcast/bcast. */ static bool bond_should_deliver_exact_match(struct sk_buff *skb, struct slave *slave, struct bonding *bond) { if (bond_is_slave_inactive(slave)) { if (BOND_MODE(bond) == BOND_MODE_ALB && skb->pkt_type != PACKET_BROADCAST && skb->pkt_type != PACKET_MULTICAST) return false; return true; } return false; } static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct slave *slave; struct bonding *bond; int (*recv_probe)(const struct sk_buff *, struct bonding *, struct slave *); int ret = RX_HANDLER_ANOTHER; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return RX_HANDLER_CONSUMED; *pskb = skb; slave = bond_slave_get_rcu(skb->dev); bond = slave->bond; recv_probe = READ_ONCE(bond->recv_probe); if (recv_probe) { ret = recv_probe(skb, bond, slave); if (ret == RX_HANDLER_CONSUMED) { consume_skb(skb); return ret; } } /* * For packets determined by bond_should_deliver_exact_match() call to * be suppressed we want to make an exception for link-local packets. * This is necessary for e.g. LLDP daemons to be able to monitor * inactive slave links without being forced to bind to them * explicitly. * * At the same time, packets that are passed to the bonding master * (including link-local ones) can have their originating interface * determined via PACKET_ORIGDEV socket option. */ if (bond_should_deliver_exact_match(skb, slave, bond)) { if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) return RX_HANDLER_PASS; return RX_HANDLER_EXACT; } skb->dev = bond->dev; if (BOND_MODE(bond) == BOND_MODE_ALB && netif_is_bridge_port(bond->dev) && skb->pkt_type == PACKET_HOST) { if (unlikely(skb_cow_head(skb, skb->data - skb_mac_header(skb)))) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } bond_hw_addr_copy(eth_hdr(skb)->h_dest, bond->dev->dev_addr, bond->dev->addr_len); } return ret; } static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond) { switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: return NETDEV_LAG_TX_TYPE_ROUNDROBIN; case BOND_MODE_ACTIVEBACKUP: return NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; case BOND_MODE_BROADCAST: return NETDEV_LAG_TX_TYPE_BROADCAST; case BOND_MODE_XOR: case BOND_MODE_8023AD: return NETDEV_LAG_TX_TYPE_HASH; default: return NETDEV_LAG_TX_TYPE_UNKNOWN; } } static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond, enum netdev_lag_tx_type type) { if (type != NETDEV_LAG_TX_TYPE_HASH) return NETDEV_LAG_HASH_NONE; switch (bond->params.xmit_policy) { case BOND_XMIT_POLICY_LAYER2: return NETDEV_LAG_HASH_L2; case BOND_XMIT_POLICY_LAYER34: return NETDEV_LAG_HASH_L34; case BOND_XMIT_POLICY_LAYER23: return NETDEV_LAG_HASH_L23; case BOND_XMIT_POLICY_ENCAP23: return NETDEV_LAG_HASH_E23; case BOND_XMIT_POLICY_ENCAP34: return NETDEV_LAG_HASH_E34; case BOND_XMIT_POLICY_VLAN_SRCMAC: return NETDEV_LAG_HASH_VLAN_SRCMAC; default: return NETDEV_LAG_HASH_UNKNOWN; } } static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave, struct netlink_ext_ack *extack) { struct netdev_lag_upper_info lag_upper_info; enum netdev_lag_tx_type type; int err; type = bond_lag_tx_type(bond); lag_upper_info.tx_type = type; lag_upper_info.hash_type = bond_lag_hash_type(bond, type); err = netdev_master_upper_dev_link(slave->dev, bond->dev, slave, &lag_upper_info, extack); if (err) return err; slave->dev->flags |= IFF_SLAVE; return 0; } static void bond_upper_dev_unlink(struct bonding *bond, struct slave *slave) { netdev_upper_dev_unlink(slave->dev, bond->dev); slave->dev->flags &= ~IFF_SLAVE; } static void slave_kobj_release(struct kobject *kobj) { struct slave *slave = to_slave(kobj); struct bonding *bond = bond_get_bond_by_slave(slave); cancel_delayed_work_sync(&slave->notify_work); if (BOND_MODE(bond) == BOND_MODE_8023AD) kfree(SLAVE_AD_INFO(slave)); kfree(slave); } static struct kobj_type slave_ktype = { .release = slave_kobj_release, #ifdef CONFIG_SYSFS .sysfs_ops = &slave_sysfs_ops, #endif }; static int bond_kobj_init(struct slave *slave) { int err; err = kobject_init_and_add(&slave->kobj, &slave_ktype, &(slave->dev->dev.kobj), "bonding_slave"); if (err) kobject_put(&slave->kobj); return err; } static struct slave *bond_alloc_slave(struct bonding *bond, struct net_device *slave_dev) { struct slave *slave = NULL; slave = kzalloc(sizeof(*slave), GFP_KERNEL); if (!slave) return NULL; slave->bond = bond; slave->dev = slave_dev; INIT_DELAYED_WORK(&slave->notify_work, bond_netdev_notify_work); if (bond_kobj_init(slave)) return NULL; if (BOND_MODE(bond) == BOND_MODE_8023AD) { SLAVE_AD_INFO(slave) = kzalloc(sizeof(struct ad_slave_info), GFP_KERNEL); if (!SLAVE_AD_INFO(slave)) { kobject_put(&slave->kobj); return NULL; } } return slave; } static void bond_fill_ifbond(struct bonding *bond, struct ifbond *info) { info->bond_mode = BOND_MODE(bond); info->miimon = bond->params.miimon; info->num_slaves = bond->slave_cnt; } static void bond_fill_ifslave(struct slave *slave, struct ifslave *info) { strcpy(info->slave_name, slave->dev->name); info->link = slave->link; info->state = bond_slave_state(slave); info->link_failure_count = slave->link_failure_count; } static void bond_netdev_notify_work(struct work_struct *_work) { struct slave *slave = container_of(_work, struct slave, notify_work.work); if (rtnl_trylock()) { struct netdev_bonding_info binfo; bond_fill_ifslave(slave, &binfo.slave); bond_fill_ifbond(slave->bond, &binfo.master); netdev_bonding_info_change(slave->dev, &binfo); rtnl_unlock(); } else { queue_delayed_work(slave->bond->wq, &slave->notify_work, 1); } } void bond_queue_slave_event(struct slave *slave) { queue_delayed_work(slave->bond->wq, &slave->notify_work, 0); } void bond_lower_state_changed(struct slave *slave) { struct netdev_lag_lower_state_info info; info.link_up = slave->link == BOND_LINK_UP || slave->link == BOND_LINK_FAIL; info.tx_enabled = bond_is_active_slave(slave); netdev_lower_state_changed(slave->dev, &info); } #define BOND_NL_ERR(bond_dev, extack, errmsg) do { \ if (extack) \ NL_SET_ERR_MSG(extack, errmsg); \ else \ netdev_err(bond_dev, "Error: %s\n", errmsg); \ } while (0) #define SLAVE_NL_ERR(bond_dev, slave_dev, extack, errmsg) do { \ if (extack) \ NL_SET_ERR_MSG(extack, errmsg); \ else \ slave_err(bond_dev, slave_dev, "Error: %s\n", errmsg); \ } while (0) /* The bonding driver uses ether_setup() to convert a master bond device * to ARPHRD_ETHER, that resets the target netdevice's flags so we always * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE and IFF_UP * if they were set */ static void bond_ether_setup(struct net_device *bond_dev) { unsigned int flags = bond_dev->flags & (IFF_SLAVE | IFF_UP); ether_setup(bond_dev); bond_dev->flags |= IFF_MASTER | flags; bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; } void bond_xdp_set_features(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); xdp_features_t val = NETDEV_XDP_ACT_MASK; struct list_head *iter; struct slave *slave; ASSERT_RTNL(); if (!bond_xdp_check(bond, BOND_MODE(bond)) || !bond_has_slaves(bond)) { xdp_clear_features_flag(bond_dev); return; } bond_for_each_slave(bond, slave, iter) val &= slave->dev->xdp_features; val &= ~NETDEV_XDP_ACT_XSK_ZEROCOPY; xdp_set_features_flag(bond_dev, val); } /* enslave device <slave> to bond device <master> */ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(bond_dev); const struct net_device_ops *slave_ops = slave_dev->netdev_ops; struct slave *new_slave = NULL, *prev_slave; struct sockaddr_storage ss; int link_reporting; int res = 0, i; if (slave_dev->flags & IFF_MASTER && !netif_is_bond_master(slave_dev)) { BOND_NL_ERR(bond_dev, extack, "Device type (master device) cannot be enslaved"); return -EPERM; } if (!bond->params.use_carrier && slave_dev->ethtool_ops->get_link == NULL && slave_ops->ndo_eth_ioctl == NULL) { slave_warn(bond_dev, slave_dev, "no link monitoring support\n"); } /* already in-use? */ if (netdev_is_rx_handler_busy(slave_dev)) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Device is in use and cannot be enslaved"); return -EBUSY; } if (bond_dev == slave_dev) { BOND_NL_ERR(bond_dev, extack, "Cannot enslave bond to itself."); return -EPERM; } /* vlan challenged mutual exclusion */ /* no need to lock since we're protected by rtnl_lock */ if (slave_dev->features & NETIF_F_VLAN_CHALLENGED) { slave_dbg(bond_dev, slave_dev, "is NETIF_F_VLAN_CHALLENGED\n"); if (vlan_uses_dev(bond_dev)) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Can not enslave VLAN challenged device to VLAN enabled bond"); return -EPERM; } else { slave_warn(bond_dev, slave_dev, "enslaved VLAN challenged slave. Adding VLANs will be blocked as long as it is part of bond.\n"); } } else { slave_dbg(bond_dev, slave_dev, "is !NETIF_F_VLAN_CHALLENGED\n"); } if (slave_dev->features & NETIF_F_HW_ESP) slave_dbg(bond_dev, slave_dev, "is esp-hw-offload capable\n"); /* Old ifenslave binaries are no longer supported. These can * be identified with moderate accuracy by the state of the slave: * the current ifenslave will set the interface down prior to * enslaving it; the old ifenslave will not. */ if (slave_dev->flags & IFF_UP) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Device can not be enslaved while up"); return -EPERM; } /* set bonding device ether type by slave - bonding netdevices are * created with ether_setup, so when the slave type is not ARPHRD_ETHER * there is a need to override some of the type dependent attribs/funcs. * * bond ether type mutual exclusion - don't allow slaves of dissimilar * ether type (eg ARPHRD_ETHER and ARPHRD_INFINIBAND) share the same bond */ if (!bond_has_slaves(bond)) { if (bond_dev->type != slave_dev->type) { slave_dbg(bond_dev, slave_dev, "change device type from %d to %d\n", bond_dev->type, slave_dev->type); res = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, bond_dev); res = notifier_to_errno(res); if (res) { slave_err(bond_dev, slave_dev, "refused to change device type\n"); return -EBUSY; } /* Flush unicast and multicast addresses */ dev_uc_flush(bond_dev); dev_mc_flush(bond_dev); if (slave_dev->type != ARPHRD_ETHER) bond_setup_by_slave(bond_dev, slave_dev); else bond_ether_setup(bond_dev); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, bond_dev); } } else if (bond_dev->type != slave_dev->type) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Device type is different from other slaves"); return -EINVAL; } if (slave_dev->type == ARPHRD_INFINIBAND && BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Only active-backup mode is supported for infiniband slaves"); res = -EOPNOTSUPP; goto err_undo_flags; } if (!slave_ops->ndo_set_mac_address || slave_dev->type == ARPHRD_INFINIBAND) { slave_warn(bond_dev, slave_dev, "The slave device specified does not support setting the MAC address\n"); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && bond->params.fail_over_mac != BOND_FOM_ACTIVE) { if (!bond_has_slaves(bond)) { bond->params.fail_over_mac = BOND_FOM_ACTIVE; slave_warn(bond_dev, slave_dev, "Setting fail_over_mac to active for active-backup mode\n"); } else { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Slave device does not support setting the MAC address, but fail_over_mac is not set to active"); res = -EOPNOTSUPP; goto err_undo_flags; } } } call_netdevice_notifiers(NETDEV_JOIN, slave_dev); /* If this is the first slave, then we need to set the master's hardware * address to be the same as the slave's. */ if (!bond_has_slaves(bond) && bond->dev->addr_assign_type == NET_ADDR_RANDOM) { res = bond_set_dev_addr(bond->dev, slave_dev); if (res) goto err_undo_flags; } new_slave = bond_alloc_slave(bond, slave_dev); if (!new_slave) { res = -ENOMEM; goto err_undo_flags; } /* Set the new_slave's queue_id to be zero. Queue ID mapping * is set via sysfs or module option if desired. */ new_slave->queue_id = 0; /* Save slave's original mtu and then set it to match the bond */ new_slave->original_mtu = slave_dev->mtu; res = dev_set_mtu(slave_dev, bond->dev->mtu); if (res) { slave_err(bond_dev, slave_dev, "Error %d calling dev_set_mtu\n", res); goto err_free; } /* Save slave's original ("permanent") mac address for modes * that need it, and for restoring it upon release, and then * set it to the master's address */ bond_hw_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr, slave_dev->addr_len); if (!bond->params.fail_over_mac || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* Set slave to master's mac address. The application already * set the master's mac address to that of the first slave */ memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len); } else if (bond->params.fail_over_mac == BOND_FOM_FOLLOW && BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && memcmp(slave_dev->dev_addr, bond_dev->dev_addr, bond_dev->addr_len) == 0) { /* Set slave to random address to avoid duplicate mac * address in later fail over. */ eth_random_addr(ss.__data); } else { goto skip_mac_set; } ss.ss_family = slave_dev->type; res = dev_set_mac_address(slave_dev, &ss, extack); if (res) { slave_err(bond_dev, slave_dev, "Error %d calling set_mac_address\n", res); goto err_restore_mtu; } skip_mac_set: /* set no_addrconf flag before open to prevent IPv6 addrconf */ slave_dev->priv_flags |= IFF_NO_ADDRCONF; /* open the slave since the application closed it */ res = dev_open(slave_dev, extack); if (res) { slave_err(bond_dev, slave_dev, "Opening slave failed\n"); goto err_restore_mac; } slave_dev->priv_flags |= IFF_BONDING; /* initialize slave stats */ dev_get_stats(new_slave->dev, &new_slave->slave_stats); if (bond_is_lb(bond)) { /* bond_alb_init_slave() must be called before all other stages since * it might fail and we do not want to have to undo everything */ res = bond_alb_init_slave(bond, new_slave); if (res) goto err_close; } res = vlan_vids_add_by_dev(slave_dev, bond_dev); if (res) { slave_err(bond_dev, slave_dev, "Couldn't add bond vlan ids\n"); goto err_close; } prev_slave = bond_last_slave(bond); new_slave->delay = 0; new_slave->link_failure_count = 0; if (bond_update_speed_duplex(new_slave) && bond_needs_speed_duplex(bond)) new_slave->link = BOND_LINK_DOWN; new_slave->last_rx = jiffies - (msecs_to_jiffies(bond->params.arp_interval) + 1); for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) new_slave->target_last_arp_rx[i] = new_slave->last_rx; new_slave->last_tx = new_slave->last_rx; if (bond->params.miimon && !bond->params.use_carrier) { link_reporting = bond_check_dev_link(bond, slave_dev, 1); if ((link_reporting == -1) && !bond->params.arp_interval) { /* miimon is set but a bonded network driver * does not support ETHTOOL/MII and * arp_interval is not set. Note: if * use_carrier is enabled, we will never go * here (because netif_carrier is always * supported); thus, we don't need to change * the messages for netif_carrier. */ slave_warn(bond_dev, slave_dev, "MII and ETHTOOL support not available for slave, and arp_interval/arp_ip_target module parameters not specified, thus bonding will not detect link failures! see bonding.txt for details\n"); } else if (link_reporting == -1) { /* unable get link status using mii/ethtool */ slave_warn(bond_dev, slave_dev, "can't get link status from slave; the network driver associated with this interface does not support MII or ETHTOOL link status reporting, thus miimon has no effect on this interface\n"); } } /* check for initial state */ new_slave->link = BOND_LINK_NOCHANGE; if (bond->params.miimon) { if (bond_check_dev_link(bond, slave_dev, 0) == BMSR_LSTATUS) { if (bond->params.updelay) { bond_set_slave_link_state(new_slave, BOND_LINK_BACK, BOND_SLAVE_NOTIFY_NOW); new_slave->delay = bond->params.updelay; } else { bond_set_slave_link_state(new_slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); } } else { bond_set_slave_link_state(new_slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_NOW); } } else if (bond->params.arp_interval) { bond_set_slave_link_state(new_slave, (netif_carrier_ok(slave_dev) ? BOND_LINK_UP : BOND_LINK_DOWN), BOND_SLAVE_NOTIFY_NOW); } else { bond_set_slave_link_state(new_slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); } if (new_slave->link != BOND_LINK_DOWN) new_slave->last_link_up = jiffies; slave_dbg(bond_dev, slave_dev, "Initial state of slave is BOND_LINK_%s\n", new_slave->link == BOND_LINK_DOWN ? "DOWN" : (new_slave->link == BOND_LINK_UP ? "UP" : "BACK")); if (bond_uses_primary(bond) && bond->params.primary[0]) { /* if there is a primary slave, remember it */ if (strcmp(bond->params.primary, new_slave->dev->name) == 0) { rcu_assign_pointer(bond->primary_slave, new_slave); bond->force_primary = true; } } switch (BOND_MODE(bond)) { case BOND_MODE_ACTIVEBACKUP: bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW); break; case BOND_MODE_8023AD: /* in 802.3ad mode, the internal mechanism * will activate the slaves in the selected * aggregator */ bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW); /* if this is the first slave */ if (!prev_slave) { SLAVE_AD_INFO(new_slave)->id = 1; /* Initialize AD with the number of times that the AD timer is called in 1 second * can be called only after the mac address of the bond is set */ bond_3ad_initialize(bond); } else { SLAVE_AD_INFO(new_slave)->id = SLAVE_AD_INFO(prev_slave)->id + 1; } bond_3ad_bind_slave(new_slave); break; case BOND_MODE_TLB: case BOND_MODE_ALB: bond_set_active_slave(new_slave); bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW); break; default: slave_dbg(bond_dev, slave_dev, "This slave is always active in trunk mode\n"); /* always active in trunk mode */ bond_set_active_slave(new_slave); /* In trunking mode there is little meaning to curr_active_slave * anyway (it holds no special properties of the bond device), * so we can change it without calling change_active_interface() */ if (!rcu_access_pointer(bond->curr_active_slave) && new_slave->link == BOND_LINK_UP) rcu_assign_pointer(bond->curr_active_slave, new_slave); break; } /* switch(bond_mode) */ #ifdef CONFIG_NET_POLL_CONTROLLER if (bond->dev->npinfo) { if (slave_enable_netpoll(new_slave)) { slave_info(bond_dev, slave_dev, "master_dev is using netpoll, but new slave device does not support netpoll\n"); res = -EBUSY; goto err_detach; } } #endif if (!(bond_dev->features & NETIF_F_LRO)) dev_disable_lro(slave_dev); res = netdev_rx_handler_register(slave_dev, bond_handle_frame, new_slave); if (res) { slave_dbg(bond_dev, slave_dev, "Error %d calling netdev_rx_handler_register\n", res); goto err_detach; } res = bond_master_upper_dev_link(bond, new_slave, extack); if (res) { slave_dbg(bond_dev, slave_dev, "Error %d calling bond_master_upper_dev_link\n", res); goto err_unregister; } bond_lower_state_changed(new_slave); res = bond_sysfs_slave_add(new_slave); if (res) { slave_dbg(bond_dev, slave_dev, "Error %d calling bond_sysfs_slave_add\n", res); goto err_upper_unlink; } /* If the mode uses primary, then the following is handled by * bond_change_active_slave(). */ if (!bond_uses_primary(bond)) { /* set promiscuity level to new slave */ if (bond_dev->flags & IFF_PROMISC) { res = dev_set_promiscuity(slave_dev, 1); if (res) goto err_sysfs_del; } /* set allmulti level to new slave */ if (bond_dev->flags & IFF_ALLMULTI) { res = dev_set_allmulti(slave_dev, 1); if (res) { if (bond_dev->flags & IFF_PROMISC) dev_set_promiscuity(slave_dev, -1); goto err_sysfs_del; } } if (bond_dev->flags & IFF_UP) { netif_addr_lock_bh(bond_dev); dev_mc_sync_multiple(slave_dev, bond_dev); dev_uc_sync_multiple(slave_dev, bond_dev); netif_addr_unlock_bh(bond_dev); if (BOND_MODE(bond) == BOND_MODE_8023AD) dev_mc_add(slave_dev, lacpdu_mcast_addr); } } bond->slave_cnt++; bond_compute_features(bond); bond_set_carrier(bond); /* Needs to be called before bond_select_active_slave(), which will * remove the maddrs if the slave is selected as active slave. */ bond_slave_ns_maddrs_add(bond, new_slave); if (bond_uses_primary(bond)) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); if (!slave_dev->netdev_ops->ndo_bpf || !slave_dev->netdev_ops->ndo_xdp_xmit) { if (bond->xdp_prog) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Slave does not support XDP"); res = -EOPNOTSUPP; goto err_sysfs_del; } } else if (bond->xdp_prog) { struct netdev_bpf xdp = { .command = XDP_SETUP_PROG, .flags = 0, .prog = bond->xdp_prog, .extack = extack, }; if (dev_xdp_prog_count(slave_dev) > 0) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Slave has XDP program loaded, please unload before enslaving"); res = -EOPNOTSUPP; goto err_sysfs_del; } res = dev_xdp_propagate(slave_dev, &xdp); if (res < 0) { /* ndo_bpf() sets extack error message */ slave_dbg(bond_dev, slave_dev, "Error %d calling ndo_bpf\n", res); goto err_sysfs_del; } if (bond->xdp_prog) bpf_prog_inc(bond->xdp_prog); } bond_xdp_set_features(bond_dev); slave_info(bond_dev, slave_dev, "Enslaving as %s interface with %s link\n", bond_is_active_slave(new_slave) ? "an active" : "a backup", new_slave->link != BOND_LINK_DOWN ? "an up" : "a down"); /* enslave is successful */ bond_queue_slave_event(new_slave); return 0; /* Undo stages on error */ err_sysfs_del: bond_sysfs_slave_del(new_slave); err_upper_unlink: bond_upper_dev_unlink(bond, new_slave); err_unregister: netdev_rx_handler_unregister(slave_dev); err_detach: vlan_vids_del_by_dev(slave_dev, bond_dev); if (rcu_access_pointer(bond->primary_slave) == new_slave) RCU_INIT_POINTER(bond->primary_slave, NULL); if (rcu_access_pointer(bond->curr_active_slave) == new_slave) { block_netpoll_tx(); bond_change_active_slave(bond, NULL); bond_select_active_slave(bond); unblock_netpoll_tx(); } /* either primary_slave or curr_active_slave might've changed */ synchronize_rcu(); slave_disable_netpoll(new_slave); err_close: if (!netif_is_bond_master(slave_dev)) slave_dev->priv_flags &= ~IFF_BONDING; dev_close(slave_dev); err_restore_mac: slave_dev->priv_flags &= ~IFF_NO_ADDRCONF; if (!bond->params.fail_over_mac || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* XXX TODO - fom follow mode needs to change master's * MAC if this slave's MAC is in use by the bond, or at * least print a warning. */ bond_hw_addr_copy(ss.__data, new_slave->perm_hwaddr, new_slave->dev->addr_len); ss.ss_family = slave_dev->type; dev_set_mac_address(slave_dev, &ss, NULL); } err_restore_mtu: dev_set_mtu(slave_dev, new_slave->original_mtu); err_free: kobject_put(&new_slave->kobj); err_undo_flags: /* Enslave of first slave has failed and we need to fix master's mac */ if (!bond_has_slaves(bond)) { if (ether_addr_equal_64bits(bond_dev->dev_addr, slave_dev->dev_addr)) eth_hw_addr_random(bond_dev); if (bond_dev->type != ARPHRD_ETHER) { dev_close(bond_dev); bond_ether_setup(bond_dev); } } return res; } /* Try to release the slave device <slave> from the bond device <master> * It is legal to access curr_active_slave without a lock because all the function * is RTNL-locked. If "all" is true it means that the function is being called * while destroying a bond interface and all slaves are being released. * * The rules for slave state should be: * for Active/Backup: * Active stays on all backups go down * for Bonded connections: * The first up interface should be left on and all others downed. */ static int __bond_release_one(struct net_device *bond_dev, struct net_device *slave_dev, bool all, bool unregister) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *oldcurrent; struct sockaddr_storage ss; int old_flags = bond_dev->flags; netdev_features_t old_features = bond_dev->features; /* slave is not a slave or master is not master of this slave */ if (!(slave_dev->flags & IFF_SLAVE) || !netdev_has_upper_dev(slave_dev, bond_dev)) { slave_dbg(bond_dev, slave_dev, "cannot release slave\n"); return -EINVAL; } block_netpoll_tx(); slave = bond_get_slave_by_dev(bond, slave_dev); if (!slave) { /* not a slave of this bond */ slave_info(bond_dev, slave_dev, "interface not enslaved\n"); unblock_netpoll_tx(); return -EINVAL; } bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); bond_sysfs_slave_del(slave); /* recompute stats just before removing the slave */ bond_get_stats(bond->dev, &bond->bond_stats); if (bond->xdp_prog) { struct netdev_bpf xdp = { .command = XDP_SETUP_PROG, .flags = 0, .prog = NULL, .extack = NULL, }; if (dev_xdp_propagate(slave_dev, &xdp)) slave_warn(bond_dev, slave_dev, "failed to unload XDP program\n"); } /* unregister rx_handler early so bond_handle_frame wouldn't be called * for this slave anymore. */ netdev_rx_handler_unregister(slave_dev); if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_unbind_slave(slave); bond_upper_dev_unlink(bond, slave); if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, slave); slave_info(bond_dev, slave_dev, "Releasing %s interface\n", bond_is_active_slave(slave) ? "active" : "backup"); oldcurrent = rcu_access_pointer(bond->curr_active_slave); RCU_INIT_POINTER(bond->current_arp_slave, NULL); if (!all && (bond->params.fail_over_mac != BOND_FOM_ACTIVE || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) { if (ether_addr_equal_64bits(bond_dev->dev_addr, slave->perm_hwaddr) && bond_has_slaves(bond)) slave_warn(bond_dev, slave_dev, "the permanent HWaddr of slave - %pM - is still in use by bond - set the HWaddr of slave to a different address to avoid conflicts\n", slave->perm_hwaddr); } if (rtnl_dereference(bond->primary_slave) == slave) RCU_INIT_POINTER(bond->primary_slave, NULL); if (oldcurrent == slave) bond_change_active_slave(bond, NULL); /* Must be called after bond_change_active_slave () as the slave * might change from an active slave to a backup slave. Then it is * necessary to clear the maddrs on the backup slave. */ bond_slave_ns_maddrs_del(bond, slave); if (bond_is_lb(bond)) { /* Must be called only after the slave has been * detached from the list and the curr_active_slave * has been cleared (if our_slave == old_current), * but before a new active slave is selected. */ bond_alb_deinit_slave(bond, slave); } if (all) { RCU_INIT_POINTER(bond->curr_active_slave, NULL); } else if (oldcurrent == slave) { /* Note that we hold RTNL over this sequence, so there * is no concern that another slave add/remove event * will interfere. */ bond_select_active_slave(bond); } bond_set_carrier(bond); if (!bond_has_slaves(bond)) eth_hw_addr_random(bond_dev); unblock_netpoll_tx(); synchronize_rcu(); bond->slave_cnt--; if (!bond_has_slaves(bond)) { call_netdevice_notifiers(NETDEV_CHANGEADDR, bond->dev); call_netdevice_notifiers(NETDEV_RELEASE, bond->dev); } bond_compute_features(bond); if (!(bond_dev->features & NETIF_F_VLAN_CHALLENGED) && (old_features & NETIF_F_VLAN_CHALLENGED)) slave_info(bond_dev, slave_dev, "last VLAN challenged slave left bond - VLAN blocking is removed\n"); vlan_vids_del_by_dev(slave_dev, bond_dev); /* If the mode uses primary, then this case was handled above by * bond_change_active_slave(..., NULL) */ if (!bond_uses_primary(bond)) { /* unset promiscuity level from slave * NOTE: The NETDEV_CHANGEADDR call above may change the value * of the IFF_PROMISC flag in the bond_dev, but we need the * value of that flag before that change, as that was the value * when this slave was attached, so we cache at the start of the * function and use it here. Same goes for ALLMULTI below */ if (old_flags & IFF_PROMISC) dev_set_promiscuity(slave_dev, -1); /* unset allmulti level from slave */ if (old_flags & IFF_ALLMULTI) dev_set_allmulti(slave_dev, -1); if (old_flags & IFF_UP) bond_hw_addr_flush(bond_dev, slave_dev); } slave_disable_netpoll(slave); /* close slave before restoring its mac address */ dev_close(slave_dev); slave_dev->priv_flags &= ~IFF_NO_ADDRCONF; if (bond->params.fail_over_mac != BOND_FOM_ACTIVE || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* restore original ("permanent") mac address */ bond_hw_addr_copy(ss.__data, slave->perm_hwaddr, slave->dev->addr_len); ss.ss_family = slave_dev->type; dev_set_mac_address(slave_dev, &ss, NULL); } if (unregister) { netdev_lock_ops(slave_dev); __dev_set_mtu(slave_dev, slave->original_mtu); netdev_unlock_ops(slave_dev); } else { dev_set_mtu(slave_dev, slave->original_mtu); } if (!netif_is_bond_master(slave_dev)) slave_dev->priv_flags &= ~IFF_BONDING; bond_xdp_set_features(bond_dev); kobject_put(&slave->kobj); return 0; } /* A wrapper used because of ndo_del_link */ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev) { return __bond_release_one(bond_dev, slave_dev, false, false); } /* First release a slave and then destroy the bond if no more slaves are left. * Must be under rtnl_lock when this function is called. */ static int bond_release_and_destroy(struct net_device *bond_dev, struct net_device *slave_dev) { struct bonding *bond = netdev_priv(bond_dev); int ret; ret = __bond_release_one(bond_dev, slave_dev, false, true); if (ret == 0 && !bond_has_slaves(bond) && bond_dev->reg_state != NETREG_UNREGISTERING) { bond_dev->priv_flags |= IFF_DISABLE_NETPOLL; netdev_info(bond_dev, "Destroying bond\n"); bond_remove_proc_entry(bond); unregister_netdevice(bond_dev); } return ret; } static void bond_info_query(struct net_device *bond_dev, struct ifbond *info) { struct bonding *bond = netdev_priv(bond_dev); bond_fill_ifbond(bond, info); } static int bond_slave_info_query(struct net_device *bond_dev, struct ifslave *info) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; int i = 0, res = -ENODEV; struct slave *slave; bond_for_each_slave(bond, slave, iter) { if (i++ == (int)info->slave_id) { res = 0; bond_fill_ifslave(slave, info); break; } } return res; } /*-------------------------------- Monitoring -------------------------------*/ /* called with rcu_read_lock() */ static int bond_miimon_inspect(struct bonding *bond) { bool ignore_updelay = false; int link_state, commit = 0; struct list_head *iter; struct slave *slave; if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) { ignore_updelay = !rcu_dereference(bond->curr_active_slave); } else { struct bond_up_slave *usable_slaves; usable_slaves = rcu_dereference(bond->usable_slaves); if (usable_slaves && usable_slaves->count == 0) ignore_updelay = true; } bond_for_each_slave_rcu(bond, slave, iter) { bond_propose_link_state(slave, BOND_LINK_NOCHANGE); link_state = bond_check_dev_link(bond, slave->dev, 0); switch (slave->link) { case BOND_LINK_UP: if (link_state) continue; bond_propose_link_state(slave, BOND_LINK_FAIL); commit++; slave->delay = bond->params.downdelay; if (slave->delay && net_ratelimit()) { slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n", (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) ? (bond_is_active_slave(slave) ? "active " : "backup ") : "", bond->params.downdelay * bond->params.miimon); } fallthrough; case BOND_LINK_FAIL: if (link_state) { /* recovered before downdelay expired */ bond_propose_link_state(slave, BOND_LINK_UP); slave->last_link_up = jiffies; if (net_ratelimit()) slave_info(bond->dev, slave->dev, "link status up again after %d ms\n", (bond->params.downdelay - slave->delay) * bond->params.miimon); commit++; continue; } if (slave->delay <= 0) { bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; continue; } slave->delay--; break; case BOND_LINK_DOWN: if (!link_state) continue; bond_propose_link_state(slave, BOND_LINK_BACK); commit++; slave->delay = bond->params.updelay; if (slave->delay && net_ratelimit()) { slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n", ignore_updelay ? 0 : bond->params.updelay * bond->params.miimon); } fallthrough; case BOND_LINK_BACK: if (!link_state) { bond_propose_link_state(slave, BOND_LINK_DOWN); if (net_ratelimit()) slave_info(bond->dev, slave->dev, "link status down again after %d ms\n", (bond->params.updelay - slave->delay) * bond->params.miimon); commit++; continue; } if (ignore_updelay) slave->delay = 0; if (slave->delay <= 0) { bond_propose_link_state(slave, BOND_LINK_UP); commit++; ignore_updelay = false; continue; } slave->delay--; break; } } return commit; } static void bond_miimon_link_change(struct bonding *bond, struct slave *slave, char link) { switch (BOND_MODE(bond)) { case BOND_MODE_8023AD: bond_3ad_handle_link_change(slave, link); break; case BOND_MODE_TLB: case BOND_MODE_ALB: bond_alb_handle_link_change(bond, slave, link); break; case BOND_MODE_XOR: bond_update_slave_arr(bond, NULL); break; } } static void bond_miimon_commit(struct bonding *bond) { struct slave *slave, *primary, *active; bool do_failover = false; struct list_head *iter; ASSERT_RTNL(); bond_for_each_slave(bond, slave, iter) { switch (slave->link_new_state) { case BOND_LINK_NOCHANGE: /* For 802.3ad mode, check current slave speed and * duplex again in case its port was disabled after * invalid speed/duplex reporting but recovered before * link monitoring could make a decision on the actual * link status */ if (BOND_MODE(bond) == BOND_MODE_8023AD && slave->link == BOND_LINK_UP) bond_3ad_adapter_speed_duplex_changed(slave); continue; case BOND_LINK_UP: if (bond_update_speed_duplex(slave) && bond_needs_speed_duplex(bond)) { slave->link = BOND_LINK_DOWN; if (net_ratelimit()) slave_warn(bond->dev, slave->dev, "failed to get link speed/duplex\n"); continue; } bond_set_slave_link_state(slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); slave->last_link_up = jiffies; primary = rtnl_dereference(bond->primary_slave); if (BOND_MODE(bond) == BOND_MODE_8023AD) { /* prevent it from being the active one */ bond_set_backup_slave(slave); } else if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* make it immediately active */ bond_set_active_slave(slave); } slave_info(bond->dev, slave->dev, "link status definitely up, %u Mbps %s duplex\n", slave->speed == SPEED_UNKNOWN ? 0 : slave->speed, slave->duplex ? "full" : "half"); bond_miimon_link_change(bond, slave, BOND_LINK_UP); active = rtnl_dereference(bond->curr_active_slave); if (!active || slave == primary || slave->prio > active->prio) do_failover = true; continue; case BOND_LINK_DOWN: if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; bond_set_slave_link_state(slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_NOW); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP || BOND_MODE(bond) == BOND_MODE_8023AD) bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n"); bond_miimon_link_change(bond, slave, BOND_LINK_DOWN); if (slave == rcu_access_pointer(bond->curr_active_slave)) do_failover = true; continue; default: slave_err(bond->dev, slave->dev, "invalid new link %d on slave\n", slave->link_new_state); bond_propose_link_state(slave, BOND_LINK_NOCHANGE); continue; } } if (do_failover) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } bond_set_carrier(bond); } /* bond_mii_monitor * * Really a wrapper that splits the mii monitor into two phases: an * inspection, then (if inspection indicates something needs to be done) * an acquisition of appropriate locks followed by a commit phase to * implement whatever link state changes are indicated. */ static void bond_mii_monitor(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, mii_work.work); bool should_notify_peers = false; bool commit; unsigned long delay; struct slave *slave; struct list_head *iter; delay = msecs_to_jiffies(bond->params.miimon); if (!bond_has_slaves(bond)) goto re_arm; rcu_read_lock(); should_notify_peers = bond_should_notify_peers(bond); commit = !!bond_miimon_inspect(bond); if (bond->send_peer_notif) { rcu_read_unlock(); if (rtnl_trylock()) { bond->send_peer_notif--; rtnl_unlock(); } } else { rcu_read_unlock(); } if (commit) { /* Race avoidance with bond_close cancel of workqueue */ if (!rtnl_trylock()) { delay = 1; should_notify_peers = false; goto re_arm; } bond_for_each_slave(bond, slave, iter) { bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER); } bond_miimon_commit(bond); rtnl_unlock(); /* might sleep, hold no other locks */ } re_arm: if (bond->params.miimon) queue_delayed_work(bond->wq, &bond->mii_work, delay); if (should_notify_peers) { if (!rtnl_trylock()) return; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); rtnl_unlock(); } } static int bond_upper_dev_walk(struct net_device *upper, struct netdev_nested_priv *priv) { __be32 ip = *(__be32 *)priv->data; return ip == bond_confirm_addr(upper, 0, ip); } static bool bond_has_this_ip(struct bonding *bond, __be32 ip) { struct netdev_nested_priv priv = { .data = (void *)&ip, }; bool ret = false; if (ip == bond_confirm_addr(bond->dev, 0, ip)) return true; rcu_read_lock(); if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk, &priv)) ret = true; rcu_read_unlock(); return ret; } #define BOND_VLAN_PROTO_NONE cpu_to_be16(0xffff) static bool bond_handle_vlan(struct slave *slave, struct bond_vlan_tag *tags, struct sk_buff *skb) { struct net_device *bond_dev = slave->bond->dev; struct net_device *slave_dev = slave->dev; struct bond_vlan_tag *outer_tag = tags; if (!tags || tags->vlan_proto == BOND_VLAN_PROTO_NONE) return true; tags++; /* Go through all the tags backwards and add them to the packet */ while (tags->vlan_proto != BOND_VLAN_PROTO_NONE) { if (!tags->vlan_id) { tags++; continue; } slave_dbg(bond_dev, slave_dev, "inner tag: proto %X vid %X\n", ntohs(outer_tag->vlan_proto), tags->vlan_id); skb = vlan_insert_tag_set_proto(skb, tags->vlan_proto, tags->vlan_id); if (!skb) { net_err_ratelimited("failed to insert inner VLAN tag\n"); return false; } tags++; } /* Set the outer tag */ if (outer_tag->vlan_id) { slave_dbg(bond_dev, slave_dev, "outer tag: proto %X vid %X\n", ntohs(outer_tag->vlan_proto), outer_tag->vlan_id); __vlan_hwaccel_put_tag(skb, outer_tag->vlan_proto, outer_tag->vlan_id); } return true; } /* We go to the (large) trouble of VLAN tagging ARP frames because * switches in VLAN mode (especially if ports are configured as * "native" to a VLAN) might not pass non-tagged frames. */ static void bond_arp_send(struct slave *slave, int arp_op, __be32 dest_ip, __be32 src_ip, struct bond_vlan_tag *tags) { struct net_device *bond_dev = slave->bond->dev; struct net_device *slave_dev = slave->dev; struct sk_buff *skb; slave_dbg(bond_dev, slave_dev, "arp %d on slave: dst %pI4 src %pI4\n", arp_op, &dest_ip, &src_ip); skb = arp_create(arp_op, ETH_P_ARP, dest_ip, slave_dev, src_ip, NULL, slave_dev->dev_addr, NULL); if (!skb) { net_err_ratelimited("ARP packet allocation failed\n"); return; } if (bond_handle_vlan(slave, tags, skb)) { slave_update_last_tx(slave); arp_xmit(skb); } return; } /* Validate the device path between the @start_dev and the @end_dev. * The path is valid if the @end_dev is reachable through device * stacking. * When the path is validated, collect any vlan information in the * path. */ struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev, struct net_device *end_dev, int level) { struct bond_vlan_tag *tags; struct net_device *upper; struct list_head *iter; if (start_dev == end_dev) { tags = kcalloc(level + 1, sizeof(*tags), GFP_ATOMIC); if (!tags) return ERR_PTR(-ENOMEM); tags[level].vlan_proto = BOND_VLAN_PROTO_NONE; return tags; } netdev_for_each_upper_dev_rcu(start_dev, upper, iter) { tags = bond_verify_device_path(upper, end_dev, level + 1); if (IS_ERR_OR_NULL(tags)) { if (IS_ERR(tags)) return tags; continue; } if (is_vlan_dev(upper)) { tags[level].vlan_proto = vlan_dev_vlan_proto(upper); tags[level].vlan_id = vlan_dev_vlan_id(upper); } return tags; } return NULL; } static void bond_arp_send_all(struct bonding *bond, struct slave *slave) { struct rtable *rt; struct bond_vlan_tag *tags; __be32 *targets = bond->params.arp_targets, addr; int i; for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) { slave_dbg(bond->dev, slave->dev, "%s: target %pI4\n", __func__, &targets[i]); tags = NULL; /* Find out through which dev should the packet go */ rt = ip_route_output(dev_net(bond->dev), targets[i], 0, 0, 0, RT_SCOPE_LINK); if (IS_ERR(rt)) { /* there's no route to target - try to send arp * probe to generate any traffic (arp_validate=0) */ if (bond->params.arp_validate) pr_warn_once("%s: no route to arp_ip_target %pI4 and arp_validate is set\n", bond->dev->name, &targets[i]); bond_arp_send(slave, ARPOP_REQUEST, targets[i], 0, tags); continue; } /* bond device itself */ if (rt->dst.dev == bond->dev) goto found; rcu_read_lock(); tags = bond_verify_device_path(bond->dev, rt->dst.dev, 0); rcu_read_unlock(); if (!IS_ERR_OR_NULL(tags)) goto found; /* Not our device - skip */ slave_dbg(bond->dev, slave->dev, "no path to arp_ip_target %pI4 via rt.dev %s\n", &targets[i], rt->dst.dev ? rt->dst.dev->name : "NULL"); ip_rt_put(rt); continue; found: addr = bond_confirm_addr(rt->dst.dev, targets[i], 0); ip_rt_put(rt); bond_arp_send(slave, ARPOP_REQUEST, targets[i], addr, tags); kfree(tags); } } static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32 sip, __be32 tip) { int i; if (!sip || !bond_has_this_ip(bond, tip)) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 tip %pI4 not found\n", __func__, &sip, &tip); return; } i = bond_get_targets_ip(bond->params.arp_targets, sip); if (i == -1) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 not found in targets\n", __func__, &sip); return; } slave->last_rx = jiffies; slave->target_last_arp_rx[i] = jiffies; } static int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct arphdr *arp = (struct arphdr *)skb->data; struct slave *curr_active_slave, *curr_arp_slave; unsigned char *arp_ptr; __be32 sip, tip; unsigned int alen; alen = arp_hdr_len(bond->dev); if (alen > skb_headlen(skb)) { arp = kmalloc(alen, GFP_ATOMIC); if (!arp) goto out_unlock; if (skb_copy_bits(skb, 0, arp, alen) < 0) goto out_unlock; } if (arp->ar_hln != bond->dev->addr_len || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK || arp->ar_hrd != htons(ARPHRD_ETHER) || arp->ar_pro != htons(ETH_P_IP) || arp->ar_pln != 4) goto out_unlock; arp_ptr = (unsigned char *)(arp + 1); arp_ptr += bond->dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4 + bond->dev->addr_len; memcpy(&tip, arp_ptr, 4); slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI4 tip %pI4\n", __func__, slave->dev->name, bond_slave_state(slave), bond->params.arp_validate, slave_do_arp_validate(bond, slave), &sip, &tip); curr_active_slave = rcu_dereference(bond->curr_active_slave); curr_arp_slave = rcu_dereference(bond->current_arp_slave); /* We 'trust' the received ARP enough to validate it if: * * (a) the slave receiving the ARP is active (which includes the * current ARP slave, if any), or * * (b) the receiving slave isn't active, but there is a currently * active slave and it received valid arp reply(s) after it became * the currently active slave, or * * (c) there is an ARP slave that sent an ARP during the prior ARP * interval, and we receive an ARP reply on any slave. We accept * these because switch FDB update delays may deliver the ARP * reply to a slave other than the sender of the ARP request. * * Note: for (b), backup slaves are receiving the broadcast ARP * request, not a reply. This request passes from the sending * slave through the L2 switch(es) to the receiving slave. Since * this is checking the request, sip/tip are swapped for * validation. * * This is done to avoid endless looping when we can't reach the * arp_ip_target and fool ourselves with our own arp requests. */ if (bond_is_active_slave(slave)) bond_validate_arp(bond, slave, sip, tip); else if (curr_active_slave && time_after(slave_last_rx(bond, curr_active_slave), curr_active_slave->last_link_up)) bond_validate_arp(bond, slave, tip, sip); else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) && bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1)) bond_validate_arp(bond, slave, sip, tip); out_unlock: if (arp != (struct arphdr *)skb->data) kfree(arp); return RX_HANDLER_ANOTHER; } #if IS_ENABLED(CONFIG_IPV6) static void bond_ns_send(struct slave *slave, const struct in6_addr *daddr, const struct in6_addr *saddr, struct bond_vlan_tag *tags) { struct net_device *bond_dev = slave->bond->dev; struct net_device *slave_dev = slave->dev; struct in6_addr mcaddr; struct sk_buff *skb; slave_dbg(bond_dev, slave_dev, "NS on slave: dst %pI6c src %pI6c\n", daddr, saddr); skb = ndisc_ns_create(slave_dev, daddr, saddr, 0); if (!skb) { net_err_ratelimited("NS packet allocation failed\n"); return; } addrconf_addr_solict_mult(daddr, &mcaddr); if (bond_handle_vlan(slave, tags, skb)) { slave_update_last_tx(slave); ndisc_send_skb(skb, &mcaddr, saddr); } } static void bond_ns_send_all(struct bonding *bond, struct slave *slave) { struct in6_addr *targets = bond->params.ns_targets; struct bond_vlan_tag *tags; struct dst_entry *dst; struct in6_addr saddr; struct flowi6 fl6; int i; for (i = 0; i < BOND_MAX_NS_TARGETS && !ipv6_addr_any(&targets[i]); i++) { slave_dbg(bond->dev, slave->dev, "%s: target %pI6c\n", __func__, &targets[i]); tags = NULL; /* Find out through which dev should the packet go */ memset(&fl6, 0, sizeof(struct flowi6)); fl6.daddr = targets[i]; fl6.flowi6_oif = bond->dev->ifindex; dst = ip6_route_output(dev_net(bond->dev), NULL, &fl6); if (dst->error) { dst_release(dst); /* there's no route to target - try to send arp * probe to generate any traffic (arp_validate=0) */ if (bond->params.arp_validate) pr_warn_once("%s: no route to ns_ip6_target %pI6c and arp_validate is set\n", bond->dev->name, &targets[i]); bond_ns_send(slave, &targets[i], &in6addr_any, tags); continue; } /* bond device itself */ if (dst->dev == bond->dev) goto found; rcu_read_lock(); tags = bond_verify_device_path(bond->dev, dst->dev, 0); rcu_read_unlock(); if (!IS_ERR_OR_NULL(tags)) goto found; /* Not our device - skip */ slave_dbg(bond->dev, slave->dev, "no path to ns_ip6_target %pI6c via dst->dev %s\n", &targets[i], dst->dev ? dst->dev->name : "NULL"); dst_release(dst); continue; found: if (!ipv6_dev_get_saddr(dev_net(dst->dev), dst->dev, &targets[i], 0, &saddr)) bond_ns_send(slave, &targets[i], &saddr, tags); else bond_ns_send(slave, &targets[i], &in6addr_any, tags); dst_release(dst); kfree(tags); } } static int bond_confirm_addr6(struct net_device *dev, struct netdev_nested_priv *priv) { struct in6_addr *addr = (struct in6_addr *)priv->data; return ipv6_chk_addr(dev_net(dev), addr, dev, 0); } static bool bond_has_this_ip6(struct bonding *bond, struct in6_addr *addr) { struct netdev_nested_priv priv = { .data = addr, }; int ret = false; if (bond_confirm_addr6(bond->dev, &priv)) return true; rcu_read_lock(); if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_confirm_addr6, &priv)) ret = true; rcu_read_unlock(); return ret; } static void bond_validate_na(struct bonding *bond, struct slave *slave, struct in6_addr *saddr, struct in6_addr *daddr) { int i; /* Ignore NAs that: * 1. Source address is unspecified address. * 2. Dest address is neither all-nodes multicast address nor * exist on bond interface. */ if (ipv6_addr_any(saddr) || (!ipv6_addr_equal(daddr, &in6addr_linklocal_allnodes) && !bond_has_this_ip6(bond, daddr))) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI6c tip %pI6c not found\n", __func__, saddr, daddr); return; } i = bond_get_targets_ip6(bond->params.ns_targets, saddr); if (i == -1) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI6c not found in targets\n", __func__, saddr); return; } slave->last_rx = jiffies; slave->target_last_arp_rx[i] = jiffies; } static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct slave *curr_active_slave, *curr_arp_slave; struct in6_addr *saddr, *daddr; struct { struct ipv6hdr ip6; struct icmp6hdr icmp6; } *combined, _combined; if (skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK) goto out; combined = skb_header_pointer(skb, 0, sizeof(_combined), &_combined); if (!combined || combined->ip6.nexthdr != NEXTHDR_ICMP || (combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION && combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT)) goto out; saddr = &combined->ip6.saddr; daddr = &combined->ip6.daddr; slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI6c tip %pI6c\n", __func__, slave->dev->name, bond_slave_state(slave), bond->params.arp_validate, slave_do_arp_validate(bond, slave), saddr, daddr); curr_active_slave = rcu_dereference(bond->curr_active_slave); curr_arp_slave = rcu_dereference(bond->current_arp_slave); /* We 'trust' the received ARP enough to validate it if: * see bond_arp_rcv(). */ if (bond_is_active_slave(slave)) bond_validate_na(bond, slave, saddr, daddr); else if (curr_active_slave && time_after(slave_last_rx(bond, curr_active_slave), curr_active_slave->last_link_up)) bond_validate_na(bond, slave, daddr, saddr); else if (curr_arp_slave && bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1)) bond_validate_na(bond, slave, saddr, daddr); out: return RX_HANDLER_ANOTHER; } #endif int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { #if IS_ENABLED(CONFIG_IPV6) bool is_ipv6 = skb->protocol == __cpu_to_be16(ETH_P_IPV6); #endif bool is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP); slave_dbg(bond->dev, slave->dev, "%s: skb->dev %s\n", __func__, skb->dev->name); /* Use arp validate logic for both ARP and NS */ if (!slave_do_arp_validate(bond, slave)) { if ((slave_do_arp_validate_only(bond) && is_arp) || #if IS_ENABLED(CONFIG_IPV6) (slave_do_arp_validate_only(bond) && is_ipv6) || #endif !slave_do_arp_validate_only(bond)) slave->last_rx = jiffies; return RX_HANDLER_ANOTHER; } else if (is_arp) { return bond_arp_rcv(skb, bond, slave); #if IS_ENABLED(CONFIG_IPV6) } else if (is_ipv6) { return bond_na_rcv(skb, bond, slave); #endif } else { return RX_HANDLER_ANOTHER; } } static void bond_send_validate(struct bonding *bond, struct slave *slave) { bond_arp_send_all(bond, slave); #if IS_ENABLED(CONFIG_IPV6) bond_ns_send_all(bond, slave); #endif } /* function to verify if we're in the arp_interval timeslice, returns true if * (last_act - arp_interval) <= jiffies <= (last_act + mod * arp_interval + * arp_interval/2) . the arp_interval/2 is needed for really fast networks. */ static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act, int mod) { int delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval); return time_in_range(jiffies, last_act - delta_in_ticks, last_act + mod * delta_in_ticks + delta_in_ticks/2); } /* This function is called regularly to monitor each slave's link * ensuring that traffic is being sent and received when arp monitoring * is used in load-balancing mode. if the adapter has been dormant, then an * arp is transmitted to generate traffic. see activebackup_arp_monitor for * arp monitoring in active backup mode. */ static void bond_loadbalance_arp_mon(struct bonding *bond) { struct slave *slave, *oldcurrent; struct list_head *iter; int do_failover = 0, slave_state_changed = 0; if (!bond_has_slaves(bond)) goto re_arm; rcu_read_lock(); oldcurrent = rcu_dereference(bond->curr_active_slave); /* see if any of the previous devices are up now (i.e. they have * xmt and rcv traffic). the curr_active_slave does not come into * the picture unless it is null. also, slave->last_link_up is not * needed here because we send an arp on each slave and give a slave * as long as it needs to get the tx/rx within the delta. * TODO: what about up/down delay in arp mode? it wasn't here before * so it can wait */ bond_for_each_slave_rcu(bond, slave, iter) { unsigned long last_tx = slave_last_tx(slave); bond_propose_link_state(slave, BOND_LINK_NOCHANGE); if (slave->link != BOND_LINK_UP) { if (bond_time_in_interval(bond, last_tx, 1) && bond_time_in_interval(bond, slave->last_rx, 1)) { bond_propose_link_state(slave, BOND_LINK_UP); slave_state_changed = 1; /* primary_slave has no meaning in round-robin * mode. the window of a slave being up and * curr_active_slave being null after enslaving * is closed. */ if (!oldcurrent) { slave_info(bond->dev, slave->dev, "link status definitely up\n"); do_failover = 1; } else { slave_info(bond->dev, slave->dev, "interface is now up\n"); } } } else { /* slave->link == BOND_LINK_UP */ /* not all switches will respond to an arp request * when the source ip is 0, so don't take the link down * if we don't know our ip yet */ if (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) || !bond_time_in_interval(bond, slave->last_rx, bond->params.missed_max)) { bond_propose_link_state(slave, BOND_LINK_DOWN); slave_state_changed = 1; if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; slave_info(bond->dev, slave->dev, "interface is now down\n"); if (slave == oldcurrent) do_failover = 1; } } /* note: if switch is in round-robin mode, all links * must tx arp to ensure all links rx an arp - otherwise * links may oscillate or not come up at all; if switch is * in something like xor mode, there is nothing we can * do - all replies will be rx'ed on same link causing slaves * to be unstable during low/no traffic periods */ if (bond_slave_is_up(slave)) bond_send_validate(bond, slave); } rcu_read_unlock(); if (do_failover || slave_state_changed) { if (!rtnl_trylock()) goto re_arm; bond_for_each_slave(bond, slave, iter) { if (slave->link_new_state != BOND_LINK_NOCHANGE) slave->link = slave->link_new_state; } if (slave_state_changed) { bond_slave_state_change(bond); if (BOND_MODE(bond) == BOND_MODE_XOR) bond_update_slave_arr(bond, NULL); } if (do_failover) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } rtnl_unlock(); } re_arm: if (bond->params.arp_interval) queue_delayed_work(bond->wq, &bond->arp_work, msecs_to_jiffies(bond->params.arp_interval)); } /* Called to inspect slaves for active-backup mode ARP monitor link state * changes. Sets proposed link state in slaves to specify what action * should take place for the slave. Returns 0 if no changes are found, >0 * if changes to link states must be committed. * * Called with rcu_read_lock held. */ static int bond_ab_arp_inspect(struct bonding *bond) { unsigned long last_tx, last_rx; struct list_head *iter; struct slave *slave; int commit = 0; bond_for_each_slave_rcu(bond, slave, iter) { bond_propose_link_state(slave, BOND_LINK_NOCHANGE); last_rx = slave_last_rx(bond, slave); if (slave->link != BOND_LINK_UP) { if (bond_time_in_interval(bond, last_rx, 1)) { bond_propose_link_state(slave, BOND_LINK_UP); commit++; } else if (slave->link == BOND_LINK_BACK) { bond_propose_link_state(slave, BOND_LINK_FAIL); commit++; } continue; } /* Give slaves 2*delta after being enslaved or made * active. This avoids bouncing, as the last receive * times need a full ARP monitor cycle to be updated. */ if (bond_time_in_interval(bond, slave->last_link_up, 2)) continue; /* Backup slave is down if: * - No current_arp_slave AND * - more than (missed_max+1)*delta since last receive AND * - the bond has an IP address * * Note: a non-null current_arp_slave indicates * the curr_active_slave went down and we are * searching for a new one; under this condition * we only take the curr_active_slave down - this * gives each slave a chance to tx/rx traffic * before being taken out */ if (!bond_is_active_slave(slave) && !rcu_access_pointer(bond->current_arp_slave) && !bond_time_in_interval(bond, last_rx, bond->params.missed_max + 1)) { bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; } /* Active slave is down if: * - more than missed_max*delta since transmitting OR * - (more than missed_max*delta since receive AND * the bond has an IP address) */ last_tx = slave_last_tx(slave); if (bond_is_active_slave(slave) && (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) || !bond_time_in_interval(bond, last_rx, bond->params.missed_max))) { bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; } } return commit; } /* Called to commit link state changes noted by inspection step of * active-backup mode ARP monitor. * * Called with RTNL hold. */ static void bond_ab_arp_commit(struct bonding *bond) { bool do_failover = false; struct list_head *iter; unsigned long last_tx; struct slave *slave; bond_for_each_slave(bond, slave, iter) { switch (slave->link_new_state) { case BOND_LINK_NOCHANGE: continue; case BOND_LINK_UP: last_tx = slave_last_tx(slave); if (rtnl_dereference(bond->curr_active_slave) != slave || (!rtnl_dereference(bond->curr_active_slave) && bond_time_in_interval(bond, last_tx, 1))) { struct slave *current_arp_slave; current_arp_slave = rtnl_dereference(bond->current_arp_slave); bond_set_slave_link_state(slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); if (current_arp_slave) { bond_set_slave_inactive_flags( current_arp_slave, BOND_SLAVE_NOTIFY_NOW); RCU_INIT_POINTER(bond->current_arp_slave, NULL); } slave_info(bond->dev, slave->dev, "link status definitely up\n"); if (!rtnl_dereference(bond->curr_active_slave) || slave == rtnl_dereference(bond->primary_slave) || slave->prio > rtnl_dereference(bond->curr_active_slave)->prio) do_failover = true; } continue; case BOND_LINK_DOWN: if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; bond_set_slave_link_state(slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_NOW); bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n"); if (slave == rtnl_dereference(bond->curr_active_slave)) { RCU_INIT_POINTER(bond->current_arp_slave, NULL); do_failover = true; } continue; case BOND_LINK_FAIL: bond_set_slave_link_state(slave, BOND_LINK_FAIL, BOND_SLAVE_NOTIFY_NOW); bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); /* A slave has just been enslaved and has become * the current active slave. */ if (rtnl_dereference(bond->curr_active_slave)) RCU_INIT_POINTER(bond->current_arp_slave, NULL); continue; default: slave_err(bond->dev, slave->dev, "impossible: link_new_state %d on slave\n", slave->link_new_state); continue; } } if (do_failover) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } bond_set_carrier(bond); } /* Send ARP probes for active-backup mode ARP monitor. * * Called with rcu_read_lock held. */ static bool bond_ab_arp_probe(struct bonding *bond) { struct slave *slave, *before = NULL, *new_slave = NULL, *curr_arp_slave = rcu_dereference(bond->current_arp_slave), *curr_active_slave = rcu_dereference(bond->curr_active_slave); struct list_head *iter; bool found = false; bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER; if (curr_arp_slave && curr_active_slave) netdev_info(bond->dev, "PROBE: c_arp %s && cas %s BAD\n", curr_arp_slave->dev->name, curr_active_slave->dev->name); if (curr_active_slave) { bond_send_validate(bond, curr_active_slave); return should_notify_rtnl; } /* if we don't have a curr_active_slave, search for the next available * backup slave from the current_arp_slave and make it the candidate * for becoming the curr_active_slave */ if (!curr_arp_slave) { curr_arp_slave = bond_first_slave_rcu(bond); if (!curr_arp_slave) return should_notify_rtnl; } bond_for_each_slave_rcu(bond, slave, iter) { if (!found && !before && bond_slave_is_up(slave)) before = slave; if (found && !new_slave && bond_slave_is_up(slave)) new_slave = slave; /* if the link state is up at this point, we * mark it down - this can happen if we have * simultaneous link failures and * reselect_active_interface doesn't make this * one the current slave so it is still marked * up when it is actually down */ if (!bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) { bond_set_slave_link_state(slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_LATER); if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_LATER); slave_info(bond->dev, slave->dev, "backup interface is now down\n"); } if (slave == curr_arp_slave) found = true; } if (!new_slave && before) new_slave = before; if (!new_slave) goto check_state; bond_set_slave_link_state(new_slave, BOND_LINK_BACK, BOND_SLAVE_NOTIFY_LATER); bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_LATER); bond_send_validate(bond, new_slave); new_slave->last_link_up = jiffies; rcu_assign_pointer(bond->current_arp_slave, new_slave); check_state: bond_for_each_slave_rcu(bond, slave, iter) { if (slave->should_notify || slave->should_notify_link) { should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW; break; } } return should_notify_rtnl; } static void bond_activebackup_arp_mon(struct bonding *bond) { bool should_notify_peers = false; bool should_notify_rtnl = false; int delta_in_ticks; delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval); if (!bond_has_slaves(bond)) goto re_arm; rcu_read_lock(); should_notify_peers = bond_should_notify_peers(bond); if (bond_ab_arp_inspect(bond)) { rcu_read_unlock(); /* Race avoidance with bond_close flush of workqueue */ if (!rtnl_trylock()) { delta_in_ticks = 1; should_notify_peers = false; goto re_arm; } bond_ab_arp_commit(bond); rtnl_unlock(); rcu_read_lock(); } should_notify_rtnl = bond_ab_arp_probe(bond); rcu_read_unlock(); re_arm: if (bond->params.arp_interval) queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks); if (should_notify_peers || should_notify_rtnl) { if (!rtnl_trylock()) return; if (should_notify_peers) { bond->send_peer_notif--; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); } if (should_notify_rtnl) { bond_slave_state_notify(bond); bond_slave_link_notify(bond); } rtnl_unlock(); } } static void bond_arp_monitor(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, arp_work.work); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) bond_activebackup_arp_mon(bond); else bond_loadbalance_arp_mon(bond); } /*-------------------------- netdev event handling --------------------------*/ /* Change device name */ static int bond_event_changename(struct bonding *bond) { bond_remove_proc_entry(bond); bond_create_proc_entry(bond); bond_debug_reregister(bond); return NOTIFY_DONE; } static int bond_master_netdev_event(unsigned long event, struct net_device *bond_dev) { struct bonding *event_bond = netdev_priv(bond_dev); netdev_dbg(bond_dev, "%s called\n", __func__); switch (event) { case NETDEV_CHANGENAME: return bond_event_changename(event_bond); case NETDEV_UNREGISTER: bond_remove_proc_entry(event_bond); #ifdef CONFIG_XFRM_OFFLOAD xfrm_dev_state_flush(dev_net(bond_dev), bond_dev, true); #endif /* CONFIG_XFRM_OFFLOAD */ break; case NETDEV_REGISTER: bond_create_proc_entry(event_bond); break; default: break; } return NOTIFY_DONE; } static int bond_slave_netdev_event(unsigned long event, struct net_device *slave_dev) { struct slave *slave = bond_slave_get_rtnl(slave_dev), *primary; struct bonding *bond; struct net_device *bond_dev; /* A netdev event can be generated while enslaving a device * before netdev_rx_handler_register is called in which case * slave will be NULL */ if (!slave) { netdev_dbg(slave_dev, "%s called on NULL slave\n", __func__); return NOTIFY_DONE; } bond_dev = slave->bond->dev; bond = slave->bond; primary = rtnl_dereference(bond->primary_slave); slave_dbg(bond_dev, slave_dev, "%s called\n", __func__); switch (event) { case NETDEV_UNREGISTER: if (bond_dev->type != ARPHRD_ETHER) bond_release_and_destroy(bond_dev, slave_dev); else __bond_release_one(bond_dev, slave_dev, false, true); break; case NETDEV_UP: case NETDEV_CHANGE: /* For 802.3ad mode only: * Getting invalid Speed/Duplex values here will put slave * in weird state. Mark it as link-fail if the link was * previously up or link-down if it hasn't yet come up, and * let link-monitoring (miimon) set it right when correct * speeds/duplex are available. */ if (bond_update_speed_duplex(slave) && BOND_MODE(bond) == BOND_MODE_8023AD) { if (slave->last_link_up) slave->link = BOND_LINK_FAIL; else slave->link = BOND_LINK_DOWN; } if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_adapter_speed_duplex_changed(slave); fallthrough; case NETDEV_DOWN: /* Refresh slave-array if applicable! * If the setup does not use miimon or arpmon (mode-specific!), * then these events will not cause the slave-array to be * refreshed. This will cause xmit to use a slave that is not * usable. Avoid such situation by refeshing the array at these * events. If these (miimon/arpmon) parameters are configured * then array gets refreshed twice and that should be fine! */ if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); break; case NETDEV_CHANGEMTU: /* TODO: Should slaves be allowed to * independently alter their MTU? For * an active-backup bond, slaves need * not be the same type of device, so * MTUs may vary. For other modes, * slaves arguably should have the * same MTUs. To do this, we'd need to * take over the slave's change_mtu * function for the duration of their * servitude. */ break; case NETDEV_CHANGENAME: /* we don't care if we don't have primary set */ if (!bond_uses_primary(bond) || !bond->params.primary[0]) break; if (slave == primary) { /* slave's name changed - he's no longer primary */ RCU_INIT_POINTER(bond->primary_slave, NULL); } else if (!strcmp(slave_dev->name, bond->params.primary)) { /* we have a new primary slave */ rcu_assign_pointer(bond->primary_slave, slave); } else { /* we didn't change primary - exit */ break; } netdev_info(bond->dev, "Primary slave changed to %s, reselecting active slave\n", primary ? slave_dev->name : "none"); block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); break; case NETDEV_FEAT_CHANGE: if (!bond->notifier_ctx) { bond->notifier_ctx = true; bond_compute_features(bond); bond->notifier_ctx = false; } break; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, slave->bond->dev); break; case NETDEV_XDP_FEAT_CHANGE: bond_xdp_set_features(bond_dev); break; default: break; } return NOTIFY_DONE; } /* bond_netdev_event: handle netdev notifier chain events. * * This function receives events for the netdev chain. The caller (an * ioctl handler calling blocking_notifier_call_chain) holds the necessary * locks for us to safely manipulate the slave devices (RTNL lock, * dev_probe_lock). */ static int bond_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); netdev_dbg(event_dev, "%s received %s\n", __func__, netdev_cmd_to_name(event)); if (!(event_dev->priv_flags & IFF_BONDING)) return NOTIFY_DONE; if (event_dev->flags & IFF_MASTER) { int ret; ret = bond_master_netdev_event(event, event_dev); if (ret != NOTIFY_DONE) return ret; } if (event_dev->flags & IFF_SLAVE) return bond_slave_netdev_event(event, event_dev); return NOTIFY_DONE; } static struct notifier_block bond_netdev_notifier = { .notifier_call = bond_netdev_event, }; /*---------------------------- Hashing Policies -----------------------------*/ /* Helper to access data in a packet, with or without a backing skb. * If skb is given the data is linearized if necessary via pskb_may_pull. */ static inline const void *bond_pull_data(struct sk_buff *skb, const void *data, int hlen, int n) { if (likely(n <= hlen)) return data; else if (skb && likely(pskb_may_pull(skb, n))) return skb->data; return NULL; } /* L2 hash helper */ static inline u32 bond_eth_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen) { struct ethhdr *ep; data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr)); if (!data) return 0; ep = (struct ethhdr *)(data + mhoff); return ep->h_dest[5] ^ ep->h_source[5] ^ be16_to_cpu(ep->h_proto); } static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk, const void *data, int hlen, __be16 l2_proto, int *nhoff, int *ip_proto, bool l34) { const struct ipv6hdr *iph6; const struct iphdr *iph; if (l2_proto == htons(ETH_P_IP)) { data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph)); if (!data) return false; iph = (const struct iphdr *)(data + *nhoff); iph_to_flow_copy_v4addrs(fk, iph); *nhoff += iph->ihl << 2; if (!ip_is_fragment(iph)) *ip_proto = iph->protocol; } else if (l2_proto == htons(ETH_P_IPV6)) { data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph6)); if (!data) return false; iph6 = (const struct ipv6hdr *)(data + *nhoff); iph_to_flow_copy_v6addrs(fk, iph6); *nhoff += sizeof(*iph6); *ip_proto = iph6->nexthdr; } else { return false; } if (l34 && *ip_proto >= 0) fk->ports.ports = skb_flow_get_ports(skb, *nhoff, *ip_proto, data, hlen); return true; } static u32 bond_vlan_srcmac_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen) { u32 srcmac_vendor = 0, srcmac_dev = 0; struct ethhdr *mac_hdr; u16 vlan = 0; int i; data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr)); if (!data) return 0; mac_hdr = (struct ethhdr *)(data + mhoff); for (i = 0; i < 3; i++) srcmac_vendor = (srcmac_vendor << 8) | mac_hdr->h_source[i]; for (i = 3; i < ETH_ALEN; i++) srcmac_dev = (srcmac_dev << 8) | mac_hdr->h_source[i]; if (skb && skb_vlan_tag_present(skb)) vlan = skb_vlan_tag_get(skb); return vlan ^ srcmac_vendor ^ srcmac_dev; } /* Extract the appropriate headers based on bond's xmit policy */ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, const void *data, __be16 l2_proto, int nhoff, int hlen, struct flow_keys *fk) { bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34; int ip_proto = -1; switch (bond->params.xmit_policy) { case BOND_XMIT_POLICY_ENCAP23: case BOND_XMIT_POLICY_ENCAP34: memset(fk, 0, sizeof(*fk)); return __skb_flow_dissect(NULL, skb, &flow_keys_bonding, fk, data, l2_proto, nhoff, hlen, 0); default: break; } fk->ports.ports = 0; memset(&fk->icmp, 0, sizeof(fk->icmp)); if (!bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34)) return false; /* ICMP error packets contains at least 8 bytes of the header * of the packet which generated the error. Use this information * to correlate ICMP error packets within the same flow which * generated the error. */ if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) { skb_flow_get_icmp_tci(skb, &fk->icmp, data, nhoff, hlen); if (ip_proto == IPPROTO_ICMP) { if (!icmp_is_err(fk->icmp.type)) return true; nhoff += sizeof(struct icmphdr); } else if (ip_proto == IPPROTO_ICMPV6) { if (!icmpv6_is_err(fk->icmp.type)) return true; nhoff += sizeof(struct icmp6hdr); } return bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34); } return true; } static u32 bond_ip_hash(u32 hash, struct flow_keys *flow, int xmit_policy) { hash ^= (__force u32)flow_get_u32_dst(flow) ^ (__force u32)flow_get_u32_src(flow); hash ^= (hash >> 16); hash ^= (hash >> 8); /* discard lowest hash bit to deal with the common even ports pattern */ if (xmit_policy == BOND_XMIT_POLICY_LAYER34 || xmit_policy == BOND_XMIT_POLICY_ENCAP34) return hash >> 1; return hash; } /* Generate hash based on xmit policy. If @skb is given it is used to linearize * the data as required, but this function can be used without it if the data is * known to be linear (e.g. with xdp_buff). */ static u32 __bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, const void *data, __be16 l2_proto, int mhoff, int nhoff, int hlen) { struct flow_keys flow; u32 hash; if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC) return bond_vlan_srcmac_hash(skb, data, mhoff, hlen); if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 || !bond_flow_dissect(bond, skb, data, l2_proto, nhoff, hlen, &flow)) return bond_eth_hash(skb, data, mhoff, hlen); if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 || bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) { hash = bond_eth_hash(skb, data, mhoff, hlen); } else { if (flow.icmp.id) memcpy(&hash, &flow.icmp, sizeof(hash)); else memcpy(&hash, &flow.ports.ports, sizeof(hash)); } return bond_ip_hash(hash, &flow, bond->params.xmit_policy); } /** * bond_xmit_hash - generate a hash value based on the xmit policy * @bond: bonding device * @skb: buffer to use for headers * * This function will extract the necessary headers from the skb buffer and use * them to generate a hash based on the xmit_policy set in the bonding device */ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) { if (bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP34 && skb->l4_hash) return skb->hash; return __bond_xmit_hash(bond, skb, skb->data, skb->protocol, 0, skb_network_offset(skb), skb_headlen(skb)); } /** * bond_xmit_hash_xdp - generate a hash value based on the xmit policy * @bond: bonding device * @xdp: buffer to use for headers * * The XDP variant of bond_xmit_hash. */ static u32 bond_xmit_hash_xdp(struct bonding *bond, struct xdp_buff *xdp) { struct ethhdr *eth; if (xdp->data + sizeof(struct ethhdr) > xdp->data_end) return 0; eth = (struct ethhdr *)xdp->data; return __bond_xmit_hash(bond, NULL, xdp->data, eth->h_proto, 0, sizeof(struct ethhdr), xdp->data_end - xdp->data); } /*-------------------------- Device entry points ----------------------------*/ void bond_work_init_all(struct bonding *bond) { INIT_DELAYED_WORK(&bond->mcast_work, bond_resend_igmp_join_requests_delayed); INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor); INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor); INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor); INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler); INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler); } static void bond_work_cancel_all(struct bonding *bond) { cancel_delayed_work_sync(&bond->mii_work); cancel_delayed_work_sync(&bond->arp_work); cancel_delayed_work_sync(&bond->alb_work); cancel_delayed_work_sync(&bond->ad_work); cancel_delayed_work_sync(&bond->mcast_work); cancel_delayed_work_sync(&bond->slave_arr_work); } static int bond_open(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; if (BOND_MODE(bond) == BOND_MODE_ROUNDROBIN && !bond->rr_tx_counter) { bond->rr_tx_counter = alloc_percpu(u32); if (!bond->rr_tx_counter) return -ENOMEM; } /* reset slave->backup and slave->inactive */ if (bond_has_slaves(bond)) { bond_for_each_slave(bond, slave, iter) { if (bond_uses_primary(bond) && slave != rcu_access_pointer(bond->curr_active_slave)) { bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); } else if (BOND_MODE(bond) != BOND_MODE_8023AD) { bond_set_slave_active_flags(slave, BOND_SLAVE_NOTIFY_NOW); } } } if (bond_is_lb(bond)) { /* bond_alb_initialize must be called before the timer * is started. */ if (bond_alb_initialize(bond, (BOND_MODE(bond) == BOND_MODE_ALB))) return -ENOMEM; if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == BOND_MODE_ALB) queue_delayed_work(bond->wq, &bond->alb_work, 0); } if (bond->params.miimon) /* link check interval, in milliseconds. */ queue_delayed_work(bond->wq, &bond->mii_work, 0); if (bond->params.arp_interval) { /* arp interval, in milliseconds. */ queue_delayed_work(bond->wq, &bond->arp_work, 0); bond->recv_probe = bond_rcv_validate; } if (BOND_MODE(bond) == BOND_MODE_8023AD) { queue_delayed_work(bond->wq, &bond->ad_work, 0); /* register to receive LACPDUs */ bond->recv_probe = bond_3ad_lacpdu_recv; bond_3ad_initiate_agg_selection(bond, 1); bond_for_each_slave(bond, slave, iter) dev_mc_add(slave->dev, lacpdu_mcast_addr); } if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); return 0; } static int bond_close(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; bond_work_cancel_all(bond); bond->send_peer_notif = 0; if (bond_is_lb(bond)) bond_alb_deinitialize(bond); bond->recv_probe = NULL; if (bond_uses_primary(bond)) { rcu_read_lock(); slave = rcu_dereference(bond->curr_active_slave); if (slave) bond_hw_addr_flush(bond_dev, slave->dev); rcu_read_unlock(); } else { struct list_head *iter; bond_for_each_slave(bond, slave, iter) bond_hw_addr_flush(bond_dev, slave->dev); } return 0; } /* fold stats, assuming all rtnl_link_stats64 fields are u64, but * that some drivers can provide 32bit values only. */ static void bond_fold_stats(struct rtnl_link_stats64 *_res, const struct rtnl_link_stats64 *_new, const struct rtnl_link_stats64 *_old) { const u64 *new = (const u64 *)_new; const u64 *old = (const u64 *)_old; u64 *res = (u64 *)_res; int i; for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) { u64 nv = new[i]; u64 ov = old[i]; s64 delta = nv - ov; /* detects if this particular field is 32bit only */ if (((nv | ov) >> 32) == 0) delta = (s64)(s32)((u32)nv - (u32)ov); /* filter anomalies, some drivers reset their stats * at down/up events. */ if (delta > 0) res[i] += delta; } } #ifdef CONFIG_LOCKDEP static int bond_get_lowest_level_rcu(struct net_device *dev) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int cur = 0, max = 0; now = dev; iter = &dev->adj_list.lower; while (1) { next = NULL; while (1) { ldev = netdev_next_lower_dev_rcu(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; if (max <= cur) max = cur; break; } if (!next) { if (!cur) return max; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return max; } #endif static void bond_get_stats(struct net_device *bond_dev, struct rtnl_link_stats64 *stats) { struct bonding *bond = netdev_priv(bond_dev); struct rtnl_link_stats64 temp; struct list_head *iter; struct slave *slave; int nest_level = 0; rcu_read_lock(); #ifdef CONFIG_LOCKDEP nest_level = bond_get_lowest_level_rcu(bond_dev); #endif spin_lock_nested(&bond->stats_lock, nest_level); memcpy(stats, &bond->bond_stats, sizeof(*stats)); bond_for_each_slave_rcu(bond, slave, iter) { const struct rtnl_link_stats64 *new = dev_get_stats(slave->dev, &temp); bond_fold_stats(stats, new, &slave->slave_stats); /* save off the slave stats for the next run */ memcpy(&slave->slave_stats, new, sizeof(*new)); } memcpy(&bond->bond_stats, stats, sizeof(*stats)); spin_unlock(&bond->stats_lock); rcu_read_unlock(); } static int bond_eth_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd) { struct bonding *bond = netdev_priv(bond_dev); struct mii_ioctl_data *mii = NULL; netdev_dbg(bond_dev, "bond_eth_ioctl: cmd=%d\n", cmd); switch (cmd) { case SIOCGMIIPHY: mii = if_mii(ifr); if (!mii) return -EINVAL; mii->phy_id = 0; fallthrough; case SIOCGMIIREG: /* We do this again just in case we were called by SIOCGMIIREG * instead of SIOCGMIIPHY. */ mii = if_mii(ifr); if (!mii) return -EINVAL; if (mii->reg_num == 1) { mii->val_out = 0; if (netif_carrier_ok(bond->dev)) mii->val_out = BMSR_LSTATUS; } break; default: return -EOPNOTSUPP; } return 0; } static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd) { struct bonding *bond = netdev_priv(bond_dev); struct net_device *slave_dev = NULL; struct ifbond k_binfo; struct ifbond __user *u_binfo = NULL; struct ifslave k_sinfo; struct ifslave __user *u_sinfo = NULL; struct bond_opt_value newval; struct net *net; int res = 0; netdev_dbg(bond_dev, "bond_ioctl: cmd=%d\n", cmd); switch (cmd) { case SIOCBONDINFOQUERY: u_binfo = (struct ifbond __user *)ifr->ifr_data; if (copy_from_user(&k_binfo, u_binfo, sizeof(ifbond))) return -EFAULT; bond_info_query(bond_dev, &k_binfo); if (copy_to_user(u_binfo, &k_binfo, sizeof(ifbond))) return -EFAULT; return 0; case SIOCBONDSLAVEINFOQUERY: u_sinfo = (struct ifslave __user *)ifr->ifr_data; if (copy_from_user(&k_sinfo, u_sinfo, sizeof(ifslave))) return -EFAULT; res = bond_slave_info_query(bond_dev, &k_sinfo); if (res == 0 && copy_to_user(u_sinfo, &k_sinfo, sizeof(ifslave))) return -EFAULT; return res; default: break; } net = dev_net(bond_dev); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; slave_dev = __dev_get_by_name(net, ifr->ifr_slave); slave_dbg(bond_dev, slave_dev, "slave_dev=%p:\n", slave_dev); if (!slave_dev) return -ENODEV; switch (cmd) { case SIOCBONDENSLAVE: res = bond_enslave(bond_dev, slave_dev, NULL); break; case SIOCBONDRELEASE: res = bond_release(bond_dev, slave_dev); break; case SIOCBONDSETHWADDR: res = bond_set_dev_addr(bond_dev, slave_dev); break; case SIOCBONDCHANGEACTIVE: bond_opt_initstr(&newval, slave_dev->name); res = __bond_opt_set_notify(bond, BOND_OPT_ACTIVE_SLAVE, &newval); break; default: res = -EOPNOTSUPP; } return res; } static int bond_siocdevprivate(struct net_device *bond_dev, struct ifreq *ifr, void __user *data, int cmd) { struct ifreq ifrdata = { .ifr_data = data }; switch (cmd) { case BOND_INFO_QUERY_OLD: return bond_do_ioctl(bond_dev, &ifrdata, SIOCBONDINFOQUERY); case BOND_SLAVE_INFO_QUERY_OLD: return bond_do_ioctl(bond_dev, &ifrdata, SIOCBONDSLAVEINFOQUERY); case BOND_ENSLAVE_OLD: return bond_do_ioctl(bond_dev, ifr, SIOCBONDENSLAVE); case BOND_RELEASE_OLD: return bond_do_ioctl(bond_dev, ifr, SIOCBONDRELEASE); case BOND_SETHWADDR_OLD: return bond_do_ioctl(bond_dev, ifr, SIOCBONDSETHWADDR); case BOND_CHANGE_ACTIVE_OLD: return bond_do_ioctl(bond_dev, ifr, SIOCBONDCHANGEACTIVE); } return -EOPNOTSUPP; } static void bond_change_rx_flags(struct net_device *bond_dev, int change) { struct bonding *bond = netdev_priv(bond_dev); if (change & IFF_PROMISC) bond_set_promiscuity(bond, bond_dev->flags & IFF_PROMISC ? 1 : -1); if (change & IFF_ALLMULTI) bond_set_allmulti(bond, bond_dev->flags & IFF_ALLMULTI ? 1 : -1); } static void bond_set_rx_mode(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; rcu_read_lock(); if (bond_uses_primary(bond)) { slave = rcu_dereference(bond->curr_active_slave); if (slave) { dev_uc_sync(slave->dev, bond_dev); dev_mc_sync(slave->dev, bond_dev); } } else { bond_for_each_slave_rcu(bond, slave, iter) { dev_uc_sync_multiple(slave->dev, bond_dev); dev_mc_sync_multiple(slave->dev, bond_dev); } } rcu_read_unlock(); } static int bond_neigh_init(struct neighbour *n) { struct bonding *bond = netdev_priv(n->dev); const struct net_device_ops *slave_ops; struct neigh_parms parms; struct slave *slave; int ret = 0; rcu_read_lock(); slave = bond_first_slave_rcu(bond); if (!slave) goto out; slave_ops = slave->dev->netdev_ops; if (!slave_ops->ndo_neigh_setup) goto out; /* TODO: find another way [1] to implement this. * Passing a zeroed structure is fragile, * but at least we do not pass garbage. * * [1] One way would be that ndo_neigh_setup() never touch * struct neigh_parms, but propagate the new neigh_setup() * back to ___neigh_create() / neigh_parms_alloc() */ memset(&parms, 0, sizeof(parms)); ret = slave_ops->ndo_neigh_setup(slave->dev, &parms); if (ret) goto out; if (parms.neigh_setup) ret = parms.neigh_setup(n); out: rcu_read_unlock(); return ret; } /* The bonding ndo_neigh_setup is called at init time beofre any * slave exists. So we must declare proxy setup function which will * be used at run time to resolve the actual slave neigh param setup. * * It's also called by master devices (such as vlans) to setup their * underlying devices. In that case - do nothing, we're already set up from * our init. */ static int bond_neigh_setup(struct net_device *dev, struct neigh_parms *parms) { /* modify only our neigh_parms */ if (parms->dev == dev) parms->neigh_setup = bond_neigh_init; return 0; } /* Change the MTU of all of a master's slaves to match the master */ static int bond_change_mtu(struct net_device *bond_dev, int new_mtu) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *rollback_slave; struct list_head *iter; int res = 0; netdev_dbg(bond_dev, "bond=%p, new_mtu=%d\n", bond, new_mtu); bond_for_each_slave(bond, slave, iter) { slave_dbg(bond_dev, slave->dev, "s %p c_m %p\n", slave, slave->dev->netdev_ops->ndo_change_mtu); res = dev_set_mtu(slave->dev, new_mtu); if (res) { /* If we failed to set the slave's mtu to the new value * we must abort the operation even in ACTIVE_BACKUP * mode, because if we allow the backup slaves to have * different mtu values than the active slave we'll * need to change their mtu when doing a failover. That * means changing their mtu from timer context, which * is probably not a good idea. */ slave_dbg(bond_dev, slave->dev, "err %d setting mtu to %d\n", res, new_mtu); goto unwind; } } WRITE_ONCE(bond_dev->mtu, new_mtu); return 0; unwind: /* unwind from head to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { int tmp_res; if (rollback_slave == slave) break; tmp_res = dev_set_mtu(rollback_slave->dev, bond_dev->mtu); if (tmp_res) slave_dbg(bond_dev, rollback_slave->dev, "unwind err %d\n", tmp_res); } return res; } /* Change HW address * * Note that many devices must be down to change the HW address, and * downing the master releases all slaves. We can make bonds full of * bonding devices to test this, however. */ static int bond_set_mac_address(struct net_device *bond_dev, void *addr) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *rollback_slave; struct sockaddr_storage *ss = addr, tmp_ss; struct list_head *iter; int res = 0; if (BOND_MODE(bond) == BOND_MODE_ALB) return bond_alb_set_mac_address(bond_dev, addr); netdev_dbg(bond_dev, "%s: bond=%p\n", __func__, bond); /* If fail_over_mac is enabled, do nothing and return success. * Returning an error causes ifenslave to fail. */ if (bond->params.fail_over_mac && BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) return 0; if (!is_valid_ether_addr(ss->__data)) return -EADDRNOTAVAIL; bond_for_each_slave(bond, slave, iter) { slave_dbg(bond_dev, slave->dev, "%s: slave=%p\n", __func__, slave); res = dev_set_mac_address(slave->dev, addr, NULL); if (res) { /* TODO: consider downing the slave * and retry ? * User should expect communications * breakage anyway until ARP finish * updating, so... */ slave_dbg(bond_dev, slave->dev, "%s: err %d\n", __func__, res); goto unwind; } } /* success */ dev_addr_set(bond_dev, ss->__data); return 0; unwind: memcpy(tmp_ss.__data, bond_dev->dev_addr, bond_dev->addr_len); tmp_ss.ss_family = bond_dev->type; /* unwind from head to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { int tmp_res; if (rollback_slave == slave) break; tmp_res = dev_set_mac_address(rollback_slave->dev, &tmp_ss, NULL); if (tmp_res) { slave_dbg(bond_dev, rollback_slave->dev, "%s: unwind err %d\n", __func__, tmp_res); } } return res; } /** * bond_get_slave_by_id - get xmit slave with slave_id * @bond: bonding device that is transmitting * @slave_id: slave id up to slave_cnt-1 through which to transmit * * This function tries to get slave with slave_id but in case * it fails, it tries to find the first available slave for transmission. */ static struct slave *bond_get_slave_by_id(struct bonding *bond, int slave_id) { struct list_head *iter; struct slave *slave; int i = slave_id; /* Here we start from the slave with slave_id */ bond_for_each_slave_rcu(bond, slave, iter) { if (--i < 0) { if (bond_slave_can_tx(slave)) return slave; } } /* Here we start from the first slave up to slave_id */ i = slave_id; bond_for_each_slave_rcu(bond, slave, iter) { if (--i < 0) break; if (bond_slave_can_tx(slave)) return slave; } /* no slave that can tx has been found */ return NULL; } /** * bond_rr_gen_slave_id - generate slave id based on packets_per_slave * @bond: bonding device to use * * Based on the value of the bonding device's packets_per_slave parameter * this function generates a slave id, which is usually used as the next * slave to transmit through. */ static u32 bond_rr_gen_slave_id(struct bonding *bond) { u32 slave_id; struct reciprocal_value reciprocal_packets_per_slave; int packets_per_slave = bond->params.packets_per_slave; switch (packets_per_slave) { case 0: slave_id = get_random_u32(); break; case 1: slave_id = this_cpu_inc_return(*bond->rr_tx_counter); break; default: reciprocal_packets_per_slave = bond->params.reciprocal_packets_per_slave; slave_id = this_cpu_inc_return(*bond->rr_tx_counter); slave_id = reciprocal_divide(slave_id, reciprocal_packets_per_slave); break; } return slave_id; } static struct slave *bond_xmit_roundrobin_slave_get(struct bonding *bond, struct sk_buff *skb) { struct slave *slave; int slave_cnt; u32 slave_id; /* Start with the curr_active_slave that joined the bond as the * default for sending IGMP traffic. For failover purposes one * needs to maintain some consistency for the interface that will * send the join/membership reports. The curr_active_slave found * will send all of this type of traffic. */ if (skb->protocol == htons(ETH_P_IP)) { int noff = skb_network_offset(skb); struct iphdr *iph; if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph)))) goto non_igmp; iph = ip_hdr(skb); if (iph->protocol == IPPROTO_IGMP) { slave = rcu_dereference(bond->curr_active_slave); if (slave) return slave; return bond_get_slave_by_id(bond, 0); } } non_igmp: slave_cnt = READ_ONCE(bond->slave_cnt); if (likely(slave_cnt)) { slave_id = bond_rr_gen_slave_id(bond) % slave_cnt; return bond_get_slave_by_id(bond, slave_id); } return NULL; } static struct slave *bond_xdp_xmit_roundrobin_slave_get(struct bonding *bond, struct xdp_buff *xdp) { struct slave *slave; int slave_cnt; u32 slave_id; const struct ethhdr *eth; void *data = xdp->data; if (data + sizeof(struct ethhdr) > xdp->data_end) goto non_igmp; eth = (struct ethhdr *)data; data += sizeof(struct ethhdr); /* See comment on IGMP in bond_xmit_roundrobin_slave_get() */ if (eth->h_proto == htons(ETH_P_IP)) { const struct iphdr *iph; if (data + sizeof(struct iphdr) > xdp->data_end) goto non_igmp; iph = (struct iphdr *)data; if (iph->protocol == IPPROTO_IGMP) { slave = rcu_dereference(bond->curr_active_slave); if (slave) return slave; return bond_get_slave_by_id(bond, 0); } } non_igmp: slave_cnt = READ_ONCE(bond->slave_cnt); if (likely(slave_cnt)) { slave_id = bond_rr_gen_slave_id(bond) % slave_cnt; return bond_get_slave_by_id(bond, slave_id); } return NULL; } static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; slave = bond_xmit_roundrobin_slave_get(bond, skb); if (likely(slave)) return bond_dev_queue_xmit(bond, skb, slave->dev); return bond_tx_drop(bond_dev, skb); } static struct slave *bond_xmit_activebackup_slave_get(struct bonding *bond) { return rcu_dereference(bond->curr_active_slave); } /* In active-backup mode, we know that bond->curr_active_slave is always valid if * the bond has a usable interface. */ static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; slave = bond_xmit_activebackup_slave_get(bond); if (slave) return bond_dev_queue_xmit(bond, skb, slave->dev); return bond_tx_drop(bond_dev, skb); } /* Use this to update slave_array when (a) it's not appropriate to update * slave_array right away (note that update_slave_array() may sleep) * and / or (b) RTNL is not held. */ void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay) { queue_delayed_work(bond->wq, &bond->slave_arr_work, delay); } /* Slave array work handler. Holds only RTNL */ static void bond_slave_arr_handler(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, slave_arr_work.work); int ret; if (!rtnl_trylock()) goto err; ret = bond_update_slave_arr(bond, NULL); rtnl_unlock(); if (ret) { pr_warn_ratelimited("Failed to update slave array from WT\n"); goto err; } return; err: bond_slave_arr_work_rearm(bond, 1); } static void bond_skip_slave(struct bond_up_slave *slaves, struct slave *skipslave) { int idx; /* Rare situation where caller has asked to skip a specific * slave but allocation failed (most likely!). BTW this is * only possible when the call is initiated from * __bond_release_one(). In this situation; overwrite the * skipslave entry in the array with the last entry from the * array to avoid a situation where the xmit path may choose * this to-be-skipped slave to send a packet out. */ for (idx = 0; slaves && idx < slaves->count; idx++) { if (skipslave == slaves->arr[idx]) { slaves->arr[idx] = slaves->arr[slaves->count - 1]; slaves->count--; break; } } } static void bond_set_slave_arr(struct bonding *bond, struct bond_up_slave *usable_slaves, struct bond_up_slave *all_slaves) { struct bond_up_slave *usable, *all; usable = rtnl_dereference(bond->usable_slaves); rcu_assign_pointer(bond->usable_slaves, usable_slaves); kfree_rcu(usable, rcu); all = rtnl_dereference(bond->all_slaves); rcu_assign_pointer(bond->all_slaves, all_slaves); kfree_rcu(all, rcu); } static void bond_reset_slave_arr(struct bonding *bond) { bond_set_slave_arr(bond, NULL, NULL); } /* Build the usable slaves array in control path for modes that use xmit-hash * to determine the slave interface - * (a) BOND_MODE_8023AD * (b) BOND_MODE_XOR * (c) (BOND_MODE_TLB || BOND_MODE_ALB) && tlb_dynamic_lb == 0 * * The caller is expected to hold RTNL only and NO other lock! */ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) { struct bond_up_slave *usable_slaves = NULL, *all_slaves = NULL; struct slave *slave; struct list_head *iter; int agg_id = 0; int ret = 0; might_sleep(); usable_slaves = kzalloc(struct_size(usable_slaves, arr, bond->slave_cnt), GFP_KERNEL); all_slaves = kzalloc(struct_size(all_slaves, arr, bond->slave_cnt), GFP_KERNEL); if (!usable_slaves || !all_slaves) { ret = -ENOMEM; goto out; } if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; spin_lock_bh(&bond->mode_lock); if (bond_3ad_get_active_agg_info(bond, &ad_info)) { spin_unlock_bh(&bond->mode_lock); pr_debug("bond_3ad_get_active_agg_info failed\n"); /* No active aggragator means it's not safe to use * the previous array. */ bond_reset_slave_arr(bond); goto out; } spin_unlock_bh(&bond->mode_lock); agg_id = ad_info.aggregator_id; } bond_for_each_slave(bond, slave, iter) { if (skipslave == slave) continue; all_slaves->arr[all_slaves->count++] = slave; if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct aggregator *agg; agg = SLAVE_AD_INFO(slave)->port.aggregator; if (!agg || agg->aggregator_identifier != agg_id) continue; } if (!bond_slave_can_tx(slave)) continue; slave_dbg(bond->dev, slave->dev, "Adding slave to tx hash array[%d]\n", usable_slaves->count); usable_slaves->arr[usable_slaves->count++] = slave; } bond_set_slave_arr(bond, usable_slaves, all_slaves); return ret; out: if (ret != 0 && skipslave) { bond_skip_slave(rtnl_dereference(bond->all_slaves), skipslave); bond_skip_slave(rtnl_dereference(bond->usable_slaves), skipslave); } kfree_rcu(all_slaves, rcu); kfree_rcu(usable_slaves, rcu); return ret; } static struct slave *bond_xmit_3ad_xor_slave_get(struct bonding *bond, struct sk_buff *skb, struct bond_up_slave *slaves) { struct slave *slave; unsigned int count; u32 hash; hash = bond_xmit_hash(bond, skb); count = slaves ? READ_ONCE(slaves->count) : 0; if (unlikely(!count)) return NULL; slave = slaves->arr[hash % count]; return slave; } static struct slave *bond_xdp_xmit_3ad_xor_slave_get(struct bonding *bond, struct xdp_buff *xdp) { struct bond_up_slave *slaves; unsigned int count; u32 hash; hash = bond_xmit_hash_xdp(bond, xdp); slaves = rcu_dereference(bond->usable_slaves); count = slaves ? READ_ONCE(slaves->count) : 0; if (unlikely(!count)) return NULL; return slaves->arr[hash % count]; } /* Use this Xmit function for 3AD as well as XOR modes. The current * usable slave array is formed in the control path. The xmit function * just calculates hash and sends the packet out. */ static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); struct bond_up_slave *slaves; struct slave *slave; slaves = rcu_dereference(bond->usable_slaves); slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves); if (likely(slave)) return bond_dev_queue_xmit(bond, skb, slave->dev); return bond_tx_drop(dev, skb); } /* in broadcast mode, we send everything to all usable interfaces. */ static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave = NULL; struct list_head *iter; bool xmit_suc = false; bool skb_used = false; bond_for_each_slave_rcu(bond, slave, iter) { struct sk_buff *skb2; if (!(bond_slave_is_up(slave) && slave->link == BOND_LINK_UP)) continue; if (bond_is_last_slave(bond, slave)) { skb2 = skb; skb_used = true; } else { skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) { net_err_ratelimited("%s: Error: %s: skb_clone() failed\n", bond_dev->name, __func__); continue; } } if (bond_dev_queue_xmit(bond, skb2, slave->dev) == NETDEV_TX_OK) xmit_suc = true; } if (!skb_used) dev_kfree_skb_any(skb); if (xmit_suc) return NETDEV_TX_OK; dev_core_stats_tx_dropped_inc(bond_dev); return NET_XMIT_DROP; } /*------------------------- Device initialization ---------------------------*/ /* Lookup the slave that corresponds to a qid */ static inline int bond_slave_override(struct bonding *bond, struct sk_buff *skb) { struct slave *slave = NULL; struct list_head *iter; if (!skb_rx_queue_recorded(skb)) return 1; /* Find out if any slaves have the same mapping as this skb. */ bond_for_each_slave_rcu(bond, slave, iter) { if (READ_ONCE(slave->queue_id) == skb_get_queue_mapping(skb)) { if (bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) { bond_dev_queue_xmit(bond, skb, slave->dev); return 0; } /* If the slave isn't UP, use default transmit policy. */ break; } } return 1; } static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { /* This helper function exists to help dev_pick_tx get the correct * destination queue. Using a helper function skips a call to * skb_tx_hash and will put the skbs in the queue we expect on their * way down to the bonding driver. */ u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0; /* Save the original txq to restore before passing to the driver */ qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb_get_queue_mapping(skb); if (unlikely(txq >= dev->real_num_tx_queues)) { do { txq -= dev->real_num_tx_queues; } while (txq >= dev->real_num_tx_queues); } return txq; } static struct net_device *bond_xmit_get_slave(struct net_device *master_dev, struct sk_buff *skb, bool all_slaves) { struct bonding *bond = netdev_priv(master_dev); struct bond_up_slave *slaves; struct slave *slave = NULL; switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: slave = bond_xmit_roundrobin_slave_get(bond, skb); break; case BOND_MODE_ACTIVEBACKUP: slave = bond_xmit_activebackup_slave_get(bond); break; case BOND_MODE_8023AD: case BOND_MODE_XOR: if (all_slaves) slaves = rcu_dereference(bond->all_slaves); else slaves = rcu_dereference(bond->usable_slaves); slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves); break; case BOND_MODE_BROADCAST: break; case BOND_MODE_ALB: slave = bond_xmit_alb_slave_get(bond, skb); break; case BOND_MODE_TLB: slave = bond_xmit_tlb_slave_get(bond, skb); break; default: /* Should never happen, mode already checked */ WARN_ONCE(true, "Unknown bonding mode"); break; } if (slave) return slave->dev; return NULL; } static void bond_sk_to_flow(struct sock *sk, struct flow_keys *flow) { switch (sk->sk_family) { #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: if (ipv6_only_sock(sk) || ipv6_addr_type(&sk->sk_v6_daddr) != IPV6_ADDR_MAPPED) { flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; flow->addrs.v6addrs.src = inet6_sk(sk)->saddr; flow->addrs.v6addrs.dst = sk->sk_v6_daddr; break; } fallthrough; #endif default: /* AF_INET */ flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; flow->addrs.v4addrs.src = inet_sk(sk)->inet_rcv_saddr; flow->addrs.v4addrs.dst = inet_sk(sk)->inet_daddr; break; } flow->ports.src = inet_sk(sk)->inet_sport; flow->ports.dst = inet_sk(sk)->inet_dport; } /** * bond_sk_hash_l34 - generate a hash value based on the socket's L3 and L4 fields * @sk: socket to use for headers * * This function will extract the necessary field from the socket and use * them to generate a hash based on the LAYER34 xmit_policy. * Assumes that sk is a TCP or UDP socket. */ static u32 bond_sk_hash_l34(struct sock *sk) { struct flow_keys flow; u32 hash; bond_sk_to_flow(sk, &flow); /* L4 */ memcpy(&hash, &flow.ports.ports, sizeof(hash)); /* L3 */ return bond_ip_hash(hash, &flow, BOND_XMIT_POLICY_LAYER34); } static struct net_device *__bond_sk_get_lower_dev(struct bonding *bond, struct sock *sk) { struct bond_up_slave *slaves; struct slave *slave; unsigned int count; u32 hash; slaves = rcu_dereference(bond->usable_slaves); count = slaves ? READ_ONCE(slaves->count) : 0; if (unlikely(!count)) return NULL; hash = bond_sk_hash_l34(sk); slave = slaves->arr[hash % count]; return slave->dev; } static struct net_device *bond_sk_get_lower_dev(struct net_device *dev, struct sock *sk) { struct bonding *bond = netdev_priv(dev); struct net_device *lower = NULL; rcu_read_lock(); if (bond_sk_check(bond)) lower = __bond_sk_get_lower_dev(bond, sk); rcu_read_unlock(); return lower; } #if IS_ENABLED(CONFIG_TLS_DEVICE) static netdev_tx_t bond_tls_device_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *dev) { struct net_device *tls_netdev = rcu_dereference(tls_get_ctx(skb->sk)->netdev); /* tls_netdev might become NULL, even if tls_is_skb_tx_device_offloaded * was true, if tls_device_down is running in parallel, but it's OK, * because bond_get_slave_by_dev has a NULL check. */ if (likely(bond_get_slave_by_dev(bond, tls_netdev))) return bond_dev_queue_xmit(bond, skb, tls_netdev); return bond_tx_drop(dev, skb); } #endif static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); if (bond_should_override_tx_queue(bond) && !bond_slave_override(bond, skb)) return NETDEV_TX_OK; #if IS_ENABLED(CONFIG_TLS_DEVICE) if (tls_is_skb_tx_device_offloaded(skb)) return bond_tls_device_xmit(bond, skb, dev); #endif switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: return bond_xmit_roundrobin(skb, dev); case BOND_MODE_ACTIVEBACKUP: return bond_xmit_activebackup(skb, dev); case BOND_MODE_8023AD: case BOND_MODE_XOR: return bond_3ad_xor_xmit(skb, dev); case BOND_MODE_BROADCAST: return bond_xmit_broadcast(skb, dev); case BOND_MODE_ALB: return bond_alb_xmit(skb, dev); case BOND_MODE_TLB: return bond_tlb_xmit(skb, dev); default: /* Should never happen, mode already checked */ netdev_err(dev, "Unknown bonding mode %d\n", BOND_MODE(bond)); WARN_ON_ONCE(1); return bond_tx_drop(dev, skb); } } static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); netdev_tx_t ret = NETDEV_TX_OK; /* If we risk deadlock from transmitting this in the * netpoll path, tell netpoll to queue the frame for later tx */ if (unlikely(is_netpoll_tx_blocked(dev))) return NETDEV_TX_BUSY; rcu_read_lock(); if (bond_has_slaves(bond)) ret = __bond_start_xmit(skb, dev); else ret = bond_tx_drop(dev, skb); rcu_read_unlock(); return ret; } static struct net_device * bond_xdp_get_xmit_slave(struct net_device *bond_dev, struct xdp_buff *xdp) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; /* Caller needs to hold rcu_read_lock() */ switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: slave = bond_xdp_xmit_roundrobin_slave_get(bond, xdp); break; case BOND_MODE_ACTIVEBACKUP: slave = bond_xmit_activebackup_slave_get(bond); break; case BOND_MODE_8023AD: case BOND_MODE_XOR: slave = bond_xdp_xmit_3ad_xor_slave_get(bond, xdp); break; default: if (net_ratelimit()) netdev_err(bond_dev, "Unknown bonding mode %d for xdp xmit\n", BOND_MODE(bond)); return NULL; } if (slave) return slave->dev; return NULL; } static int bond_xdp_xmit(struct net_device *bond_dev, int n, struct xdp_frame **frames, u32 flags) { int nxmit, err = -ENXIO; rcu_read_lock(); for (nxmit = 0; nxmit < n; nxmit++) { struct xdp_frame *frame = frames[nxmit]; struct xdp_frame *frames1[] = {frame}; struct net_device *slave_dev; struct xdp_buff xdp; xdp_convert_frame_to_buff(frame, &xdp); slave_dev = bond_xdp_get_xmit_slave(bond_dev, &xdp); if (!slave_dev) { err = -ENXIO; break; } err = slave_dev->netdev_ops->ndo_xdp_xmit(slave_dev, 1, frames1, flags); if (err < 1) break; } rcu_read_unlock(); /* If error happened on the first frame then we can pass the error up, otherwise * report the number of frames that were xmitted. */ if (err < 0) return (nxmit == 0 ? err : nxmit); return nxmit; } static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; struct slave *slave, *rollback_slave; struct bpf_prog *old_prog; struct netdev_bpf xdp = { .command = XDP_SETUP_PROG, .flags = 0, .prog = prog, .extack = extack, }; int err; ASSERT_RTNL(); if (!bond_xdp_check(bond, BOND_MODE(bond))) { BOND_NL_ERR(dev, extack, "No native XDP support for the current bonding mode"); return -EOPNOTSUPP; } old_prog = bond->xdp_prog; bond->xdp_prog = prog; bond_for_each_slave(bond, slave, iter) { struct net_device *slave_dev = slave->dev; if (!slave_dev->netdev_ops->ndo_bpf || !slave_dev->netdev_ops->ndo_xdp_xmit) { SLAVE_NL_ERR(dev, slave_dev, extack, "Slave device does not support XDP"); err = -EOPNOTSUPP; goto err; } if (dev_xdp_prog_count(slave_dev) > 0) { SLAVE_NL_ERR(dev, slave_dev, extack, "Slave has XDP program loaded, please unload before enslaving"); err = -EOPNOTSUPP; goto err; } err = dev_xdp_propagate(slave_dev, &xdp); if (err < 0) { /* ndo_bpf() sets extack error message */ slave_err(dev, slave_dev, "Error %d calling ndo_bpf\n", err); goto err; } if (prog) bpf_prog_inc(prog); } if (prog) { static_branch_inc(&bpf_master_redirect_enabled_key); } else if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&bpf_master_redirect_enabled_key); } return 0; err: /* unwind the program changes */ bond->xdp_prog = old_prog; xdp.prog = old_prog; xdp.extack = NULL; /* do not overwrite original error */ bond_for_each_slave(bond, rollback_slave, iter) { struct net_device *slave_dev = rollback_slave->dev; int err_unwind; if (slave == rollback_slave) break; err_unwind = dev_xdp_propagate(slave_dev, &xdp); if (err_unwind < 0) slave_err(dev, slave_dev, "Error %d when unwinding XDP program change\n", err_unwind); else if (xdp.prog) bpf_prog_inc(xdp.prog); } return err; } static int bond_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return bond_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed) { if (speed == 0 || speed == SPEED_UNKNOWN) speed = slave->speed; else speed = min(speed, slave->speed); return speed; } /* Set the BOND_PHC_INDEX flag to notify user space */ static int bond_set_phc_index_flag(struct kernel_hwtstamp_config *kernel_cfg) { struct ifreq *ifr = kernel_cfg->ifr; struct hwtstamp_config cfg; if (kernel_cfg->copied_to_user) { /* Lower device has a legacy implementation */ if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) return -EFAULT; cfg.flags |= HWTSTAMP_FLAG_BONDED_PHC_INDEX; if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) return -EFAULT; } else { kernel_cfg->flags |= HWTSTAMP_FLAG_BONDED_PHC_INDEX; } return 0; } static int bond_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct bonding *bond = netdev_priv(dev); struct net_device *real_dev; int err; real_dev = bond_option_active_slave_get_rcu(bond); if (!real_dev) return -EOPNOTSUPP; err = generic_hwtstamp_get_lower(real_dev, cfg); if (err) return err; return bond_set_phc_index_flag(cfg); } static int bond_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(dev); struct net_device *real_dev; int err; if (!(cfg->flags & HWTSTAMP_FLAG_BONDED_PHC_INDEX)) return -EOPNOTSUPP; real_dev = bond_option_active_slave_get_rcu(bond); if (!real_dev) return -EOPNOTSUPP; err = generic_hwtstamp_set_lower(real_dev, cfg, extack); if (err) return err; return bond_set_phc_index_flag(cfg); } static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev, struct ethtool_link_ksettings *cmd) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; u32 speed = 0; cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; /* Since bond_slave_can_tx returns false for all inactive or down slaves, we * do not need to check mode. Though link speed might not represent * the true receive or transmit bandwidth (not all modes are symmetric) * this is an accurate maximum. */ bond_for_each_slave(bond, slave, iter) { if (bond_slave_can_tx(slave)) { bond_update_speed_duplex(slave); if (slave->speed != SPEED_UNKNOWN) { if (BOND_MODE(bond) == BOND_MODE_BROADCAST) speed = bond_mode_bcast_speed(slave, speed); else speed += slave->speed; } if (cmd->base.duplex == DUPLEX_UNKNOWN && slave->duplex != DUPLEX_UNKNOWN) cmd->base.duplex = slave->duplex; } } cmd->base.speed = speed ? : SPEED_UNKNOWN; return 0; } static void bond_ethtool_get_drvinfo(struct net_device *bond_dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d", BOND_ABI_VERSION); } static int bond_ethtool_get_ts_info(struct net_device *bond_dev, struct kernel_ethtool_ts_info *info) { struct bonding *bond = netdev_priv(bond_dev); struct kernel_ethtool_ts_info ts_info; struct net_device *real_dev; bool sw_tx_support = false; struct list_head *iter; struct slave *slave; int ret = 0; rcu_read_lock(); real_dev = bond_option_active_slave_get_rcu(bond); dev_hold(real_dev); rcu_read_unlock(); if (real_dev) { ret = ethtool_get_ts_info_by_layer(real_dev, info); } else { /* Check if all slaves support software tx timestamping */ rcu_read_lock(); bond_for_each_slave_rcu(bond, slave, iter) { ret = ethtool_get_ts_info_by_layer(slave->dev, &ts_info); if (!ret && (ts_info.so_timestamping & SOF_TIMESTAMPING_TX_SOFTWARE)) { sw_tx_support = true; continue; } sw_tx_support = false; break; } rcu_read_unlock(); } if (sw_tx_support) info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE; dev_put(real_dev); return ret; } static const struct ethtool_ops bond_ethtool_ops = { .get_drvinfo = bond_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = bond_ethtool_get_link_ksettings, .get_ts_info = bond_ethtool_get_ts_info, }; static const struct net_device_ops bond_netdev_ops = { .ndo_init = bond_init, .ndo_uninit = bond_uninit, .ndo_open = bond_open, .ndo_stop = bond_close, .ndo_start_xmit = bond_start_xmit, .ndo_select_queue = bond_select_queue, .ndo_get_stats64 = bond_get_stats, .ndo_eth_ioctl = bond_eth_ioctl, .ndo_siocbond = bond_do_ioctl, .ndo_siocdevprivate = bond_siocdevprivate, .ndo_change_rx_flags = bond_change_rx_flags, .ndo_set_rx_mode = bond_set_rx_mode, .ndo_change_mtu = bond_change_mtu, .ndo_set_mac_address = bond_set_mac_address, .ndo_neigh_setup = bond_neigh_setup, .ndo_vlan_rx_add_vid = bond_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = bond_vlan_rx_kill_vid, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = bond_netpoll_setup, .ndo_netpoll_cleanup = bond_netpoll_cleanup, .ndo_poll_controller = bond_poll_controller, #endif .ndo_add_slave = bond_enslave, .ndo_del_slave = bond_release, .ndo_fix_features = bond_fix_features, .ndo_features_check = passthru_features_check, .ndo_get_xmit_slave = bond_xmit_get_slave, .ndo_sk_get_lower_dev = bond_sk_get_lower_dev, .ndo_bpf = bond_xdp, .ndo_xdp_xmit = bond_xdp_xmit, .ndo_xdp_get_xmit_slave = bond_xdp_get_xmit_slave, .ndo_hwtstamp_get = bond_hwtstamp_get, .ndo_hwtstamp_set = bond_hwtstamp_set, }; static const struct device_type bond_type = { .name = "bond", }; static void bond_destructor(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); if (bond->wq) destroy_workqueue(bond->wq); free_percpu(bond->rr_tx_counter); } void bond_setup(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); spin_lock_init(&bond->mode_lock); bond->params = bonding_defaults; /* Initialize pointers */ bond->dev = bond_dev; /* Initialize the device entry points */ ether_setup(bond_dev); bond_dev->max_mtu = ETH_MAX_MTU; bond_dev->netdev_ops = &bond_netdev_ops; bond_dev->ethtool_ops = &bond_ethtool_ops; bond_dev->needs_free_netdev = true; bond_dev->priv_destructor = bond_destructor; SET_NETDEV_DEVTYPE(bond_dev, &bond_type); /* Initialize the device options */ bond_dev->flags |= IFF_MASTER; bond_dev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT | IFF_NO_QUEUE; bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); #ifdef CONFIG_XFRM_OFFLOAD /* set up xfrm device ops (only supported in active-backup right now) */ bond_dev->xfrmdev_ops = &bond_xfrmdev_ops; INIT_LIST_HEAD(&bond->ipsec_list); mutex_init(&bond->ipsec_lock); #endif /* CONFIG_XFRM_OFFLOAD */ /* don't acquire bond device's netif_tx_lock when transmitting */ bond_dev->lltx = true; /* Don't allow bond devices to change network namespaces. */ bond_dev->netns_immutable = true; /* By default, we declare the bond to be fully * VLAN hardware accelerated capable. Special * care is taken in the various xmit functions * when there are slaves that are not hw accel * capable */ bond_dev->hw_features = BOND_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_FILTER; bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; bond_dev->features |= bond_dev->hw_features; bond_dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; bond_dev->features |= NETIF_F_GSO_PARTIAL; #ifdef CONFIG_XFRM_OFFLOAD bond_dev->hw_features |= BOND_XFRM_FEATURES; /* Only enable XFRM features if this is an active-backup config */ if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) bond_dev->features |= BOND_XFRM_FEATURES; #endif /* CONFIG_XFRM_OFFLOAD */ } /* Destroy a bonding device. * Must be under rtnl_lock when this function is called. */ static void bond_uninit(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; bond_netpoll_cleanup(bond_dev); /* Release the bonded slaves */ bond_for_each_slave(bond, slave, iter) __bond_release_one(bond_dev, slave->dev, true, true); netdev_info(bond_dev, "Released all slaves\n"); #ifdef CONFIG_XFRM_OFFLOAD mutex_destroy(&bond->ipsec_lock); #endif /* CONFIG_XFRM_OFFLOAD */ bond_set_slave_arr(bond, NULL, NULL); list_del_rcu(&bond->bond_list); bond_debug_unregister(bond); } /*------------------------- Module initialization ---------------------------*/ static int __init bond_check_params(struct bond_params *params) { int arp_validate_value, fail_over_mac_value, primary_reselect_value, i; struct bond_opt_value newval; const struct bond_opt_value *valptr; int arp_all_targets_value = 0; u16 ad_actor_sys_prio = 0; u16 ad_user_port_key = 0; __be32 arp_target[BOND_MAX_ARP_TARGETS] = { 0 }; int arp_ip_count; int bond_mode = BOND_MODE_ROUNDROBIN; int xmit_hashtype = BOND_XMIT_POLICY_LAYER2; int lacp_fast = 0; int tlb_dynamic_lb; /* Convert string parameters. */ if (mode) { bond_opt_initstr(&newval, mode); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_MODE), &newval); if (!valptr) { pr_err("Error: Invalid bonding mode \"%s\"\n", mode); return -EINVAL; } bond_mode = valptr->value; } if (xmit_hash_policy) { if (bond_mode == BOND_MODE_ROUNDROBIN || bond_mode == BOND_MODE_ACTIVEBACKUP || bond_mode == BOND_MODE_BROADCAST) { pr_info("xmit_hash_policy param is irrelevant in mode %s\n", bond_mode_name(bond_mode)); } else { bond_opt_initstr(&newval, xmit_hash_policy); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_XMIT_HASH), &newval); if (!valptr) { pr_err("Error: Invalid xmit_hash_policy \"%s\"\n", xmit_hash_policy); return -EINVAL; } xmit_hashtype = valptr->value; } } if (lacp_rate) { if (bond_mode != BOND_MODE_8023AD) { pr_info("lacp_rate param is irrelevant in mode %s\n", bond_mode_name(bond_mode)); } else { bond_opt_initstr(&newval, lacp_rate); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_LACP_RATE), &newval); if (!valptr) { pr_err("Error: Invalid lacp rate \"%s\"\n", lacp_rate); return -EINVAL; } lacp_fast = valptr->value; } } if (ad_select) { bond_opt_initstr(&newval, ad_select); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_SELECT), &newval); if (!valptr) { pr_err("Error: Invalid ad_select \"%s\"\n", ad_select); return -EINVAL; } params->ad_select = valptr->value; if (bond_mode != BOND_MODE_8023AD) pr_warn("ad_select param only affects 802.3ad mode\n"); } else { params->ad_select = BOND_AD_STABLE; } if (max_bonds < 0) { pr_warn("Warning: max_bonds (%d) not in range %d-%d, so it was reset to BOND_DEFAULT_MAX_BONDS (%d)\n", max_bonds, 0, INT_MAX, BOND_DEFAULT_MAX_BONDS); max_bonds = BOND_DEFAULT_MAX_BONDS; } if (miimon < 0) { pr_warn("Warning: miimon module parameter (%d), not in range 0-%d, so it was reset to 0\n", miimon, INT_MAX); miimon = 0; } if (updelay < 0) { pr_warn("Warning: updelay module parameter (%d), not in range 0-%d, so it was reset to 0\n", updelay, INT_MAX); updelay = 0; } if (downdelay < 0) { pr_warn("Warning: downdelay module parameter (%d), not in range 0-%d, so it was reset to 0\n", downdelay, INT_MAX); downdelay = 0; } if ((use_carrier != 0) && (use_carrier != 1)) { pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n", use_carrier); use_carrier = 1; } if (num_peer_notif < 0 || num_peer_notif > 255) { pr_warn("Warning: num_grat_arp/num_unsol_na (%d) not in range 0-255 so it was reset to 1\n", num_peer_notif); num_peer_notif = 1; } /* reset values for 802.3ad/TLB/ALB */ if (!bond_mode_uses_arp(bond_mode)) { if (!miimon) { pr_warn("Warning: miimon must be specified, otherwise bonding will not detect link failure, speed and duplex which are essential for 802.3ad operation\n"); pr_warn("Forcing miimon to 100msec\n"); miimon = BOND_DEFAULT_MIIMON; } } if (tx_queues < 1 || tx_queues > 255) { pr_warn("Warning: tx_queues (%d) should be between 1 and 255, resetting to %d\n", tx_queues, BOND_DEFAULT_TX_QUEUES); tx_queues = BOND_DEFAULT_TX_QUEUES; } if ((all_slaves_active != 0) && (all_slaves_active != 1)) { pr_warn("Warning: all_slaves_active module parameter (%d), not of valid value (0/1), so it was set to 0\n", all_slaves_active); all_slaves_active = 0; } if (resend_igmp < 0 || resend_igmp > 255) { pr_warn("Warning: resend_igmp (%d) should be between 0 and 255, resetting to %d\n", resend_igmp, BOND_DEFAULT_RESEND_IGMP); resend_igmp = BOND_DEFAULT_RESEND_IGMP; } bond_opt_initval(&newval, packets_per_slave); if (!bond_opt_parse(bond_opt_get(BOND_OPT_PACKETS_PER_SLAVE), &newval)) { pr_warn("Warning: packets_per_slave (%d) should be between 0 and %u resetting to 1\n", packets_per_slave, USHRT_MAX); packets_per_slave = 1; } if (bond_mode == BOND_MODE_ALB) { pr_notice("In ALB mode you might experience client disconnections upon reconnection of a link if the bonding module updelay parameter (%d msec) is incompatible with the forwarding delay time of the switch\n", updelay); } if (!miimon) { if (updelay || downdelay) { /* just warn the user the up/down delay will have * no effect since miimon is zero... */ pr_warn("Warning: miimon module parameter not set and updelay (%d) or downdelay (%d) module parameter is set; updelay and downdelay have no effect unless miimon is set\n", updelay, downdelay); } } else { /* don't allow arp monitoring */ if (arp_interval) { pr_warn("Warning: miimon (%d) and arp_interval (%d) can't be used simultaneously, disabling ARP monitoring\n", miimon, arp_interval); arp_interval = 0; } if ((updelay % miimon) != 0) { pr_warn("Warning: updelay (%d) is not a multiple of miimon (%d), updelay rounded to %d ms\n", updelay, miimon, (updelay / miimon) * miimon); } updelay /= miimon; if ((downdelay % miimon) != 0) { pr_warn("Warning: downdelay (%d) is not a multiple of miimon (%d), downdelay rounded to %d ms\n", downdelay, miimon, (downdelay / miimon) * miimon); } downdelay /= miimon; } if (arp_interval < 0) { pr_warn("Warning: arp_interval module parameter (%d), not in range 0-%d, so it was reset to 0\n", arp_interval, INT_MAX); arp_interval = 0; } for (arp_ip_count = 0, i = 0; (arp_ip_count < BOND_MAX_ARP_TARGETS) && arp_ip_target[i]; i++) { __be32 ip; /* not a complete check, but good enough to catch mistakes */ if (!in4_pton(arp_ip_target[i], -1, (u8 *)&ip, -1, NULL) || !bond_is_ip_target_ok(ip)) { pr_warn("Warning: bad arp_ip_target module parameter (%s), ARP monitoring will not be performed\n", arp_ip_target[i]); arp_interval = 0; } else { if (bond_get_targets_ip(arp_target, ip) == -1) arp_target[arp_ip_count++] = ip; else pr_warn("Warning: duplicate address %pI4 in arp_ip_target, skipping\n", &ip); } } if (arp_interval && !arp_ip_count) { /* don't allow arping if no arp_ip_target given... */ pr_warn("Warning: arp_interval module parameter (%d) specified without providing an arp_ip_target parameter, arp_interval was reset to 0\n", arp_interval); arp_interval = 0; } if (arp_validate) { if (!arp_interval) { pr_err("arp_validate requires arp_interval\n"); return -EINVAL; } bond_opt_initstr(&newval, arp_validate); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_VALIDATE), &newval); if (!valptr) { pr_err("Error: invalid arp_validate \"%s\"\n", arp_validate); return -EINVAL; } arp_validate_value = valptr->value; } else { arp_validate_value = 0; } if (arp_all_targets) { bond_opt_initstr(&newval, arp_all_targets); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_ALL_TARGETS), &newval); if (!valptr) { pr_err("Error: invalid arp_all_targets_value \"%s\"\n", arp_all_targets); arp_all_targets_value = 0; } else { arp_all_targets_value = valptr->value; } } if (miimon) { pr_info("MII link monitoring set to %d ms\n", miimon); } else if (arp_interval) { valptr = bond_opt_get_val(BOND_OPT_ARP_VALIDATE, arp_validate_value); pr_info("ARP monitoring set to %d ms, validate %s, with %d target(s):", arp_interval, valptr->string, arp_ip_count); for (i = 0; i < arp_ip_count; i++) pr_cont(" %s", arp_ip_target[i]); pr_cont("\n"); } else if (max_bonds) { /* miimon and arp_interval not set, we need one so things * work as expected, see bonding.txt for details */ pr_debug("Warning: either miimon or arp_interval and arp_ip_target module parameters must be specified, otherwise bonding will not detect link failures! see bonding.txt for details\n"); } if (primary && !bond_mode_uses_primary(bond_mode)) { /* currently, using a primary only makes sense * in active backup, TLB or ALB modes */ pr_warn("Warning: %s primary device specified but has no effect in %s mode\n", primary, bond_mode_name(bond_mode)); primary = NULL; } if (primary && primary_reselect) { bond_opt_initstr(&newval, primary_reselect); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_PRIMARY_RESELECT), &newval); if (!valptr) { pr_err("Error: Invalid primary_reselect \"%s\"\n", primary_reselect); return -EINVAL; } primary_reselect_value = valptr->value; } else { primary_reselect_value = BOND_PRI_RESELECT_ALWAYS; } if (fail_over_mac) { bond_opt_initstr(&newval, fail_over_mac); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_FAIL_OVER_MAC), &newval); if (!valptr) { pr_err("Error: invalid fail_over_mac \"%s\"\n", fail_over_mac); return -EINVAL; } fail_over_mac_value = valptr->value; if (bond_mode != BOND_MODE_ACTIVEBACKUP) pr_warn("Warning: fail_over_mac only affects active-backup mode\n"); } else { fail_over_mac_value = BOND_FOM_NONE; } bond_opt_initstr(&newval, "default"); valptr = bond_opt_parse( bond_opt_get(BOND_OPT_AD_ACTOR_SYS_PRIO), &newval); if (!valptr) { pr_err("Error: No ad_actor_sys_prio default value"); return -EINVAL; } ad_actor_sys_prio = valptr->value; valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_USER_PORT_KEY), &newval); if (!valptr) { pr_err("Error: No ad_user_port_key default value"); return -EINVAL; } ad_user_port_key = valptr->value; bond_opt_initstr(&newval, "default"); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), &newval); if (!valptr) { pr_err("Error: No tlb_dynamic_lb default value"); return -EINVAL; } tlb_dynamic_lb = valptr->value; if (lp_interval == 0) { pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n", INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL); lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL; } /* fill params struct with the proper values */ params->mode = bond_mode; params->xmit_policy = xmit_hashtype; params->miimon = miimon; params->num_peer_notif = num_peer_notif; params->arp_interval = arp_interval; params->arp_validate = arp_validate_value; params->arp_all_targets = arp_all_targets_value; params->missed_max = 2; params->updelay = updelay; params->downdelay = downdelay; params->peer_notif_delay = 0; params->use_carrier = use_carrier; params->lacp_active = 1; params->lacp_fast = lacp_fast; params->primary[0] = 0; params->primary_reselect = primary_reselect_value; params->fail_over_mac = fail_over_mac_value; params->tx_queues = tx_queues; params->all_slaves_active = all_slaves_active; params->resend_igmp = resend_igmp; params->min_links = min_links; params->lp_interval = lp_interval; params->packets_per_slave = packets_per_slave; params->tlb_dynamic_lb = tlb_dynamic_lb; params->ad_actor_sys_prio = ad_actor_sys_prio; eth_zero_addr(params->ad_actor_system); params->ad_user_port_key = ad_user_port_key; params->coupled_control = 1; if (packets_per_slave > 0) { params->reciprocal_packets_per_slave = reciprocal_value(packets_per_slave); } else { /* reciprocal_packets_per_slave is unused if * packets_per_slave is 0 or 1, just initialize it */ params->reciprocal_packets_per_slave = (struct reciprocal_value) { 0 }; } if (primary) strscpy_pad(params->primary, primary, sizeof(params->primary)); memcpy(params->arp_targets, arp_target, sizeof(arp_target)); #if IS_ENABLED(CONFIG_IPV6) memset(params->ns_targets, 0, sizeof(struct in6_addr) * BOND_MAX_NS_TARGETS); #endif return 0; } /* Called from registration process */ static int bond_init(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); netdev_dbg(bond_dev, "Begin bond_init\n"); bond->wq = alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, bond_dev->name); if (!bond->wq) return -ENOMEM; bond->notifier_ctx = false; spin_lock_init(&bond->stats_lock); netdev_lockdep_set_classes(bond_dev); list_add_tail_rcu(&bond->bond_list, &bn->dev_list); bond_prepare_sysfs_group(bond); bond_debug_register(bond); /* Ensure valid dev_addr */ if (is_zero_ether_addr(bond_dev->dev_addr) && bond_dev->addr_assign_type == NET_ADDR_PERM) eth_hw_addr_random(bond_dev); return 0; } unsigned int bond_get_num_tx_queues(void) { return tx_queues; } /* Create a new bond based on the specified name and bonding parameters. * If name is NULL, obtain a suitable "bond%d" name for us. * Caller must NOT hold rtnl_lock; we need to release it here before we * set up our sysfs entries. */ int bond_create(struct net *net, const char *name) { struct net_device *bond_dev; struct bonding *bond; int res = -ENOMEM; rtnl_lock(); bond_dev = alloc_netdev_mq(sizeof(struct bonding), name ? name : "bond%d", NET_NAME_UNKNOWN, bond_setup, tx_queues); if (!bond_dev) goto out; bond = netdev_priv(bond_dev); dev_net_set(bond_dev, net); bond_dev->rtnl_link_ops = &bond_link_ops; res = register_netdevice(bond_dev); if (res < 0) { free_netdev(bond_dev); goto out; } netif_carrier_off(bond_dev); bond_work_init_all(bond); out: rtnl_unlock(); return res; } static int __net_init bond_net_init(struct net *net) { struct bond_net *bn = net_generic(net, bond_net_id); bn->net = net; INIT_LIST_HEAD(&bn->dev_list); bond_create_proc_dir(bn); bond_create_sysfs(bn); return 0; } /* According to commit 69b0216ac255 ("bonding: fix bonding_masters * race condition in bond unloading") we need to remove sysfs files * before we remove our devices (done later in bond_net_exit_rtnl()) */ static void __net_exit bond_net_pre_exit(struct net *net) { struct bond_net *bn = net_generic(net, bond_net_id); bond_destroy_sysfs(bn); } static void __net_exit bond_net_exit_rtnl(struct net *net, struct list_head *dev_kill_list) { struct bond_net *bn = net_generic(net, bond_net_id); struct bonding *bond, *tmp_bond; /* Kill off any bonds created after unregistering bond rtnl ops */ list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list) unregister_netdevice_queue(bond->dev, dev_kill_list); } /* According to commit 23fa5c2caae0 ("bonding: destroy proc directory * only after all bonds are gone") bond_destroy_proc_dir() is called * after bond_net_exit_rtnl() has completed. */ static void __net_exit bond_net_exit_batch(struct list_head *net_list) { struct bond_net *bn; struct net *net; list_for_each_entry(net, net_list, exit_list) { bn = net_generic(net, bond_net_id); bond_destroy_proc_dir(bn); } } static struct pernet_operations bond_net_ops = { .init = bond_net_init, .pre_exit = bond_net_pre_exit, .exit_rtnl = bond_net_exit_rtnl, .exit_batch = bond_net_exit_batch, .id = &bond_net_id, .size = sizeof(struct bond_net), }; static int __init bonding_init(void) { int i; int res; res = bond_check_params(&bonding_defaults); if (res) goto out; bond_create_debugfs(); res = register_pernet_subsys(&bond_net_ops); if (res) goto err_net_ops; res = bond_netlink_init(); if (res) goto err_link; for (i = 0; i < max_bonds; i++) { res = bond_create(&init_net, NULL); if (res) goto err; } skb_flow_dissector_init(&flow_keys_bonding, flow_keys_bonding_keys, ARRAY_SIZE(flow_keys_bonding_keys)); register_netdevice_notifier(&bond_netdev_notifier); out: return res; err: bond_netlink_fini(); err_link: unregister_pernet_subsys(&bond_net_ops); err_net_ops: bond_destroy_debugfs(); goto out; } static void __exit bonding_exit(void) { unregister_netdevice_notifier(&bond_netdev_notifier); bond_netlink_fini(); unregister_pernet_subsys(&bond_net_ops); bond_destroy_debugfs(); #ifdef CONFIG_NET_POLL_CONTROLLER /* Make sure we don't have an imbalance on our netpoll blocking */ WARN_ON(atomic_read(&netpoll_block_tx)); #endif } module_init(bonding_init); module_exit(bonding_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR("Thomas Davis, tadavis@lbl.gov and many others");
1 5 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 // SPDX-License-Identifier: GPL-2.0-or-later /* * xfrm_device.c - IPsec device offloading code. * * Copyright (c) 2015 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/errno.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/dst.h> #include <net/gso.h> #include <net/xfrm.h> #include <linux/notifier.h> #ifdef CONFIG_XFRM_OFFLOAD static void __xfrm_transport_prep(struct xfrm_state *x, struct sk_buff *skb, unsigned int hsize) { struct xfrm_offload *xo = xfrm_offload(skb); skb_reset_mac_len(skb); if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header -= x->props.header_len; pskb_pull(skb, skb_transport_offset(skb) + x->props.header_len); } static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb, unsigned int hsize) { struct xfrm_offload *xo = xfrm_offload(skb); if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + hsize; skb_reset_mac_len(skb); pskb_pull(skb, skb->mac_len + x->props.header_len - x->props.enc_hdr_len); } static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, unsigned int hsize) { struct xfrm_offload *xo = xfrm_offload(skb); int phlen = 0; if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + hsize; skb_reset_mac_len(skb); if (x->sel.family != AF_INET6) { phlen = IPV4_BEET_PHMAXLEN; if (x->outer_mode.family == AF_INET6) phlen += sizeof(struct ipv6hdr) - sizeof(struct iphdr); } pskb_pull(skb, skb->mac_len + hsize + (x->props.header_len - phlen)); } /* Adjust pointers into the packet when IPsec is done at layer2 */ static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) { switch (x->outer_mode.encap) { case XFRM_MODE_IPTFS: case XFRM_MODE_TUNNEL: if (x->outer_mode.family == AF_INET) return __xfrm_mode_tunnel_prep(x, skb, sizeof(struct iphdr)); if (x->outer_mode.family == AF_INET6) return __xfrm_mode_tunnel_prep(x, skb, sizeof(struct ipv6hdr)); break; case XFRM_MODE_TRANSPORT: if (x->outer_mode.family == AF_INET) return __xfrm_transport_prep(x, skb, sizeof(struct iphdr)); if (x->outer_mode.family == AF_INET6) return __xfrm_transport_prep(x, skb, sizeof(struct ipv6hdr)); break; case XFRM_MODE_BEET: if (x->outer_mode.family == AF_INET) return __xfrm_mode_beet_prep(x, skb, sizeof(struct iphdr)); if (x->outer_mode.family == AF_INET6) return __xfrm_mode_beet_prep(x, skb, sizeof(struct ipv6hdr)); break; case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: break; } } static inline bool xmit_xfrm_check_overflow(struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); __u32 seq = xo->seq.low; seq += skb_shinfo(skb)->gso_segs; if (unlikely(seq < xo->seq.low)) return true; return false; } struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { int err; unsigned long flags; struct xfrm_state *x; struct softnet_data *sd; struct sk_buff *skb2, *nskb, *pskb = NULL; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct net_device *dev = skb->dev; struct sec_path *sp; if (!xo || (xo->flags & XFRM_XMIT)) return skb; if (!(features & NETIF_F_HW_ESP)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); sp = skb_sec_path(skb); x = sp->xvec[sp->len - 1]; if (xo->flags & XFRM_GRO || x->xso.dir == XFRM_DEV_OFFLOAD_IN) return skb; /* The packet was sent to HW IPsec packet offload engine, * but to wrong device. Drop the packet, so it won't skip * XFRM stack. */ if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->xso.dev != dev) { kfree_skb(skb); dev_core_stats_tx_dropped_inc(dev); return NULL; } local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); err = !skb_queue_empty(&sd->xfrm_backlog); local_irq_restore(flags); if (err) { *again = true; return skb; } if (skb_is_gso(skb) && unlikely(xmit_xfrm_check_overflow(skb))) { struct sk_buff *segs; /* Packet got rerouted, fixup features and segment it. */ esp_features = esp_features & ~(NETIF_F_HW_ESP | NETIF_F_GSO_ESP); segs = skb_gso_segment(skb, esp_features); if (IS_ERR(segs)) { kfree_skb(skb); dev_core_stats_tx_dropped_inc(dev); return NULL; } else { consume_skb(skb); skb = segs; } } if (!skb->next) { esp_features |= skb->dev->gso_partial_features; xfrm_outer_mode_prep(x, skb); xo->flags |= XFRM_DEV_RESUME; err = x->type_offload->xmit(x, skb, esp_features); if (err) { if (err == -EINPROGRESS) return NULL; XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); kfree_skb(skb); return NULL; } skb_push(skb, skb->data - skb_mac_header(skb)); return skb; } skb_list_walk_safe(skb, skb2, nskb) { esp_features |= skb->dev->gso_partial_features; skb_mark_not_on_list(skb2); xo = xfrm_offload(skb2); xo->flags |= XFRM_DEV_RESUME; xfrm_outer_mode_prep(x, skb2); err = x->type_offload->xmit(x, skb2, esp_features); if (!err) { skb2->next = nskb; } else if (err != -EINPROGRESS) { XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); skb2->next = nskb; kfree_skb_list(skb2); return NULL; } else { if (skb == skb2) skb = nskb; else pskb->next = nskb; continue; } skb_push(skb2, skb2->data - skb_mac_header(skb2)); pskb = skb2; } return skb; } EXPORT_SYMBOL_GPL(validate_xmit_xfrm); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack) { int err; struct dst_entry *dst; struct net_device *dev; struct xfrm_dev_offload *xso = &x->xso; xfrm_address_t *saddr; xfrm_address_t *daddr; bool is_packet_offload; if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) { NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); return -EINVAL; } if ((xuo->flags & XFRM_OFFLOAD_INBOUND && x->dir == XFRM_SA_DIR_OUT) || (!(xuo->flags & XFRM_OFFLOAD_INBOUND) && x->dir == XFRM_SA_DIR_IN)) { NL_SET_ERR_MSG(extack, "Mismatched SA and offload direction"); return -EINVAL; } if (xuo->flags & XFRM_OFFLOAD_INBOUND && x->if_id) { NL_SET_ERR_MSG(extack, "XFRM if_id is not supported in RX path"); return -EINVAL; } is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET; /* We don't yet support TFC padding. */ if (x->tfcpad) { NL_SET_ERR_MSG(extack, "TFC padding can't be offloaded"); return -EINVAL; } dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { struct xfrm_dst_lookup_params params; if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { saddr = &x->props.saddr; daddr = &x->id.daddr; } else { saddr = &x->id.daddr; daddr = &x->props.saddr; } memset(&params, 0, sizeof(params)); params.net = net; params.saddr = saddr; params.daddr = daddr; params.mark = xfrm_smark_get(0, x); dst = __xfrm_dst_lookup(x->props.family, &params); if (IS_ERR(dst)) return (is_packet_offload) ? -EINVAL : 0; dev = dst->dev; dev_hold(dev); dst_release(dst); } if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { xso->dev = NULL; dev_put(dev); return (is_packet_offload) ? -EINVAL : 0; } if (!is_packet_offload && x->props.flags & XFRM_STATE_ESN && !dev->xfrmdev_ops->xdo_dev_state_advance_esn) { NL_SET_ERR_MSG(extack, "Device doesn't support offload with ESN"); xso->dev = NULL; dev_put(dev); return -EINVAL; } xfrm_set_type_offload(x); if (!x->type_offload) { NL_SET_ERR_MSG(extack, "Type doesn't support offload"); dev_put(dev); return -EINVAL; } xso->dev = dev; netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); if (xuo->flags & XFRM_OFFLOAD_INBOUND) xso->dir = XFRM_DEV_OFFLOAD_IN; else xso->dir = XFRM_DEV_OFFLOAD_OUT; if (is_packet_offload) xso->type = XFRM_DEV_OFFLOAD_PACKET; else xso->type = XFRM_DEV_OFFLOAD_CRYPTO; err = dev->xfrmdev_ops->xdo_dev_state_add(dev, x, extack); if (err) { xso->dev = NULL; xso->dir = 0; netdev_put(dev, &xso->dev_tracker); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; xfrm_unset_type_offload(x); /* User explicitly requested packet offload mode and configured * policy in addition to the XFRM state. So be civil to users, * and return an error instead of taking fallback path. */ if ((err != -EOPNOTSUPP && !is_packet_offload) || is_packet_offload) { NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this state"); return err; } } return 0; } EXPORT_SYMBOL_GPL(xfrm_dev_state_add); int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, struct xfrm_user_offload *xuo, u8 dir, struct netlink_ext_ack *extack) { struct xfrm_dev_offload *xdo = &xp->xdo; struct net_device *dev; int err; if (!xuo->flags || xuo->flags & ~XFRM_OFFLOAD_PACKET) { /* We support only packet offload mode and it means * that user must set XFRM_OFFLOAD_PACKET bit. */ NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); return -EINVAL; } dev = dev_get_by_index(net, xuo->ifindex); if (!dev) return -EINVAL; if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_policy_add) { xdo->dev = NULL; dev_put(dev); NL_SET_ERR_MSG(extack, "Policy offload is not supported"); return -EINVAL; } xdo->dev = dev; netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC); xdo->type = XFRM_DEV_OFFLOAD_PACKET; switch (dir) { case XFRM_POLICY_IN: xdo->dir = XFRM_DEV_OFFLOAD_IN; break; case XFRM_POLICY_OUT: xdo->dir = XFRM_DEV_OFFLOAD_OUT; break; case XFRM_POLICY_FWD: xdo->dir = XFRM_DEV_OFFLOAD_FWD; break; default: xdo->dev = NULL; netdev_put(dev, &xdo->dev_tracker); NL_SET_ERR_MSG(extack, "Unrecognized offload direction"); return -EINVAL; } err = dev->xfrmdev_ops->xdo_dev_policy_add(xp, extack); if (err) { xdo->dev = NULL; xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; xdo->dir = 0; netdev_put(dev, &xdo->dev_tracker); NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this policy"); return err; } return 0; } EXPORT_SYMBOL_GPL(xfrm_dev_policy_add); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { int mtu; struct dst_entry *dst = skb_dst(skb); struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct net_device *dev = x->xso.dev; bool check_tunnel_size; if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED) return false; if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) { mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) goto ok; if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) goto ok; } return false; ok: check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->props.mode == XFRM_MODE_TUNNEL; switch (x->props.family) { case AF_INET: /* Check for IPv4 options */ if (ip_hdr(skb)->ihl != 5) return false; if (check_tunnel_size && xfrm4_tunnel_check_size(skb)) return false; break; case AF_INET6: /* Check for IPv6 extensions */ if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr)) return false; if (check_tunnel_size && xfrm6_tunnel_check_size(skb)) return false; break; default: break; } if (dev->xfrmdev_ops->xdo_dev_offload_ok) return dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x); return true; } EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok); void xfrm_dev_resume(struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret = NETDEV_TX_BUSY; struct netdev_queue *txq; struct softnet_data *sd; unsigned long flags; rcu_read_lock(); txq = netdev_core_pick_tx(dev, skb, NULL); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) skb = dev_hard_start_xmit(skb, dev, txq, &ret); HARD_TX_UNLOCK(dev, txq); if (!dev_xmit_complete(ret)) { local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); skb_queue_tail(&sd->xfrm_backlog, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(xfrm_dev_resume); void xfrm_dev_backlog(struct softnet_data *sd) { struct sk_buff_head *xfrm_backlog = &sd->xfrm_backlog; struct sk_buff_head list; struct sk_buff *skb; if (skb_queue_empty(xfrm_backlog)) return; __skb_queue_head_init(&list); spin_lock(&xfrm_backlog->lock); skb_queue_splice_init(xfrm_backlog, &list); spin_unlock(&xfrm_backlog->lock); while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); xfrm_dev_resume(skb); } } #endif static int xfrm_api_check(struct net_device *dev) { #ifdef CONFIG_XFRM_OFFLOAD if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) && !(dev->features & NETIF_F_HW_ESP)) return NOTIFY_BAD; if ((dev->features & NETIF_F_HW_ESP) && (!(dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_state_add && dev->xfrmdev_ops->xdo_dev_state_delete))) return NOTIFY_BAD; #else if (dev->features & (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM)) return NOTIFY_BAD; #endif return NOTIFY_DONE; } static int xfrm_dev_down(struct net_device *dev) { if (dev->features & NETIF_F_HW_ESP) { xfrm_dev_state_flush(dev_net(dev), dev, true); xfrm_dev_policy_flush(dev_net(dev), dev, true); } return NOTIFY_DONE; } static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REGISTER: return xfrm_api_check(dev); case NETDEV_FEAT_CHANGE: return xfrm_api_check(dev); case NETDEV_DOWN: case NETDEV_UNREGISTER: return xfrm_dev_down(dev); } return NOTIFY_DONE; } static struct notifier_block xfrm_dev_notifier = { .notifier_call = xfrm_dev_event, }; void __init xfrm_dev_init(void) { register_netdevice_notifier(&xfrm_dev_notifier); }
69 69 223 224 224 218 18 224 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/kthread.h> #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> #include <linux/posix-timers.h> #include <linux/cn_proc.h> #include <linux/mutex.h> #include <linux/futex.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> #include <linux/kmsan.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> #include <linux/rethook.h> #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/uaccess.h> #include <linux/pidfs.h> #include <uapi/linux/wait.h> #include <asm/unistd.h> #include <asm/mmu_context.h> #include "exit.h" /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit * overflowing 32-bit refcounts or the ldsem writer count. */ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL static const struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, .maxlen = sizeof(oops_limit), .mode = 0644, .proc_handler = proc_douintvec, }, }; static __init int kernel_exit_sysctls_init(void) { register_sysctl_init("kernel", kern_exit_table); return 0; } late_initcall(kernel_exit_sysctls_init); #endif static atomic_t oops_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); } static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); static __init int kernel_exit_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); return 0; } late_initcall(kernel_exit_sysfs_init); #endif /* * For things release_task() would like to do *after* tasklist_lock is released. */ struct release_task_post { struct pid *pids[PIDTYPE_MAX]; }; static void __unhash_process(struct release_task_post *post, struct task_struct *p, bool group_dead) { struct pid *pid = task_pid(p); nr_threads--; detach_pid(post->pids, p, PIDTYPE_PID); wake_up_all(&pid->wait_pidfd); if (group_dead) { detach_pid(post->pids, p, PIDTYPE_TGID); detach_pid(post->pids, p, PIDTYPE_PGID); detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. */ static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) posix_cpu_timers_exit_group(tsk); #endif if (group_dead) { tty = sig->tty; sig->tty = NULL; } else { /* * If there is any task waiting for the group exit * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); } /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, * but we want to avoid the race with thread_group_cputime() which can * see the empty ->thread_head list. */ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(post, tsk, group_dead); write_sequnlock(&sig->stats_lock); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); if (group_dead) tty_kref_put(tty); } static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } void put_task_struct_rcu_user(struct task_struct *task) { if (refcount_dec_and_test(&task->rcu_users)) call_rcu(&task->rcu, delayed_put_task_struct); } void __weak release_thread(struct task_struct *dead_task) { } void release_task(struct task_struct *p) { struct release_task_post post; struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: memset(&post, 0, sizeof(post)); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); pidfs_exit(p); cgroup_release(p); /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ thread_pid = task_pid(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); __exit_signal(&post, p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the * group leader's parent process. (if it wants notification.) */ zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { /* for pidfs_exit() and do_notify_parent() */ if (leader->signal->flags & SIGNAL_GROUP_EXIT) leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. */ zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); release_thread(p); /* * This task was already removed from the process/thread/pid lists * and lock_task_sighand(p) can't succeed. Nobody else can touch * ->pending or, if group dead, signal->shared_pending. We can call * flush_sigqueue() lockless. */ flush_sigqueue(&p->pending); if (thread_group_leader(p)) flush_sigqueue(&p->signal->shared_pending); put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) goto repeat; } int rcuwait_wake_up(struct rcuwait *w) { int ret = 0; struct task_struct *task; rcu_read_lock(); /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE * [S] tsk = current [S] cond = true * MB (A) MB (B) * [L] cond [L] tsk */ smp_mb(); /* (B) */ task = rcu_dereference(w->task); if (task) ret = wake_up_process(task); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are * to receive a SIGHUP and a SIGCONT. * * "I ask you, have you ever known what it is to be an orphan?" */ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if ((p == ignored_task) || (p->exit_state && thread_group_empty(p)) || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return 1; } int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; } static bool has_stopped_jobs(struct pid *pgrp) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->signal->flags & SIGNAL_STOP_STOPPED) return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return false; } /* * Check to see if any process groups have become orphaned as * a result of our exiting, and if they have any stopped jobs, * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) { struct pid *pgrp = task_pgrp(tsk); struct task_struct *ignored_task = tsk; if (!parent) /* exit: our father is in a different pgrp than * we are and we were the only connection outside. */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than * we are, and it was the only connection outside. */ ignored_task = NULL; if (task_pgrp(parent) != pgrp && task_session(parent) == task_session(tsk) && will_become_orphaned_pgrp(pgrp, ignored_task) && has_stopped_jobs(pgrp)) { __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } static void coredump_task_exit(struct task_struct *tsk, struct core_state *core_state) { struct core_thread self; self.task = tsk; if (self.task->flags & PF_SIGNALED) self.next = xchg(&core_state->dumper.next, &self); else self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. */ if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); for (;;) { set_current_state(TASK_IDLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; schedule(); } __set_current_state(TASK_RUNNING); } #ifdef CONFIG_MEMCG /* drops tasklist_lock if succeeds */ static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) { bool ret = false; task_lock(tsk); if (likely(tsk->mm == mm)) { /* tsk can't pass exit_mm/exec_mmap and exit */ read_unlock(&tasklist_lock); WRITE_ONCE(mm->owner, tsk); lru_gen_migrate_mm(mm); ret = true; } task_unlock(tsk); return ret; } static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) { struct task_struct *t; for_each_thread(g, t) { struct mm_struct *t_mm = READ_ONCE(t->mm); if (t_mm == mm) { if (__try_to_set_owner(t, mm)) return true; } else if (t_mm) break; } return false; } /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *g, *p = current; /* * If the exiting or execing task is not the owner, it's * someone else's problem. */ if (mm->owner != p) return; /* * The current owner is exiting/execing and there are no other * candidates. Do not leave the mm pointing to a possibly * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { WRITE_ONCE(mm->owner, NULL); return; } read_lock(&tasklist_lock); /* * Search in the children */ list_for_each_entry(g, &p->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search in the siblings */ list_for_each_entry(g, &p->real_parent->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search through everything else, we should not get here often. */ for_each_process(g) { if (atomic_read(&mm->mm_users) <= 1) break; if (g->flags & PF_KTHREAD) continue; if (try_to_set_owner(g, mm)) goto ret; } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ WRITE_ONCE(mm->owner, NULL); ret: return; } #endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we * aren't already.. */ static void exit_mm(void) { struct mm_struct *mm = current->mm; exit_mm_release(current, mm); if (!mm) return; mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); /* * When a thread stops operating on an address space, the loop * in membarrier_private_expedited() may not observe that * tsk->mm, and the loop in membarrier_global_expedited() may * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED * rq->membarrier_state, so those would not issue an IPI. * Membarrier requires a memory barrier after accessing * user-space memory, before clearing tsk->mm or the * rq->membarrier_state. */ smp_mb__after_spinlock(); local_irq_disable(); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); local_irq_enable(); task_unlock(current); mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) { struct task_struct *t; for_each_thread(p, t) { if (!(t->flags & PF_EXITING)) return t; } return NULL; } static struct task_struct *find_child_reaper(struct task_struct *father, struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; struct task_struct *p, *n; if (likely(reaper != father)) return reaper; reaper = find_alive_thread(father); if (reaper) { pid_ns->child_reaper = reaper; return reaper; } write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); return father; } /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists * 2. give it to the first ancestor process which prctl'd itself as a * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father, struct task_struct *child_reaper) { struct task_struct *thread, *reaper; thread = find_alive_thread(father); if (thread) return thread; if (father->signal->has_child_subreaper) { unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. * We can't check reaper != child_reaper to ensure we do not * cross the namespaces, the exiting parent could be injected * by setns() + fork(). * We check pid->level, this is slightly more efficient than * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ for (reaper = father->real_parent; task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; thread = find_alive_thread(reaper); if (thread) return thread; } } return child_reaper; } /* * Any that need to be release_task'd are put on the @dead list. */ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } /* * This does two things: * * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) { struct task_struct *p, *t, *reaper; if (unlikely(!list_empty(&father->ptraced))) exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { RCU_INIT_POINTER(t->real_parent, reaper); BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t, PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. */ if (!same_thread_group(reaper, father)) reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); } /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; struct task_struct *p, *n; LIST_HEAD(dead); write_lock_irq(&tasklist_lock); forget_original_parent(tsk, &dead); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && !ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD; autoreap = do_notify_parent(tsk, sig); } else if (thread_group_leader(tsk)) { autoreap = thread_group_empty(tsk) && do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; /* untraced sub-thread */ do_notify_pidfd(tsk); } if (autoreap) { tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } } #ifdef CONFIG_DEBUG_STACK_USAGE unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ # ifdef CONFIG_STACK_GROWSUP n--; # else n++; # endif } while (!*n); # ifdef CONFIG_STACK_GROWSUP return (unsigned long)end_of_stack(p) - (unsigned long)n; # else return (unsigned long)n - (unsigned long)end_of_stack(p); # endif } /* Count the maximum pages reached in kernel stacks */ static inline void kstack_histogram(unsigned long used_stack) { #ifdef CONFIG_VM_EVENT_COUNTERS if (used_stack <= 1024) count_vm_event(KSTACK_1K); #if THREAD_SIZE > 1024 else if (used_stack <= 2048) count_vm_event(KSTACK_2K); #endif #if THREAD_SIZE > 2048 else if (used_stack <= 4096) count_vm_event(KSTACK_4K); #endif #if THREAD_SIZE > 4096 else if (used_stack <= 8192) count_vm_event(KSTACK_8K); #endif #if THREAD_SIZE > 8192 else if (used_stack <= 16384) count_vm_event(KSTACK_16K); #endif #if THREAD_SIZE > 16384 else if (used_stack <= 32768) count_vm_event(KSTACK_32K); #endif #if THREAD_SIZE > 32768 else if (used_stack <= 65536) count_vm_event(KSTACK_64K); #endif #if THREAD_SIZE > 65536 else count_vm_event(KSTACK_REST); #endif #endif /* CONFIG_VM_EVENT_COUNTERS */ } static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; free = stack_not_used(current); kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); } #else static inline void check_stack_usage(void) {} #endif static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; struct core_state *core_state; spin_lock_irq(&sighand->siglock); signal->quick_threads--; if ((signal->quick_threads == 0) && !(signal->flags & SIGNAL_GROUP_EXIT)) { signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = code; signal->group_stop_count = 0; } /* * Serialize with any possible pending coredump. * We must hold siglock around checking core_state * and setting PF_POSTCOREDUMP. The core-inducing thread * will increment ->nr_threads for each thread in the * group without PF_POSTCOREDUMP set. */ tsk->flags |= PF_POSTCOREDUMP; core_state = signal->core_state; spin_unlock_irq(&sighand->siglock); if (unlikely(core_state)) coredump_task_exit(tsk, core_state); } void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; WARN_ON(irqs_disabled()); WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); synchronize_group_exit(tsk, code); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { /* * If the last thread of global init has exited, panic * immediately to get a useable coredump. */ if (unlikely(is_global_init(tsk))) panic("Attempted to kill init! exitcode=0x%08x\n", tsk->signal->group_exit_code ?: (int)code); #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); trace_sched_process_exit(tsk, group_dead); /* * Since sampling can touch ->mm, make sure to stop everything before we * tear it down. * * Also flushes inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ perf_event_exit_task(tsk); exit_mm(); if (group_dead) acct_process(); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); exit_fs(tsk); if (group_dead) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(); if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); exit_tasks_rcu_finish(); lockdep_free_task(tsk); do_task_dead(); } void __noreturn make_task_dead(int signr) { /* * Take the task off the cpu after something catastrophic has * happened. * * We can get here from a kernel oops, sometimes with preemption off. * Start by checking for critical errors. * Then fix up important state like USER_DS and preemption. * Then do everything else. */ struct task_struct *tsk = current; unsigned int limit; if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); if (unlikely(irqs_disabled())) { pr_info("note: %s[%d] exited with irqs disabled\n", current->comm, task_pid_nr(current)); local_irq_enable(); } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); preempt_count_set(PREEMPT_ENABLED); } /* * Every time the system oopses, if the oops happens while a reference * to an object was held, the reference leaks. * If the oops doesn't also leak memory, repeated oopsing can cause * reference counters to wrap around (if they're not using refcount_t). * This means that repeated oopsing can make unexploitable-looking bugs * exploitable through repeated oopsing. * To make sure this can't happen, place an upper bound on how often the * kernel may oops without panic(). */ limit = READ_ONCE(oops_limit); if (atomic_inc_return(&oops_count) >= limit && limit) panic("Oopsed too often (kernel.oops_limit is %d)", limit); /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); do_task_dead(); } do_exit(signr); } SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); } do_exit(exit_code); /* NOTREACHED */ } /* * this kills every thread in the thread group. Note that any externally * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ return 0; } static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; } static int eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; /* * Wait for all children (clone and not) if __WALL is set or * if it is traced by us. */ if (ptrace || (wo->wo_flags & __WALL)) return 1; /* * Otherwise, wait for clone children *only* if __WCLONE is set; * otherwise, wait for non-clone children *only*. * * Note: a "clone" child here is one that reports to its parent * using a signal other than SIGCHLD, or a non-leader thread which * we can only see if it is traced by us. */ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; } /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ state = (ptrace_reparented(p) && thread_group_leader(p)) ? EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* * We own this thread, nobody else can reap it. */ read_unlock(&tasklist_lock); sched_annotate_sleep(); /* * Check thread_group_leader() to exclude the traced sub-threads. */ if (state == EXIT_DEAD && thread_group_leader(p)) { struct signal_struct *sig = p->signal; struct signal_struct *psig = current->signal; unsigned long maxrss; u64 tgutime, tgstime; /* * The resource counters for the group leader are in its * own task_struct. Those for dead threads in the group * are in its signal_struct, as are those for the child * processes it has previously reaped. All these * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these * p->signal fields because the whole thread group is dead * and nobody can change them. * * psig->stats_lock also protects us from our sub-threads * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += p->maj_flt + sig->maj_flt + sig->cmaj_flt; psig->cnvcsw += p->nvcsw + sig->nvcsw + sig->cnvcsw; psig->cnivcsw += p->nivcsw + sig->nivcsw + sig->cnivcsw; psig->cinblock += task_io_get_inblock(p) + sig->inblock + sig->cinblock; psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; maxrss = max(sig->maxrss, sig->cmaxrss); if (psig->cmaxrss < maxrss) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* If parent wants a zombie, don't release it now */ state = EXIT_ZOMBIE; if (do_notify_parent(p, p->exit_signal)) state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } if (state == EXIT_DEAD) release_task(p); out_info: infop = wo->wo_info; if (infop) { if ((status & 0x7f) == 0) { infop->cause = CLD_EXITED; infop->status = status >> 8; } else { infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; infop->status = status & 0x7f; } infop->pid = pid; infop->uid = uid; } return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) return &p->signal->group_exit_code; } return NULL; } /** * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED * @wo: wait options * @ptrace: is the wait for ptrace * @p: task to wait for * * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. * * CONTEXT: * read_lock(&tasklist_lock), which is released if return value is * non-zero. Also, grabs and releases @p->sighand->siglock. * * RETURNS: * 0 if wait condition didn't exist and search for other wait conditions * should continue. Non-zero return, -errno on failure and @p's pid on * success, implies that tasklist_lock is released and wait condition * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { struct waitid_info *infop; int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; /* * Traditionally we see ptrace'd stopped tasks regardless of options. */ if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; if (!task_stopped_code(p, ptrace)) return 0; exit_code = 0; spin_lock_irq(&p->sighand->siglock); p_code = task_stopped_code(p, ptrace); if (unlikely(!p_code)) goto unlock_sig; exit_code = *p_code; if (!exit_code) goto unlock_sig; if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) return 0; /* * Now we are pretty sure this task is interesting. * Make sure it doesn't get reaped out from under us while we * give up the lock and then examine it below. We don't want to * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ get_task_struct(p); pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); if (likely(!(wo->wo_flags & WNOWAIT))) wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { infop->cause = why; infop->status = exit_code; infop->pid = pid; infop->uid = uid; } return pid; } /* * Handle do_wait work for one task in a live, non-stopped state. * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { struct waitid_info *infop; pid_t pid; uid_t uid; if (!unlikely(wo->wo_flags & WCONTINUED)) return 0; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; spin_lock_irq(&p->sighand->siglock); /* Re-check with the lock held. */ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { spin_unlock_irq(&p->sighand->siglock); return 0; } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; if (!infop) { wo->wo_stat = 0xffff; } else { infop->cause = CLD_CONTINUED; infop->pid = pid; infop->uid = uid; infop->status = SIGCONT; } return pid; } /* * Consider @p for a wait by @parent. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { /* * We can race with wait_task_zombie() from another thread. * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, ptrace, p); if (!ret) return ret; if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. */ if (likely(!ptrace)) wo->notask_error = 0; return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { /* * If it is traced by its real parent's group, just pretend * the caller is ptrace_do_wait() and reap this child if it * is zombie. * * This also hides group stop state from real parent; otherwise * a single stop can be reported twice as group and ptrace stop. * If a ptracer wants to distinguish these two events for its * own children it should create a separate process which takes * the role of real parent. */ if (!ptrace_reparented(p)) ptrace = 1; } /* slay zombie? */ if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* * A zombie ptracee is only visible to its ptracer. * Notification and reaping will be cascaded to the * real parent when the ptracer detaches. */ if (unlikely(ptrace) || likely(!p->ptrace)) return wait_task_zombie(wo, p); } /* * Allow access to stopped/continued state via zombie by * falling through. Clearing of notask_error is complex. * * When !@ptrace: * * If WEXITED is set, notask_error should naturally be * cleared. If not, subset of WSTOPPED|WCONTINUED is set, * so, if there are live subthreads, there are events to * wait for. If all subthreads are dead, it's still safe * to clear - this function will be called again in finite * amount time once all the subthreads are released and * will then return without clearing. * * When @ptrace: * * Stopped state is per-task and thus can't change once the * target task dies. Only continued and exited can happen. * Clear notask_error if WCONTINUED | WEXITED. */ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ wo->notask_error = 0; } /* * Wait for stopped. Depending on @ptrace, different stopped state * is used and the two don't interact with each other. */ ret = wait_task_stopped(wo, ptrace, p); if (ret) return ret; /* * Wait for continued. There's only one continued state and the * ptracer can consume it which can confuse the real parent. Don't * use WCONTINUED from ptracer. You don't need or want it. */ return wait_task_continued(wo, p); } /* * Do the work of do_wait() for one thread in the group, @tsk. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } return 0; } static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } return 0; } bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) { if (!eligible_pid(wo, p)) return false; if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) return false; return true; } static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct task_struct *p = key; if (pid_child_should_wake(wo, p)) return default_wake_function(wait, mode, sync, key); return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, TASK_INTERRUPTIBLE, p); } static bool is_effectively_child(struct wait_opts *wo, bool ptrace, struct task_struct *target) { struct task_struct *parent = !ptrace ? target->real_parent : target->parent; return current == parent || (!(wo->wo_flags & __WNOTHREAD) && same_thread_group(current, parent)); } /* * Optimization for waiting on PIDTYPE_PID. No need to iterate through child * and tracee lists to find the target task. */ static int do_wait_pid(struct wait_opts *wo) { bool ptrace; struct task_struct *target; int retval; ptrace = false; target = pid_task(wo->wo_pid, PIDTYPE_TGID); if (target && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } ptrace = true; target = pid_task(wo->wo_pid, PIDTYPE_PID); if (target && target->ptrace && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } return 0; } long __do_wait(struct wait_opts *wo) { long retval; /* * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet. */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; read_lock(&tasklist_lock); if (wo->wo_type == PIDTYPE_PID) { retval = do_wait_pid(wo); if (retval) return retval; } else { struct task_struct *tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) return retval; retval = ptrace_do_wait(wo, tsk); if (retval) return retval; if (wo->wo_flags & __WNOTHREAD) break; } while_each_thread(current, tsk); } read_unlock(&tasklist_lock); notask: retval = wo->notask_error; if (!retval && !(wo->wo_flags & WNOHANG)) return -ERESTARTSYS; return retval; } static long do_wait(struct wait_opts *wo) { int retval; trace_sched_process_wait(wo->wo_pid); init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); wo->child_wait.private = current; add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); do { set_current_state(TASK_INTERRUPTIBLE); retval = __do_wait(wo); if (retval != -ERESTARTSYS) break; if (signal_pending(current)) break; schedule(); } while (1); __set_current_state(TASK_RUNNING); remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); return retval; } int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; switch (which) { case P_ALL: type = PIDTYPE_MAX; break; case P_PID: type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; if (upid < 0) return -EINVAL; if (upid) pid = find_get_pid(upid); else pid = get_task_pid(current, PIDTYPE_PGID); break; case P_PIDFD: type = PIDTYPE_PID; if (upid < 0) return -EINVAL; pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); break; default: return -EINVAL; } wo->wo_type = type; wo->wo_pid = pid; wo->wo_flags = options; wo->wo_info = infop; wo->wo_rusage = ru; if (f_flags & O_NONBLOCK) wo->wo_flags |= WNOHANG; return 0; } static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; long ret; ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); if (ret) return ret; ret = do_wait(&wo); if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) ret = -EAGAIN; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } long kernel_wait4(pid_t upid, int __user *stat_addr, int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; /* -INT_MIN is not defined */ if (upid == INT_MIN) return -ESRCH; if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { type = PIDTYPE_PGID; pid = find_get_pid(-upid); } else if (upid == 0) { type = PIDTYPE_PGID; pid = get_task_pid(current, PIDTYPE_PGID); } else /* upid > 0 */ { type = PIDTYPE_PID; pid = find_get_pid(upid); } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) ret = -EFAULT; return ret; } int kernel_wait(pid_t pid, int *stat) { struct wait_opts wo = { .wo_type = PIDTYPE_PID, .wo_pid = find_get_pid(pid), .wo_flags = WEXITED, }; int ret; ret = do_wait(&wo); if (ret > 0 && wo.wo_stat) *stat = wo.wo_stat; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { struct rusage r; long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } return err; } #ifdef __ARCH_WANT_SYS_WAITPID /* * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return kernel_wait4(pid, stat_addr, options, NULL); } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, int, options, struct compat_rusage __user *, ru) { struct rusage r; long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && put_compat_rusage(&r, ru)) return -EFAULT; } return err; } COMPAT_SYSCALL_DEFINE5(waitid, int, which, compat_pid_t, pid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (uru) { /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) err = copy_to_user(uru, &ru, sizeof(ru)); else err = put_compat_rusage(&ru, uru); if (err) return -EFAULT; } } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } #endif /* * This needs to be __function_aligned as GCC implicitly makes any * implementation of abort() cold and drops alignment specified by * -falign-functions=N. * * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11 */ __weak __function_aligned void abort(void) { BUG(); /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } EXPORT_SYMBOL(abort);
58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_KSM_H #define __LINUX_KSM_H /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork(). */ #include <linux/bitops.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/sched.h> #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); void ksm_add_vma(struct vm_area_struct *vma); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); int ksm_disable(struct mm_struct *mm); int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); /* * To identify zeropages that were mapped by KSM, we reuse the dirty bit * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when * deduplicating memory. */ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) extern atomic_long_t ksm_zero_pages; static inline void ksm_map_zero_page(struct mm_struct *mm) { atomic_long_inc(&ksm_zero_pages); atomic_long_inc(&mm->ksm_zero_pages); } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { if (is_ksm_zero_pte(pte)) { atomic_long_dec(&ksm_zero_pages); atomic_long_dec(&mm->ksm_zero_pages); } } static inline long mm_ksm_zero_pages(struct mm_struct *mm) { return atomic_long_read(&mm->ksm_zero_pages); } static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) __ksm_enter(mm); } static inline int ksm_execve(struct mm_struct *mm) { if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return __ksm_enter(mm); return 0; } static inline void ksm_exit(struct mm_struct *mm) { if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) __ksm_exit(mm); } /* * When do_swap_page() first faults in from swap what used to be a KSM page, * no problem, it will be assigned to this vma's anon_vma; but thereafter, * it might be faulted into a different anon_vma (or perhaps to a different * offset in the same anon_vma). do_swap_page() cannot do all the locking * needed to reconstitute a cross-anon_vma KSM page: for now it has to make * a copy, and leave remerging the pages to a later pass of ksmd. * * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? */ struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ static inline void ksm_add_vma(struct vm_area_struct *vma) { } static inline int ksm_disable(struct mm_struct *mm) { return 0; } static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { } static inline int ksm_execve(struct mm_struct *mm) { return 0; } static inline void ksm_exit(struct mm_struct *mm) { } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } static inline void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { } #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { return 0; } static inline struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { return folio; } static inline void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { } static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old) { } #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ #endif /* __LINUX_KSM_H */
225 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fs.h> #define DEVCG_ACC_MKNOD 1 #define DEVCG_ACC_READ 2 #define DEVCG_ACC_WRITE 4 #define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE) #define DEVCG_DEV_BLOCK 1 #define DEVCG_DEV_CHAR 2 #define DEVCG_DEV_ALL 4 /* this represents all devices */ #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) int devcgroup_check_permission(short type, u32 major, u32 minor, short access); static inline int devcgroup_inode_permission(struct inode *inode, int mask) { short type, access = 0; if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))) return 0; if (likely(!inode->i_rdev)) return 0; if (S_ISBLK(inode->i_mode)) type = DEVCG_DEV_BLOCK; else /* S_ISCHR by the test above */ type = DEVCG_DEV_CHAR; if (mask & MAY_WRITE) access |= DEVCG_ACC_WRITE; if (mask & MAY_READ) access |= DEVCG_ACC_READ; return devcgroup_check_permission(type, imajor(inode), iminor(inode), access); } static inline int devcgroup_inode_mknod(int mode, dev_t dev) { short type; if (!S_ISBLK(mode) && !S_ISCHR(mode)) return 0; if (S_ISCHR(mode) && dev == WHITEOUT_DEV) return 0; if (S_ISBLK(mode)) type = DEVCG_DEV_BLOCK; else type = DEVCG_DEV_CHAR; return devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), DEVCG_ACC_MKNOD); } #else static inline int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { return 0; } static inline int devcgroup_inode_permission(struct inode *inode, int mask) { return 0; } static inline int devcgroup_inode_mknod(int mode, dev_t dev) { return 0; } #endif
1452 1453 1455 1451 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BH_H #define _LINUX_BH_H #include <linux/instruction_pointer.h> #include <linux/preempt.h> #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS) extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); #else static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { preempt_count_add(cnt); barrier(); } #endif static inline void local_bh_disable(void) { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } extern void _local_bh_enable(void); extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt); static inline void local_bh_enable_ip(unsigned long ip) { __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET); } static inline void local_bh_enable(void) { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } #ifdef CONFIG_PREEMPT_RT extern bool local_bh_blocked(void); #else static inline bool local_bh_blocked(void) { return false; } #endif #endif /* _LINUX_BH_H */
535 86 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 // SPDX-License-Identifier: GPL-2.0-only /* * linux/drivers/clocksource/arm_arch_timer.c * * Copyright (C) 2011 ARM Ltd. * All Rights Reserved */ #define pr_fmt(fmt) "arch_timer: " fmt #include <linux/init.h> #include <linux/kernel.h> #include <linux/device.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/cpu_pm.h> #include <linux/clockchips.h> #include <linux/clocksource.h> #include <linux/clocksource_ids.h> #include <linux/interrupt.h> #include <linux/kstrtox.h> #include <linux/of_irq.h> #include <linux/of_address.h> #include <linux/io.h> #include <linux/slab.h> #include <linux/sched/clock.h> #include <linux/sched_clock.h> #include <linux/acpi.h> #include <linux/arm-smccc.h> #include <linux/ptp_kvm.h> #include <asm/arch_timer.h> #include <asm/virt.h> #include <clocksource/arm_arch_timer.h> #define CNTTIDR 0x08 #define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) #define CNTACR(n) (0x40 + ((n) * 4)) #define CNTACR_RPCT BIT(0) #define CNTACR_RVCT BIT(1) #define CNTACR_RFRQ BIT(2) #define CNTACR_RVOFF BIT(3) #define CNTACR_RWVT BIT(4) #define CNTACR_RWPT BIT(5) #define CNTPCT_LO 0x00 #define CNTVCT_LO 0x08 #define CNTFRQ 0x10 #define CNTP_CVAL_LO 0x20 #define CNTP_CTL 0x2c #define CNTV_CVAL_LO 0x30 #define CNTV_CTL 0x3c /* * The minimum amount of time a generic counter is guaranteed to not roll over * (40 years) */ #define MIN_ROLLOVER_SECS (40ULL * 365 * 24 * 3600) static unsigned arch_timers_present __initdata; struct arch_timer { void __iomem *base; struct clock_event_device evt; }; static struct arch_timer *arch_timer_mem __ro_after_init; #define to_arch_timer(e) container_of(e, struct arch_timer, evt) static u32 arch_timer_rate __ro_after_init; static int arch_timer_ppi[ARCH_TIMER_MAX_TIMER_PPI] __ro_after_init; static const char *arch_timer_ppi_names[ARCH_TIMER_MAX_TIMER_PPI] = { [ARCH_TIMER_PHYS_SECURE_PPI] = "sec-phys", [ARCH_TIMER_PHYS_NONSECURE_PPI] = "phys", [ARCH_TIMER_VIRT_PPI] = "virt", [ARCH_TIMER_HYP_PPI] = "hyp-phys", [ARCH_TIMER_HYP_VIRT_PPI] = "hyp-virt", }; static struct clock_event_device __percpu *arch_timer_evt; static enum arch_timer_ppi_nr arch_timer_uses_ppi __ro_after_init = ARCH_TIMER_VIRT_PPI; static bool arch_timer_c3stop __ro_after_init; static bool arch_timer_mem_use_virtual __ro_after_init; static bool arch_counter_suspend_stop __ro_after_init; #ifdef CONFIG_GENERIC_GETTIMEOFDAY static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_ARCHTIMER; #else static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_NONE; #endif /* CONFIG_GENERIC_GETTIMEOFDAY */ static cpumask_t evtstrm_available = CPU_MASK_NONE; static bool evtstrm_enable __ro_after_init = IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM); static int __init early_evtstrm_cfg(char *buf) { return kstrtobool(buf, &evtstrm_enable); } early_param("clocksource.arm_arch_timer.evtstrm", early_evtstrm_cfg); /* * Makes an educated guess at a valid counter width based on the Generic Timer * specification. Of note: * 1) the system counter is at least 56 bits wide * 2) a roll-over time of not less than 40 years * * See 'ARM DDI 0487G.a D11.1.2 ("The system counter")' for more details. */ static int arch_counter_get_width(void) { u64 min_cycles = MIN_ROLLOVER_SECS * arch_timer_rate; /* guarantee the returned width is within the valid range */ return clamp_val(ilog2(min_cycles - 1) + 1, 56, 64); } /* * Architected system timer support. */ static __always_inline void arch_timer_reg_write(int access, enum arch_timer_reg reg, u64 val, struct clock_event_device *clk) { if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { struct arch_timer *timer = to_arch_timer(clk); switch (reg) { case ARCH_TIMER_REG_CTRL: writel_relaxed((u32)val, timer->base + CNTP_CTL); break; case ARCH_TIMER_REG_CVAL: /* * Not guaranteed to be atomic, so the timer * must be disabled at this point. */ writeq_relaxed(val, timer->base + CNTP_CVAL_LO); break; default: BUILD_BUG(); } } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { struct arch_timer *timer = to_arch_timer(clk); switch (reg) { case ARCH_TIMER_REG_CTRL: writel_relaxed((u32)val, timer->base + CNTV_CTL); break; case ARCH_TIMER_REG_CVAL: /* Same restriction as above */ writeq_relaxed(val, timer->base + CNTV_CVAL_LO); break; default: BUILD_BUG(); } } else { arch_timer_reg_write_cp15(access, reg, val); } } static __always_inline u32 arch_timer_reg_read(int access, enum arch_timer_reg reg, struct clock_event_device *clk) { u32 val; if (access == ARCH_TIMER_MEM_PHYS_ACCESS) { struct arch_timer *timer = to_arch_timer(clk); switch (reg) { case ARCH_TIMER_REG_CTRL: val = readl_relaxed(timer->base + CNTP_CTL); break; default: BUILD_BUG(); } } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) { struct arch_timer *timer = to_arch_timer(clk); switch (reg) { case ARCH_TIMER_REG_CTRL: val = readl_relaxed(timer->base + CNTV_CTL); break; default: BUILD_BUG(); } } else { val = arch_timer_reg_read_cp15(access, reg); } return val; } static noinstr u64 raw_counter_get_cntpct_stable(void) { return __arch_counter_get_cntpct_stable(); } static notrace u64 arch_counter_get_cntpct_stable(void) { u64 val; preempt_disable_notrace(); val = __arch_counter_get_cntpct_stable(); preempt_enable_notrace(); return val; } static noinstr u64 arch_counter_get_cntpct(void) { return __arch_counter_get_cntpct(); } static noinstr u64 raw_counter_get_cntvct_stable(void) { return __arch_counter_get_cntvct_stable(); } static notrace u64 arch_counter_get_cntvct_stable(void) { u64 val; preempt_disable_notrace(); val = __arch_counter_get_cntvct_stable(); preempt_enable_notrace(); return val; } static noinstr u64 arch_counter_get_cntvct(void) { return __arch_counter_get_cntvct(); } /* * Default to cp15 based access because arm64 uses this function for * sched_clock() before DT is probed and the cp15 method is guaranteed * to exist on arm64. arm doesn't use this before DT is probed so even * if we don't have the cp15 accessors we won't have a problem. */ u64 (*arch_timer_read_counter)(void) __ro_after_init = arch_counter_get_cntvct; EXPORT_SYMBOL_GPL(arch_timer_read_counter); static u64 arch_counter_read(struct clocksource *cs) { return arch_timer_read_counter(); } static u64 arch_counter_read_cc(const struct cyclecounter *cc) { return arch_timer_read_counter(); } static struct clocksource clocksource_counter = { .name = "arch_sys_counter", .id = CSID_ARM_ARCH_COUNTER, .rating = 400, .read = arch_counter_read, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static struct cyclecounter cyclecounter __ro_after_init = { .read = arch_counter_read_cc, }; struct ate_acpi_oem_info { char oem_id[ACPI_OEM_ID_SIZE + 1]; char oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; u32 oem_revision; }; #ifdef CONFIG_FSL_ERRATUM_A008585 /* * The number of retries is an arbitrary value well beyond the highest number * of iterations the loop has been observed to take. */ #define __fsl_a008585_read_reg(reg) ({ \ u64 _old, _new; \ int _retries = 200; \ \ do { \ _old = read_sysreg(reg); \ _new = read_sysreg(reg); \ _retries--; \ } while (unlikely(_old != _new) && _retries); \ \ WARN_ON_ONCE(!_retries); \ _new; \ }) static u64 notrace fsl_a008585_read_cntpct_el0(void) { return __fsl_a008585_read_reg(cntpct_el0); } static u64 notrace fsl_a008585_read_cntvct_el0(void) { return __fsl_a008585_read_reg(cntvct_el0); } #endif #ifdef CONFIG_HISILICON_ERRATUM_161010101 /* * Verify whether the value of the second read is larger than the first by * less than 32 is the only way to confirm the value is correct, so clear the * lower 5 bits to check whether the difference is greater than 32 or not. * Theoretically the erratum should not occur more than twice in succession * when reading the system counter, but it is possible that some interrupts * may lead to more than twice read errors, triggering the warning, so setting * the number of retries far beyond the number of iterations the loop has been * observed to take. */ #define __hisi_161010101_read_reg(reg) ({ \ u64 _old, _new; \ int _retries = 50; \ \ do { \ _old = read_sysreg(reg); \ _new = read_sysreg(reg); \ _retries--; \ } while (unlikely((_new - _old) >> 5) && _retries); \ \ WARN_ON_ONCE(!_retries); \ _new; \ }) static u64 notrace hisi_161010101_read_cntpct_el0(void) { return __hisi_161010101_read_reg(cntpct_el0); } static u64 notrace hisi_161010101_read_cntvct_el0(void) { return __hisi_161010101_read_reg(cntvct_el0); } static const struct ate_acpi_oem_info hisi_161010101_oem_info[] = { /* * Note that trailing spaces are required to properly match * the OEM table information. */ { .oem_id = "HISI ", .oem_table_id = "HIP05 ", .oem_revision = 0, }, { .oem_id = "HISI ", .oem_table_id = "HIP06 ", .oem_revision = 0, }, { .oem_id = "HISI ", .oem_table_id = "HIP07 ", .oem_revision = 0, }, { /* Sentinel indicating the end of the OEM array */ }, }; #endif #ifdef CONFIG_ARM64_ERRATUM_858921 static u64 notrace arm64_858921_read_cntpct_el0(void) { u64 old, new; old = read_sysreg(cntpct_el0); new = read_sysreg(cntpct_el0); return (((old ^ new) >> 32) & 1) ? old : new; } static u64 notrace arm64_858921_read_cntvct_el0(void) { u64 old, new; old = read_sysreg(cntvct_el0); new = read_sysreg(cntvct_el0); return (((old ^ new) >> 32) & 1) ? old : new; } #endif #ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1 /* * The low bits of the counter registers are indeterminate while bit 10 or * greater is rolling over. Since the counter value can jump both backward * (7ff -> 000 -> 800) and forward (7ff -> fff -> 800), ignore register values * with all ones or all zeros in the low bits. Bound the loop by the maximum * number of CPU cycles in 3 consecutive 24 MHz counter periods. */ #define __sun50i_a64_read_reg(reg) ({ \ u64 _val; \ int _retries = 150; \ \ do { \ _val = read_sysreg(reg); \ _retries--; \ } while (((_val + 1) & GENMASK(8, 0)) <= 1 && _retries); \ \ WARN_ON_ONCE(!_retries); \ _val; \ }) static u64 notrace sun50i_a64_read_cntpct_el0(void) { return __sun50i_a64_read_reg(cntpct_el0); } static u64 notrace sun50i_a64_read_cntvct_el0(void) { return __sun50i_a64_read_reg(cntvct_el0); } #endif #ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND DEFINE_PER_CPU(const struct arch_timer_erratum_workaround *, timer_unstable_counter_workaround); EXPORT_SYMBOL_GPL(timer_unstable_counter_workaround); static atomic_t timer_unstable_counter_workaround_in_use = ATOMIC_INIT(0); /* * Force the inlining of this function so that the register accesses * can be themselves correctly inlined. */ static __always_inline void erratum_set_next_event_generic(const int access, unsigned long evt, struct clock_event_device *clk) { unsigned long ctrl; u64 cval; ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; if (access == ARCH_TIMER_PHYS_ACCESS) { cval = evt + arch_counter_get_cntpct_stable(); write_sysreg(cval, cntp_cval_el0); } else { cval = evt + arch_counter_get_cntvct_stable(); write_sysreg(cval, cntv_cval_el0); } arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); } static __maybe_unused int erratum_set_next_event_virt(unsigned long evt, struct clock_event_device *clk) { erratum_set_next_event_generic(ARCH_TIMER_VIRT_ACCESS, evt, clk); return 0; } static __maybe_unused int erratum_set_next_event_phys(unsigned long evt, struct clock_event_device *clk) { erratum_set_next_event_generic(ARCH_TIMER_PHYS_ACCESS, evt, clk); return 0; } static const struct arch_timer_erratum_workaround ool_workarounds[] = { #ifdef CONFIG_FSL_ERRATUM_A008585 { .match_type = ate_match_dt, .id = "fsl,erratum-a008585", .desc = "Freescale erratum a005858", .read_cntpct_el0 = fsl_a008585_read_cntpct_el0, .read_cntvct_el0 = fsl_a008585_read_cntvct_el0, .set_next_event_phys = erratum_set_next_event_phys, .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_HISILICON_ERRATUM_161010101 { .match_type = ate_match_dt, .id = "hisilicon,erratum-161010101", .desc = "HiSilicon erratum 161010101", .read_cntpct_el0 = hisi_161010101_read_cntpct_el0, .read_cntvct_el0 = hisi_161010101_read_cntvct_el0, .set_next_event_phys = erratum_set_next_event_phys, .set_next_event_virt = erratum_set_next_event_virt, }, { .match_type = ate_match_acpi_oem_info, .id = hisi_161010101_oem_info, .desc = "HiSilicon erratum 161010101", .read_cntpct_el0 = hisi_161010101_read_cntpct_el0, .read_cntvct_el0 = hisi_161010101_read_cntvct_el0, .set_next_event_phys = erratum_set_next_event_phys, .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_ARM64_ERRATUM_858921 { .match_type = ate_match_local_cap_id, .id = (void *)ARM64_WORKAROUND_858921, .desc = "ARM erratum 858921", .read_cntpct_el0 = arm64_858921_read_cntpct_el0, .read_cntvct_el0 = arm64_858921_read_cntvct_el0, .set_next_event_phys = erratum_set_next_event_phys, .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1 { .match_type = ate_match_dt, .id = "allwinner,erratum-unknown1", .desc = "Allwinner erratum UNKNOWN1", .read_cntpct_el0 = sun50i_a64_read_cntpct_el0, .read_cntvct_el0 = sun50i_a64_read_cntvct_el0, .set_next_event_phys = erratum_set_next_event_phys, .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_ARM64_ERRATUM_1418040 { .match_type = ate_match_local_cap_id, .id = (void *)ARM64_WORKAROUND_1418040, .desc = "ARM erratum 1418040", .disable_compat_vdso = true, }, #endif }; typedef bool (*ate_match_fn_t)(const struct arch_timer_erratum_workaround *, const void *); static bool arch_timer_check_dt_erratum(const struct arch_timer_erratum_workaround *wa, const void *arg) { const struct device_node *np = arg; return of_property_read_bool(np, wa->id); } static bool arch_timer_check_local_cap_erratum(const struct arch_timer_erratum_workaround *wa, const void *arg) { return this_cpu_has_cap((uintptr_t)wa->id); } static bool arch_timer_check_acpi_oem_erratum(const struct arch_timer_erratum_workaround *wa, const void *arg) { static const struct ate_acpi_oem_info empty_oem_info = {}; const struct ate_acpi_oem_info *info = wa->id; const struct acpi_table_header *table = arg; /* Iterate over the ACPI OEM info array, looking for a match */ while (memcmp(info, &empty_oem_info, sizeof(*info))) { if (!memcmp(info->oem_id, table->oem_id, ACPI_OEM_ID_SIZE) && !memcmp(info->oem_table_id, table->oem_table_id, ACPI_OEM_TABLE_ID_SIZE) && info->oem_revision == table->oem_revision) return true; info++; } return false; } static const struct arch_timer_erratum_workaround * arch_timer_iterate_errata(enum arch_timer_erratum_match_type type, ate_match_fn_t match_fn, void *arg) { int i; for (i = 0; i < ARRAY_SIZE(ool_workarounds); i++) { if (ool_workarounds[i].match_type != type) continue; if (match_fn(&ool_workarounds[i], arg)) return &ool_workarounds[i]; } return NULL; } static void arch_timer_enable_workaround(const struct arch_timer_erratum_workaround *wa, bool local) { int i; if (local) { __this_cpu_write(timer_unstable_counter_workaround, wa); } else { for_each_possible_cpu(i) per_cpu(timer_unstable_counter_workaround, i) = wa; } if (wa->read_cntvct_el0 || wa->read_cntpct_el0) atomic_set(&timer_unstable_counter_workaround_in_use, 1); /* * Don't use the vdso fastpath if errata require using the * out-of-line counter accessor. We may change our mind pretty * late in the game (with a per-CPU erratum, for example), so * change both the default value and the vdso itself. */ if (wa->read_cntvct_el0) { clocksource_counter.vdso_clock_mode = VDSO_CLOCKMODE_NONE; vdso_default = VDSO_CLOCKMODE_NONE; } else if (wa->disable_compat_vdso && vdso_default != VDSO_CLOCKMODE_NONE) { vdso_default = VDSO_CLOCKMODE_ARCHTIMER_NOCOMPAT; clocksource_counter.vdso_clock_mode = vdso_default; } } static void arch_timer_check_ool_workaround(enum arch_timer_erratum_match_type type, void *arg) { const struct arch_timer_erratum_workaround *wa, *__wa; ate_match_fn_t match_fn = NULL; bool local = false; switch (type) { case ate_match_dt: match_fn = arch_timer_check_dt_erratum; break; case ate_match_local_cap_id: match_fn = arch_timer_check_local_cap_erratum; local = true; break; case ate_match_acpi_oem_info: match_fn = arch_timer_check_acpi_oem_erratum; break; default: WARN_ON(1); return; } wa = arch_timer_iterate_errata(type, match_fn, arg); if (!wa) return; __wa = __this_cpu_read(timer_unstable_counter_workaround); if (__wa && wa != __wa) pr_warn("Can't enable workaround for %s (clashes with %s\n)", wa->desc, __wa->desc); if (__wa) return; arch_timer_enable_workaround(wa, local); pr_info("Enabling %s workaround for %s\n", local ? "local" : "global", wa->desc); } static bool arch_timer_this_cpu_has_cntvct_wa(void) { return has_erratum_handler(read_cntvct_el0); } static bool arch_timer_counter_has_wa(void) { return atomic_read(&timer_unstable_counter_workaround_in_use); } #else #define arch_timer_check_ool_workaround(t,a) do { } while(0) #define arch_timer_this_cpu_has_cntvct_wa() ({false;}) #define arch_timer_counter_has_wa() ({false;}) #endif /* CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND */ static __always_inline irqreturn_t timer_handler(const int access, struct clock_event_device *evt) { unsigned long ctrl; ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, evt); if (ctrl & ARCH_TIMER_CTRL_IT_STAT) { ctrl |= ARCH_TIMER_CTRL_IT_MASK; arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, evt); evt->event_handler(evt); return IRQ_HANDLED; } return IRQ_NONE; } static irqreturn_t arch_timer_handler_virt(int irq, void *dev_id) { struct clock_event_device *evt = dev_id; return timer_handler(ARCH_TIMER_VIRT_ACCESS, evt); } static irqreturn_t arch_timer_handler_phys(int irq, void *dev_id) { struct clock_event_device *evt = dev_id; return timer_handler(ARCH_TIMER_PHYS_ACCESS, evt); } static irqreturn_t arch_timer_handler_phys_mem(int irq, void *dev_id) { struct clock_event_device *evt = dev_id; return timer_handler(ARCH_TIMER_MEM_PHYS_ACCESS, evt); } static irqreturn_t arch_timer_handler_virt_mem(int irq, void *dev_id) { struct clock_event_device *evt = dev_id; return timer_handler(ARCH_TIMER_MEM_VIRT_ACCESS, evt); } static __always_inline int arch_timer_shutdown(const int access, struct clock_event_device *clk) { unsigned long ctrl; ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); ctrl &= ~ARCH_TIMER_CTRL_ENABLE; arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); return 0; } static int arch_timer_shutdown_virt(struct clock_event_device *clk) { return arch_timer_shutdown(ARCH_TIMER_VIRT_ACCESS, clk); } static int arch_timer_shutdown_phys(struct clock_event_device *clk) { return arch_timer_shutdown(ARCH_TIMER_PHYS_ACCESS, clk); } static int arch_timer_shutdown_virt_mem(struct clock_event_device *clk) { return arch_timer_shutdown(ARCH_TIMER_MEM_VIRT_ACCESS, clk); } static int arch_timer_shutdown_phys_mem(struct clock_event_device *clk) { return arch_timer_shutdown(ARCH_TIMER_MEM_PHYS_ACCESS, clk); } static __always_inline void set_next_event(const int access, unsigned long evt, struct clock_event_device *clk) { unsigned long ctrl; u64 cnt; ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; if (access == ARCH_TIMER_PHYS_ACCESS) cnt = __arch_counter_get_cntpct(); else cnt = __arch_counter_get_cntvct(); arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); } static int arch_timer_set_next_event_virt(unsigned long evt, struct clock_event_device *clk) { set_next_event(ARCH_TIMER_VIRT_ACCESS, evt, clk); return 0; } static int arch_timer_set_next_event_phys(unsigned long evt, struct clock_event_device *clk) { set_next_event(ARCH_TIMER_PHYS_ACCESS, evt, clk); return 0; } static noinstr u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo) { u32 cnt_lo, cnt_hi, tmp_hi; do { cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo)); tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4)); } while (cnt_hi != tmp_hi); return ((u64) cnt_hi << 32) | cnt_lo; } static __always_inline void set_next_event_mem(const int access, unsigned long evt, struct clock_event_device *clk) { struct arch_timer *timer = to_arch_timer(clk); unsigned long ctrl; u64 cnt; ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); /* Timer must be disabled before programming CVAL */ if (ctrl & ARCH_TIMER_CTRL_ENABLE) { ctrl &= ~ARCH_TIMER_CTRL_ENABLE; arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); } ctrl |= ARCH_TIMER_CTRL_ENABLE; ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; if (access == ARCH_TIMER_MEM_VIRT_ACCESS) cnt = arch_counter_get_cnt_mem(timer, CNTVCT_LO); else cnt = arch_counter_get_cnt_mem(timer, CNTPCT_LO); arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk); arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); } static int arch_timer_set_next_event_virt_mem(unsigned long evt, struct clock_event_device *clk) { set_next_event_mem(ARCH_TIMER_MEM_VIRT_ACCESS, evt, clk); return 0; } static int arch_timer_set_next_event_phys_mem(unsigned long evt, struct clock_event_device *clk) { set_next_event_mem(ARCH_TIMER_MEM_PHYS_ACCESS, evt, clk); return 0; } static u64 __arch_timer_check_delta(void) { #ifdef CONFIG_ARM64 const struct midr_range broken_cval_midrs[] = { /* * XGene-1 implements CVAL in terms of TVAL, meaning * that the maximum timer range is 32bit. Shame on them. * * Note that TVAL is signed, thus has only 31 of its * 32 bits to express magnitude. */ MIDR_REV_RANGE(MIDR_CPU_MODEL(ARM_CPU_IMP_APM, APM_CPU_PART_XGENE), APM_CPU_VAR_POTENZA, 0x0, 0xf), {}, }; if (is_midr_in_range_list(broken_cval_midrs)) { pr_warn_once("Broken CNTx_CVAL_EL1, using 31 bit TVAL instead.\n"); return CLOCKSOURCE_MASK(31); } #endif return CLOCKSOURCE_MASK(arch_counter_get_width()); } static void __arch_timer_setup(unsigned type, struct clock_event_device *clk) { u64 max_delta; clk->features = CLOCK_EVT_FEAT_ONESHOT; if (type == ARCH_TIMER_TYPE_CP15) { typeof(clk->set_next_event) sne; arch_timer_check_ool_workaround(ate_match_local_cap_id, NULL); if (arch_timer_c3stop) clk->features |= CLOCK_EVT_FEAT_C3STOP; clk->name = "arch_sys_timer"; clk->rating = 450; clk->cpumask = cpumask_of(smp_processor_id()); clk->irq = arch_timer_ppi[arch_timer_uses_ppi]; switch (arch_timer_uses_ppi) { case ARCH_TIMER_VIRT_PPI: clk->set_state_shutdown = arch_timer_shutdown_virt; clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; sne = erratum_handler(set_next_event_virt); break; case ARCH_TIMER_PHYS_SECURE_PPI: case ARCH_TIMER_PHYS_NONSECURE_PPI: case ARCH_TIMER_HYP_PPI: clk->set_state_shutdown = arch_timer_shutdown_phys; clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; sne = erratum_handler(set_next_event_phys); break; default: BUG(); } clk->set_next_event = sne; max_delta = __arch_timer_check_delta(); } else { clk->features |= CLOCK_EVT_FEAT_DYNIRQ; clk->name = "arch_mem_timer"; clk->rating = 400; clk->cpumask = cpu_possible_mask; if (arch_timer_mem_use_virtual) { clk->set_state_shutdown = arch_timer_shutdown_virt_mem; clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem; clk->set_next_event = arch_timer_set_next_event_virt_mem; } else { clk->set_state_shutdown = arch_timer_shutdown_phys_mem; clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem; clk->set_next_event = arch_timer_set_next_event_phys_mem; } max_delta = CLOCKSOURCE_MASK(56); } clk->set_state_shutdown(clk); clockevents_config_and_register(clk, arch_timer_rate, 0xf, max_delta); } static void arch_timer_evtstrm_enable(unsigned int divider) { u32 cntkctl = arch_timer_get_cntkctl(); #ifdef CONFIG_ARM64 /* ECV is likely to require a large divider. Use the EVNTIS flag. */ if (cpus_have_final_cap(ARM64_HAS_ECV) && divider > 15) { cntkctl |= ARCH_TIMER_EVT_INTERVAL_SCALE; divider -= 8; } #endif divider = min(divider, 15U); cntkctl &= ~ARCH_TIMER_EVT_TRIGGER_MASK; /* Set the divider and enable virtual event stream */ cntkctl |= (divider << ARCH_TIMER_EVT_TRIGGER_SHIFT) | ARCH_TIMER_VIRT_EVT_EN; arch_timer_set_cntkctl(cntkctl); arch_timer_set_evtstrm_feature(); cpumask_set_cpu(smp_processor_id(), &evtstrm_available); } static void arch_timer_configure_evtstream(void) { int evt_stream_div, lsb; /* * As the event stream can at most be generated at half the frequency * of the counter, use half the frequency when computing the divider. */ evt_stream_div = arch_timer_rate / ARCH_TIMER_EVT_STREAM_FREQ / 2; /* * Find the closest power of two to the divisor. If the adjacent bit * of lsb (last set bit, starts from 0) is set, then we use (lsb + 1). */ lsb = fls(evt_stream_div) - 1; if (lsb > 0 && (evt_stream_div & BIT(lsb - 1))) lsb++; /* enable event stream */ arch_timer_evtstrm_enable(max(0, lsb)); } static int arch_timer_evtstrm_starting_cpu(unsigned int cpu) { arch_timer_configure_evtstream(); return 0; } static int arch_timer_evtstrm_dying_cpu(unsigned int cpu) { cpumask_clear_cpu(smp_processor_id(), &evtstrm_available); return 0; } static int __init arch_timer_evtstrm_register(void) { if (!arch_timer_evt || !evtstrm_enable) return 0; return cpuhp_setup_state(CPUHP_AP_ARM_ARCH_TIMER_EVTSTRM_STARTING, "clockevents/arm/arch_timer_evtstrm:starting", arch_timer_evtstrm_starting_cpu, arch_timer_evtstrm_dying_cpu); } core_initcall(arch_timer_evtstrm_register); static void arch_counter_set_user_access(void) { u32 cntkctl = arch_timer_get_cntkctl(); /* Disable user access to the timers and both counters */ /* Also disable virtual event stream */ cntkctl &= ~(ARCH_TIMER_USR_PT_ACCESS_EN | ARCH_TIMER_USR_VT_ACCESS_EN | ARCH_TIMER_USR_VCT_ACCESS_EN | ARCH_TIMER_VIRT_EVT_EN | ARCH_TIMER_USR_PCT_ACCESS_EN); /* * Enable user access to the virtual counter if it doesn't * need to be workaround. The vdso may have been already * disabled though. */ if (arch_timer_this_cpu_has_cntvct_wa()) pr_info("CPU%d: Trapping CNTVCT access\n", smp_processor_id()); else cntkctl |= ARCH_TIMER_USR_VCT_ACCESS_EN; arch_timer_set_cntkctl(cntkctl); } static bool arch_timer_has_nonsecure_ppi(void) { return (arch_timer_uses_ppi == ARCH_TIMER_PHYS_SECURE_PPI && arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]); } static u32 check_ppi_trigger(int irq) { u32 flags = irq_get_trigger_type(irq); if (flags != IRQF_TRIGGER_HIGH && flags != IRQF_TRIGGER_LOW) { pr_warn("WARNING: Invalid trigger for IRQ%d, assuming level low\n", irq); pr_warn("WARNING: Please fix your firmware\n"); flags = IRQF_TRIGGER_LOW; } return flags; } static int arch_timer_starting_cpu(unsigned int cpu) { struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt); u32 flags; __arch_timer_setup(ARCH_TIMER_TYPE_CP15, clk); flags = check_ppi_trigger(arch_timer_ppi[arch_timer_uses_ppi]); enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], flags); if (arch_timer_has_nonsecure_ppi()) { flags = check_ppi_trigger(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]); enable_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI], flags); } arch_counter_set_user_access(); return 0; } static int validate_timer_rate(void) { if (!arch_timer_rate) return -EINVAL; /* Arch timer frequency < 1MHz can cause trouble */ WARN_ON(arch_timer_rate < 1000000); return 0; } /* * For historical reasons, when probing with DT we use whichever (non-zero) * rate was probed first, and don't verify that others match. If the first node * probed has a clock-frequency property, this overrides the HW register. */ static void __init arch_timer_of_configure_rate(u32 rate, struct device_node *np) { /* Who has more than one independent system counter? */ if (arch_timer_rate) return; if (of_property_read_u32(np, "clock-frequency", &arch_timer_rate)) arch_timer_rate = rate; /* Check the timer frequency. */ if (validate_timer_rate()) pr_warn("frequency not available\n"); } static void __init arch_timer_banner(unsigned type) { pr_info("%s%s%s timer(s) running at %lu.%02luMHz (%s%s%s).\n", type & ARCH_TIMER_TYPE_CP15 ? "cp15" : "", type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? " and " : "", type & ARCH_TIMER_TYPE_MEM ? "mmio" : "", (unsigned long)arch_timer_rate / 1000000, (unsigned long)(arch_timer_rate / 10000) % 100, type & ARCH_TIMER_TYPE_CP15 ? (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ? "virt" : "phys" : "", type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM) ? "/" : "", type & ARCH_TIMER_TYPE_MEM ? arch_timer_mem_use_virtual ? "virt" : "phys" : ""); } u32 arch_timer_get_rate(void) { return arch_timer_rate; } bool arch_timer_evtstrm_available(void) { /* * We might get called from a preemptible context. This is fine * because availability of the event stream should be always the same * for a preemptible context and context where we might resume a task. */ return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available); } static noinstr u64 arch_counter_get_cntvct_mem(void) { return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO); } static struct arch_timer_kvm_info arch_timer_kvm_info; struct arch_timer_kvm_info *arch_timer_get_kvm_info(void) { return &arch_timer_kvm_info; } static void __init arch_counter_register(unsigned type) { u64 (*scr)(void); u64 start_count; int width; /* Register the CP15 based counter if we have one */ if (type & ARCH_TIMER_TYPE_CP15) { u64 (*rd)(void); if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) || arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) { if (arch_timer_counter_has_wa()) { rd = arch_counter_get_cntvct_stable; scr = raw_counter_get_cntvct_stable; } else { rd = arch_counter_get_cntvct; scr = arch_counter_get_cntvct; } } else { if (arch_timer_counter_has_wa()) { rd = arch_counter_get_cntpct_stable; scr = raw_counter_get_cntpct_stable; } else { rd = arch_counter_get_cntpct; scr = arch_counter_get_cntpct; } } arch_timer_read_counter = rd; clocksource_counter.vdso_clock_mode = vdso_default; } else { arch_timer_read_counter = arch_counter_get_cntvct_mem; scr = arch_counter_get_cntvct_mem; } width = arch_counter_get_width(); clocksource_counter.mask = CLOCKSOURCE_MASK(width); cyclecounter.mask = CLOCKSOURCE_MASK(width); if (!arch_counter_suspend_stop) clocksource_counter.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; start_count = arch_timer_read_counter(); clocksource_register_hz(&clocksource_counter, arch_timer_rate); cyclecounter.mult = clocksource_counter.mult; cyclecounter.shift = clocksource_counter.shift; timecounter_init(&arch_timer_kvm_info.timecounter, &cyclecounter, start_count); sched_clock_register(scr, width, arch_timer_rate); } static void arch_timer_stop(struct clock_event_device *clk) { pr_debug("disable IRQ%d cpu #%d\n", clk->irq, smp_processor_id()); disable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi]); if (arch_timer_has_nonsecure_ppi()) disable_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]); } static int arch_timer_dying_cpu(unsigned int cpu) { struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt); arch_timer_stop(clk); return 0; } #ifdef CONFIG_CPU_PM static DEFINE_PER_CPU(unsigned long, saved_cntkctl); static int arch_timer_cpu_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { if (action == CPU_PM_ENTER) { __this_cpu_write(saved_cntkctl, arch_timer_get_cntkctl()); cpumask_clear_cpu(smp_processor_id(), &evtstrm_available); } else if (action == CPU_PM_ENTER_FAILED || action == CPU_PM_EXIT) { arch_timer_set_cntkctl(__this_cpu_read(saved_cntkctl)); if (arch_timer_have_evtstrm_feature()) cpumask_set_cpu(smp_processor_id(), &evtstrm_available); } return NOTIFY_OK; } static struct notifier_block arch_timer_cpu_pm_notifier = { .notifier_call = arch_timer_cpu_pm_notify, }; static int __init arch_timer_cpu_pm_init(void) { return cpu_pm_register_notifier(&arch_timer_cpu_pm_notifier); } static void __init arch_timer_cpu_pm_deinit(void) { WARN_ON(cpu_pm_unregister_notifier(&arch_timer_cpu_pm_notifier)); } #else static int __init arch_timer_cpu_pm_init(void) { return 0; } static void __init arch_timer_cpu_pm_deinit(void) { } #endif static int __init arch_timer_register(void) { int err; int ppi; arch_timer_evt = alloc_percpu(struct clock_event_device); if (!arch_timer_evt) { err = -ENOMEM; goto out; } ppi = arch_timer_ppi[arch_timer_uses_ppi]; switch (arch_timer_uses_ppi) { case ARCH_TIMER_VIRT_PPI: err = request_percpu_irq(ppi, arch_timer_handler_virt, "arch_timer", arch_timer_evt); break; case ARCH_TIMER_PHYS_SECURE_PPI: case ARCH_TIMER_PHYS_NONSECURE_PPI: err = request_percpu_irq(ppi, arch_timer_handler_phys, "arch_timer", arch_timer_evt); if (!err && arch_timer_has_nonsecure_ppi()) { ppi = arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]; err = request_percpu_irq(ppi, arch_timer_handler_phys, "arch_timer", arch_timer_evt); if (err) free_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_SECURE_PPI], arch_timer_evt); } break; case ARCH_TIMER_HYP_PPI: err = request_percpu_irq(ppi, arch_timer_handler_phys, "arch_timer", arch_timer_evt); break; default: BUG(); } if (err) { pr_err("can't register interrupt %d (%d)\n", ppi, err); goto out_free; } err = arch_timer_cpu_pm_init(); if (err) goto out_unreg_notify; /* Register and immediately configure the timer on the boot CPU */ err = cpuhp_setup_state(CPUHP_AP_ARM_ARCH_TIMER_STARTING, "clockevents/arm/arch_timer:starting", arch_timer_starting_cpu, arch_timer_dying_cpu); if (err) goto out_unreg_cpupm; return 0; out_unreg_cpupm: arch_timer_cpu_pm_deinit(); out_unreg_notify: free_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], arch_timer_evt); if (arch_timer_has_nonsecure_ppi()) free_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI], arch_timer_evt); out_free: free_percpu(arch_timer_evt); arch_timer_evt = NULL; out: return err; } static int __init arch_timer_mem_register(void __iomem *base, unsigned int irq) { int ret; irq_handler_t func; arch_timer_mem = kzalloc(sizeof(*arch_timer_mem), GFP_KERNEL); if (!arch_timer_mem) return -ENOMEM; arch_timer_mem->base = base; arch_timer_mem->evt.irq = irq; __arch_timer_setup(ARCH_TIMER_TYPE_MEM, &arch_timer_mem->evt); if (arch_timer_mem_use_virtual) func = arch_timer_handler_virt_mem; else func = arch_timer_handler_phys_mem; ret = request_irq(irq, func, IRQF_TIMER, "arch_mem_timer", &arch_timer_mem->evt); if (ret) { pr_err("Failed to request mem timer irq\n"); kfree(arch_timer_mem); arch_timer_mem = NULL; } return ret; } static const struct of_device_id arch_timer_of_match[] __initconst = { { .compatible = "arm,armv7-timer", }, { .compatible = "arm,armv8-timer", }, {}, }; static const struct of_device_id arch_timer_mem_of_match[] __initconst = { { .compatible = "arm,armv7-timer-mem", }, {}, }; static bool __init arch_timer_needs_of_probing(void) { struct device_node *dn; bool needs_probing = false; unsigned int mask = ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM; /* We have two timers, and both device-tree nodes are probed. */ if ((arch_timers_present & mask) == mask) return false; /* * Only one type of timer is probed, * check if we have another type of timer node in device-tree. */ if (arch_timers_present & ARCH_TIMER_TYPE_CP15) dn = of_find_matching_node(NULL, arch_timer_mem_of_match); else dn = of_find_matching_node(NULL, arch_timer_of_match); if (dn && of_device_is_available(dn)) needs_probing = true; of_node_put(dn); return needs_probing; } static int __init arch_timer_common_init(void) { arch_timer_banner(arch_timers_present); arch_counter_register(arch_timers_present); return arch_timer_arch_init(); } /** * arch_timer_select_ppi() - Select suitable PPI for the current system. * * If HYP mode is available, we know that the physical timer * has been configured to be accessible from PL1. Use it, so * that a guest can use the virtual timer instead. * * On ARMv8.1 with VH extensions, the kernel runs in HYP. VHE * accesses to CNTP_*_EL1 registers are silently redirected to * their CNTHP_*_EL2 counterparts, and use a different PPI * number. * * If no interrupt provided for virtual timer, we'll have to * stick to the physical timer. It'd better be accessible... * For arm64 we never use the secure interrupt. * * Return: a suitable PPI type for the current system. */ static enum arch_timer_ppi_nr __init arch_timer_select_ppi(void) { if (is_kernel_in_hyp_mode()) return ARCH_TIMER_HYP_PPI; if (!is_hyp_mode_available() && arch_timer_ppi[ARCH_TIMER_VIRT_PPI]) return ARCH_TIMER_VIRT_PPI; if (IS_ENABLED(CONFIG_ARM64)) return ARCH_TIMER_PHYS_NONSECURE_PPI; return ARCH_TIMER_PHYS_SECURE_PPI; } static void __init arch_timer_populate_kvm_info(void) { arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI]; if (is_kernel_in_hyp_mode()) arch_timer_kvm_info.physical_irq = arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]; } static int __init arch_timer_of_init(struct device_node *np) { int i, irq, ret; u32 rate; bool has_names; if (arch_timers_present & ARCH_TIMER_TYPE_CP15) { pr_warn("multiple nodes in dt, skipping\n"); return 0; } arch_timers_present |= ARCH_TIMER_TYPE_CP15; has_names = of_property_present(np, "interrupt-names"); for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) { if (has_names) irq = of_irq_get_byname(np, arch_timer_ppi_names[i]); else irq = of_irq_get(np, i); if (irq > 0) arch_timer_ppi[i] = irq; } arch_timer_populate_kvm_info(); rate = arch_timer_get_cntfrq(); arch_timer_of_configure_rate(rate, np); arch_timer_c3stop = !of_property_read_bool(np, "always-on"); /* Check for globally applicable workarounds */ arch_timer_check_ool_workaround(ate_match_dt, np); /* * If we cannot rely on firmware initializing the timer registers then * we should use the physical timers instead. */ if (IS_ENABLED(CONFIG_ARM) && of_property_read_bool(np, "arm,cpu-registers-not-fw-configured")) arch_timer_uses_ppi = ARCH_TIMER_PHYS_SECURE_PPI; else arch_timer_uses_ppi = arch_timer_select_ppi(); if (!arch_timer_ppi[arch_timer_uses_ppi]) { pr_err("No interrupt available, giving up\n"); return -EINVAL; } /* On some systems, the counter stops ticking when in suspend. */ arch_counter_suspend_stop = of_property_read_bool(np, "arm,no-tick-in-suspend"); ret = arch_timer_register(); if (ret) return ret; if (arch_timer_needs_of_probing()) return 0; return arch_timer_common_init(); } TIMER_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init); TIMER_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init); static u32 __init arch_timer_mem_frame_get_cntfrq(struct arch_timer_mem_frame *frame) { void __iomem *base; u32 rate; base = ioremap(frame->cntbase, frame->size); if (!base) { pr_err("Unable to map frame @ %pa\n", &frame->cntbase); return 0; } rate = readl_relaxed(base + CNTFRQ); iounmap(base); return rate; } static struct arch_timer_mem_frame * __init arch_timer_mem_find_best_frame(struct arch_timer_mem *timer_mem) { struct arch_timer_mem_frame *frame, *best_frame = NULL; void __iomem *cntctlbase; u32 cnttidr; int i; cntctlbase = ioremap(timer_mem->cntctlbase, timer_mem->size); if (!cntctlbase) { pr_err("Can't map CNTCTLBase @ %pa\n", &timer_mem->cntctlbase); return NULL; } cnttidr = readl_relaxed(cntctlbase + CNTTIDR); /* * Try to find a virtual capable frame. Otherwise fall back to a * physical capable frame. */ for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { u32 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT | CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT; frame = &timer_mem->frame[i]; if (!frame->valid) continue; /* Try enabling everything, and see what sticks */ writel_relaxed(cntacr, cntctlbase + CNTACR(i)); cntacr = readl_relaxed(cntctlbase + CNTACR(i)); if ((cnttidr & CNTTIDR_VIRT(i)) && !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) { best_frame = frame; arch_timer_mem_use_virtual = true; break; } if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT)) continue; best_frame = frame; } iounmap(cntctlbase); return best_frame; } static int __init arch_timer_mem_frame_register(struct arch_timer_mem_frame *frame) { void __iomem *base; int ret, irq; if (arch_timer_mem_use_virtual) irq = frame->virt_irq; else irq = frame->phys_irq; if (!irq) { pr_err("Frame missing %s irq.\n", arch_timer_mem_use_virtual ? "virt" : "phys"); return -EINVAL; } if (!request_mem_region(frame->cntbase, frame->size, "arch_mem_timer")) return -EBUSY; base = ioremap(frame->cntbase, frame->size); if (!base) { pr_err("Can't map frame's registers\n"); return -ENXIO; } ret = arch_timer_mem_register(base, irq); if (ret) { iounmap(base); return ret; } arch_timers_present |= ARCH_TIMER_TYPE_MEM; return 0; } static int __init arch_timer_mem_of_init(struct device_node *np) { struct arch_timer_mem *timer_mem; struct arch_timer_mem_frame *frame; struct resource res; int ret = -EINVAL; u32 rate; timer_mem = kzalloc(sizeof(*timer_mem), GFP_KERNEL); if (!timer_mem) return -ENOMEM; if (of_address_to_resource(np, 0, &res)) goto out; timer_mem->cntctlbase = res.start; timer_mem->size = resource_size(&res); for_each_available_child_of_node_scoped(np, frame_node) { u32 n; struct arch_timer_mem_frame *frame; if (of_property_read_u32(frame_node, "frame-number", &n)) { pr_err(FW_BUG "Missing frame-number.\n"); goto out; } if (n >= ARCH_TIMER_MEM_MAX_FRAMES) { pr_err(FW_BUG "Wrong frame-number, only 0-%u are permitted.\n", ARCH_TIMER_MEM_MAX_FRAMES - 1); goto out; } frame = &timer_mem->frame[n]; if (frame->valid) { pr_err(FW_BUG "Duplicated frame-number.\n"); goto out; } if (of_address_to_resource(frame_node, 0, &res)) goto out; frame->cntbase = res.start; frame->size = resource_size(&res); frame->virt_irq = irq_of_parse_and_map(frame_node, ARCH_TIMER_VIRT_SPI); frame->phys_irq = irq_of_parse_and_map(frame_node, ARCH_TIMER_PHYS_SPI); frame->valid = true; } frame = arch_timer_mem_find_best_frame(timer_mem); if (!frame) { pr_err("Unable to find a suitable frame in timer @ %pa\n", &timer_mem->cntctlbase); ret = -EINVAL; goto out; } rate = arch_timer_mem_frame_get_cntfrq(frame); arch_timer_of_configure_rate(rate, np); ret = arch_timer_mem_frame_register(frame); if (!ret && !arch_timer_needs_of_probing()) ret = arch_timer_common_init(); out: kfree(timer_mem); return ret; } TIMER_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", arch_timer_mem_of_init); #ifdef CONFIG_ACPI_GTDT static int __init arch_timer_mem_verify_cntfrq(struct arch_timer_mem *timer_mem) { struct arch_timer_mem_frame *frame; u32 rate; int i; for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) { frame = &timer_mem->frame[i]; if (!frame->valid) continue; rate = arch_timer_mem_frame_get_cntfrq(frame); if (rate == arch_timer_rate) continue; pr_err(FW_BUG "CNTFRQ mismatch: frame @ %pa: (0x%08lx), CPU: (0x%08lx)\n", &frame->cntbase, (unsigned long)rate, (unsigned long)arch_timer_rate); return -EINVAL; } return 0; } static int __init arch_timer_mem_acpi_init(int platform_timer_count) { struct arch_timer_mem *timers, *timer; struct arch_timer_mem_frame *frame, *best_frame = NULL; int timer_count, i, ret = 0; timers = kcalloc(platform_timer_count, sizeof(*timers), GFP_KERNEL); if (!timers) return -ENOMEM; ret = acpi_arch_timer_mem_init(timers, &timer_count); if (ret || !timer_count) goto out; /* * While unlikely, it's theoretically possible that none of the frames * in a timer expose the combination of feature we want. */ for (i = 0; i < timer_count; i++) { timer = &timers[i]; frame = arch_timer_mem_find_best_frame(timer); if (!best_frame) best_frame = frame; ret = arch_timer_mem_verify_cntfrq(timer); if (ret) { pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n"); goto out; } if (!best_frame) /* implies !frame */ /* * Only complain about missing suitable frames if we * haven't already found one in a previous iteration. */ pr_err("Unable to find a suitable frame in timer @ %pa\n", &timer->cntctlbase); } if (best_frame) ret = arch_timer_mem_frame_register(best_frame); out: kfree(timers); return ret; } /* Initialize per-processor generic timer and memory-mapped timer(if present) */ static int __init arch_timer_acpi_init(struct acpi_table_header *table) { int ret, platform_timer_count; if (arch_timers_present & ARCH_TIMER_TYPE_CP15) { pr_warn("already initialized, skipping\n"); return -EINVAL; } arch_timers_present |= ARCH_TIMER_TYPE_CP15; ret = acpi_gtdt_init(table, &platform_timer_count); if (ret) return ret; arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI] = acpi_gtdt_map_ppi(ARCH_TIMER_PHYS_NONSECURE_PPI); arch_timer_ppi[ARCH_TIMER_VIRT_PPI] = acpi_gtdt_map_ppi(ARCH_TIMER_VIRT_PPI); arch_timer_ppi[ARCH_TIMER_HYP_PPI] = acpi_gtdt_map_ppi(ARCH_TIMER_HYP_PPI); arch_timer_populate_kvm_info(); /* * When probing via ACPI, we have no mechanism to override the sysreg * CNTFRQ value. This *must* be correct. */ arch_timer_rate = arch_timer_get_cntfrq(); ret = validate_timer_rate(); if (ret) { pr_err(FW_BUG "frequency not available.\n"); return ret; } arch_timer_uses_ppi = arch_timer_select_ppi(); if (!arch_timer_ppi[arch_timer_uses_ppi]) { pr_err("No interrupt available, giving up\n"); return -EINVAL; } /* Always-on capability */ arch_timer_c3stop = acpi_gtdt_c3stop(arch_timer_uses_ppi); /* Check for globally applicable workarounds */ arch_timer_check_ool_workaround(ate_match_acpi_oem_info, table); ret = arch_timer_register(); if (ret) return ret; if (platform_timer_count && arch_timer_mem_acpi_init(platform_timer_count)) pr_err("Failed to initialize memory-mapped timer.\n"); return arch_timer_common_init(); } TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init); #endif int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *ts, enum clocksource_ids *cs_id) { struct arm_smccc_res hvc_res; u32 ptp_counter; ktime_t ktime; if (!IS_ENABLED(CONFIG_HAVE_ARM_SMCCC_DISCOVERY)) return -EOPNOTSUPP; if (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ptp_counter = KVM_PTP_VIRT_COUNTER; else ptp_counter = KVM_PTP_PHYS_COUNTER; arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID, ptp_counter, &hvc_res); if ((int)(hvc_res.a0) < 0) return -EOPNOTSUPP; ktime = (u64)hvc_res.a0 << 32 | hvc_res.a1; *ts = ktime_to_timespec64(ktime); if (cycle) *cycle = (u64)hvc_res.a2 << 32 | hvc_res.a3; if (cs_id) *cs_id = CSID_ARM_ARCH_COUNTER; return 0; } EXPORT_SYMBOL_GPL(kvm_arch_ptp_get_crosststamp);
306 307 307 308 288 290 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_MMAN_H__ #define __ASM_MMAN_H__ #include <uapi/asm/mman.h> #ifndef BUILD_VDSO #include <linux/compiler.h> #include <linux/fs.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/types.h> static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, unsigned long pkey) { unsigned long ret = 0; if (system_supports_bti() && (prot & PROT_BTI)) ret |= VM_ARM64_BTI; if (system_supports_mte() && (prot & PROT_MTE)) ret |= VM_MTE; #ifdef CONFIG_ARCH_HAS_PKEYS if (system_supports_poe()) { ret |= pkey & BIT(0) ? VM_PKEY_BIT0 : 0; ret |= pkey & BIT(1) ? VM_PKEY_BIT1 : 0; ret |= pkey & BIT(2) ? VM_PKEY_BIT2 : 0; } #endif return ret; } #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) static inline unsigned long arch_calc_vm_flag_bits(struct file *file, unsigned long flags) { /* * Only allow MTE on anonymous mappings as these are guaranteed to be * backed by tags-capable memory. The vm_flags may be overridden by a * filesystem supporting MTE (RAM-based). */ if (system_supports_mte()) { if (flags & (MAP_ANONYMOUS | MAP_HUGETLB)) return VM_MTE_ALLOWED; if (shmem_file(file) || is_file_hugepages(file)) return VM_MTE_ALLOWED; } return 0; } #define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags) static inline bool arch_validate_prot(unsigned long prot, unsigned long addr __always_unused) { unsigned long supported = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM; if (system_supports_bti()) supported |= PROT_BTI; if (system_supports_mte()) supported |= PROT_MTE; return (prot & ~supported) == 0; } #define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr) static inline bool arch_validate_flags(unsigned long vm_flags) { if (system_supports_mte()) { /* * only allow VM_MTE if VM_MTE_ALLOWED has been set * previously */ if ((vm_flags & VM_MTE) && !(vm_flags & VM_MTE_ALLOWED)) return false; } if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { /* An executable GCS isn't a good idea. */ if (vm_flags & VM_EXEC) return false; /* The memory management core should prevent this */ VM_WARN_ON(vm_flags & VM_SHARED); } return true; } #define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags) #endif /* !BUILD_VDSO */ #endif /* ! __ASM_MMAN_H__ */
258 441 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_JUMP_LABEL_H #define _LINUX_JUMP_LABEL_H /* * Jump label support * * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra * * DEPRECATED API: * * The use of 'struct static_key' directly, is now DEPRECATED. In addition * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following: * * struct static_key false = STATIC_KEY_INIT_FALSE; * struct static_key true = STATIC_KEY_INIT_TRUE; * static_key_true() * static_key_false() * * The updated API replacements are: * * DEFINE_STATIC_KEY_TRUE(key); * DEFINE_STATIC_KEY_FALSE(key); * DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count); * DEFINE_STATIC_KEY_ARRAY_FALSE(keys, count); * static_branch_likely() * static_branch_unlikely() * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support, if we * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)", * an "if (static_branch_unlikely(&key))" statement is an unconditional branch * (which defaults to false - and the true block is placed out of line). * Similarly, we can define an initially true key via * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same * "if (static_branch_unlikely(&key))", in which case we will generate an * unconditional branch to the out-of-line true branch. Keys that are * initially true or false can be using in both static_branch_unlikely() * and static_branch_likely() statements. * * At runtime we can change the branch target by setting the key * to true via a call to static_branch_enable(), or false using * static_branch_disable(). If the direction of the branch is switched by * these calls then we run-time modify the branch target via a * no-op -> jump or jump -> no-op conversion. For example, for an * initially false key that is used in an "if (static_branch_unlikely(&key))" * statement, setting the key to true requires us to patch in a jump * to the out-of-line of true branch. * * In addition to static_branch_{enable,disable}, we can also reference count * the key or branch direction via static_branch_{inc,dec}. Thus, * static_branch_inc() can be thought of as a 'make more true' and * static_branch_dec() as a 'make more false'. * * Since this relies on modifying code, the branch modifying functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional, their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total * effect is a single NOP of appropriate size. The on case will patch in a jump * to the out-of-line block. * * When the control is directly exposed to userspace, it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) * cause significant performance degradation. Struct static_key_deferred and * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, static keys fall back to a * simple conditional branch. * * Additional babbling in: Documentation/staging/static-keys.rst */ #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/compiler.h> #include <linux/cleanup.h> extern bool static_key_initialized; #define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized, \ "%s(): static key '%pS' used before call to jump_label_init()", \ __func__, (key)) struct static_key { atomic_t enabled; #ifdef CONFIG_JUMP_LABEL /* * Note: * To make anonymous unions work with old compilers, the static * initialization of them requires brackets. This creates a dependency * on the order of the struct with the initializers. If any fields * are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need * to be modified. * * bit 0 => 1 if key is initially true * 0 if initially false * bit 1 => 1 if points to struct static_key_mod * 0 if points to struct jump_entry */ union { unsigned long type; struct jump_entry *entries; struct static_key_mod *next; }; #endif /* CONFIG_JUMP_LABEL */ }; #endif /* __ASSEMBLY__ */ #ifdef CONFIG_JUMP_LABEL #include <asm/jump_label.h> #ifndef __ASSEMBLY__ #ifdef CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE struct jump_entry { s32 code; s32 target; long key; // key may be far away from the core kernel under KASLR }; static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return (unsigned long)&entry->code + entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return (unsigned long)&entry->target + entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { long offset = entry->key & ~3L; return (struct static_key *)((unsigned long)&entry->key + offset); } #else static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { return (struct static_key *)((unsigned long)entry->key & ~3UL); } #endif static inline bool jump_entry_is_branch(const struct jump_entry *entry) { return (unsigned long)entry->key & 1UL; } static inline bool jump_entry_is_init(const struct jump_entry *entry) { return (unsigned long)entry->key & 2UL; } static inline void jump_entry_set_init(struct jump_entry *entry, bool set) { if (set) entry->key |= 2; else entry->key &= ~2; } static inline int jump_entry_size(struct jump_entry *entry) { #ifdef JUMP_LABEL_NOP_SIZE return JUMP_LABEL_NOP_SIZE; #else return arch_jump_entry_size(entry); #endif } #endif #endif #ifndef __ASSEMBLY__ enum jump_label_type { JUMP_LABEL_NOP = 0, JUMP_LABEL_JMP, }; struct module; #ifdef CONFIG_JUMP_LABEL #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL #define JUMP_TYPE_LINKED 2UL #define JUMP_TYPE_MASK 3UL static __always_inline bool static_key_false(struct static_key *key) { return arch_static_branch(key, false); } static __always_inline bool static_key_true(struct static_key *key) { return !arch_static_branch(key, true); } extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); extern void jump_label_init_ro(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type); extern bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); extern bool static_key_slow_inc(struct static_key *key); extern bool static_key_fast_inc_not_disabled(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); extern bool static_key_slow_inc_cpuslocked(struct static_key *key); extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); extern void static_key_disable(struct static_key *key); extern void static_key_enable_cpuslocked(struct static_key *key); extern void static_key_disable_cpuslocked(struct static_key *key); extern enum jump_label_type jump_label_init_type(struct jump_entry *entry); /* * We should be using ATOMIC_INIT() for initializing .enabled, but * the inclusion of atomic.h is problematic for inclusion of jump_label.h * in 'low-level' headers. Thus, we are initializing .enabled with a * raw value, but have added a BUILD_BUG_ON() to catch any issues in * jump_label_init() see: kernel/jump_label.c. */ #define STATIC_KEY_INIT_TRUE \ { .enabled = { 1 }, \ { .type = JUMP_TYPE_TRUE } } #define STATIC_KEY_INIT_FALSE \ { .enabled = { 0 }, \ { .type = JUMP_TYPE_FALSE } } #else /* !CONFIG_JUMP_LABEL */ #include <linux/atomic.h> #include <linux/bug.h> static __always_inline int static_key_count(struct static_key *key) { return raw_atomic_read(&key->enabled); } static __always_inline void jump_label_init(void) { static_key_initialized = true; } static __always_inline void jump_label_init_ro(void) { } static __always_inline bool static_key_false(struct static_key *key) { if (unlikely_notrace(static_key_count(key) > 0)) return true; return false; } static __always_inline bool static_key_true(struct static_key *key) { if (likely_notrace(static_key_count(key) > 0)) return true; return false; } static inline bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Prevent key->enabled getting negative to follow the same semantics * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment. */ v = atomic_read(&key->enabled); do { if (v < 0 || (v + 1) < 0) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } #define static_key_slow_inc(key) static_key_fast_inc_not_disabled(key) static inline void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); atomic_dec(&key->enabled); } #define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key) #define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key) static inline int jump_label_text_reserved(void *start, void *end) { return 0; } static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} static inline void static_key_enable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } atomic_set(&key->enabled, 1); } static inline void static_key_disable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } atomic_set(&key->enabled, 0); } #define static_key_enable_cpuslocked(k) static_key_enable((k)) #define static_key_disable_cpuslocked(k) static_key_disable((k)) #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } #endif /* CONFIG_JUMP_LABEL */ DEFINE_LOCK_GUARD_0(jump_label_lock, jump_label_lock(), jump_label_unlock()) #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled /* -------------------------------------------------------------------------- */ /* * Two type wrappers around static_key, such that we can use compile time * type differentiation to emit the right code. * * All the below code is macros in order to play type games. */ struct static_key_true { struct static_key key; }; struct static_key_false { struct static_key key; }; #define STATIC_KEY_TRUE_INIT (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE, } #define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, } #define DEFINE_STATIC_KEY_TRUE(name) \ struct static_key_true name = STATIC_KEY_TRUE_INIT #define DEFINE_STATIC_KEY_TRUE_RO(name) \ struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT #define DECLARE_STATIC_KEY_TRUE(name) \ extern struct static_key_true name #define DEFINE_STATIC_KEY_FALSE(name) \ struct static_key_false name = STATIC_KEY_FALSE_INIT #define DEFINE_STATIC_KEY_FALSE_RO(name) \ struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT #define DECLARE_STATIC_KEY_FALSE(name) \ extern struct static_key_false name #define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count) \ struct static_key_true name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT, \ } #define DEFINE_STATIC_KEY_ARRAY_FALSE(name, count) \ struct static_key_false name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT, \ } #define _DEFINE_STATIC_KEY_1(name) DEFINE_STATIC_KEY_TRUE(name) #define _DEFINE_STATIC_KEY_0(name) DEFINE_STATIC_KEY_FALSE(name) #define DEFINE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name) #define _DEFINE_STATIC_KEY_RO_1(name) DEFINE_STATIC_KEY_TRUE_RO(name) #define _DEFINE_STATIC_KEY_RO_0(name) DEFINE_STATIC_KEY_FALSE_RO(name) #define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name) #define _DECLARE_STATIC_KEY_1(name) DECLARE_STATIC_KEY_TRUE(name) #define _DECLARE_STATIC_KEY_0(name) DECLARE_STATIC_KEY_FALSE(name) #define DECLARE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name) extern bool ____wrong_branch_error(void); #define static_key_enabled(x) \ ({ \ if (!__builtin_types_compatible_p(typeof(*x), struct static_key) && \ !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\ !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ ____wrong_branch_error(); \ static_key_count((struct static_key *)x) > 0; \ }) #ifdef CONFIG_JUMP_LABEL /* * Combine the right initial value (type) with the right branch order * to generate the desired result. * * * type\branch| likely (1) | unlikely (0) * -----------+-----------------------+------------------ * | | * true (1) | ... | ... * | NOP | JMP L * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * | | * false (0) | ... | ... * | JMP L | NOP * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * * The initial value is encoded in the LSB of static_key::entries, * type: 0 = false, 1 = true. * * The branch type is encoded in the LSB of jump_entry::key, * branch: 0 = unlikely, 1 = likely. * * This gives the following logic table: * * enabled type branch instuction * -----------------------------+----------- * 0 0 0 | NOP * 0 0 1 | JMP * 0 1 0 | NOP * 0 1 1 | JMP * * 1 0 0 | JMP * 1 0 1 | NOP * 1 1 0 | JMP * 1 1 1 | NOP * * Which gives the following functions: * * dynamic: instruction = enabled ^ branch * static: instruction = type ^ branch * * See jump_label_type() / jump_label_init_type(). */ #define static_branch_likely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = !arch_static_branch(&(x)->key, true); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = !arch_static_branch_jump(&(x)->key, true); \ else \ branch = ____wrong_branch_error(); \ likely_notrace(branch); \ }) #define static_branch_unlikely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = arch_static_branch_jump(&(x)->key, false); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = arch_static_branch(&(x)->key, false); \ else \ branch = ____wrong_branch_error(); \ unlikely_notrace(branch); \ }) #else /* !CONFIG_JUMP_LABEL */ #define static_branch_likely(x) likely_notrace(static_key_enabled(&(x)->key)) #define static_branch_unlikely(x) unlikely_notrace(static_key_enabled(&(x)->key)) #endif /* CONFIG_JUMP_LABEL */ #define static_branch_maybe(config, x) \ (IS_ENABLED(config) ? static_branch_likely(x) \ : static_branch_unlikely(x)) /* * Advanced usage; refcount, branch is enabled when: count != 0 */ #define static_branch_inc(x) static_key_slow_inc(&(x)->key) #define static_branch_dec(x) static_key_slow_dec(&(x)->key) #define static_branch_inc_cpuslocked(x) static_key_slow_inc_cpuslocked(&(x)->key) #define static_branch_dec_cpuslocked(x) static_key_slow_dec_cpuslocked(&(x)->key) /* * Normal usage; boolean enable/disable. */ #define static_branch_enable(x) static_key_enable(&(x)->key) #define static_branch_disable(x) static_key_disable(&(x)->key) #define static_branch_enable_cpuslocked(x) static_key_enable_cpuslocked(&(x)->key) #define static_branch_disable_cpuslocked(x) static_key_disable_cpuslocked(&(x)->key) #endif /* __ASSEMBLY__ */ #endif /* _LINUX_JUMP_LABEL_H */
437 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _LINUX_MEMBLOCK_H #define _LINUX_MEMBLOCK_H /* * Logical memory blocks. * * Copyright (C) 2001 Peter Bergner, IBM Corp. */ #include <linux/init.h> #include <linux/mm.h> #include <asm/dma.h> extern unsigned long max_low_pfn; extern unsigned long min_low_pfn; /* * highest page */ extern unsigned long max_pfn; /* * highest possible page */ extern unsigned long long max_possible_pfn; /** * enum memblock_flags - definition of memory region attributes * @MEMBLOCK_NONE: no special request * @MEMBLOCK_HOTPLUG: memory region indicated in the firmware-provided memory * map during early boot as hot(un)pluggable system RAM (e.g., memory range * that might get hotunplugged later). With "movable_node" set on the kernel * commandline, try keeping this memory region hotunpluggable. Does not apply * to memblocks added ("hotplugged") after early boot. * @MEMBLOCK_MIRROR: mirrored region * @MEMBLOCK_NOMAP: don't add to kernel direct mapping and treat as * reserved in the memory map; refer to memblock_mark_nomap() description * for further details * @MEMBLOCK_DRIVER_MANAGED: memory region that is always detected and added * via a driver, and never indicated in the firmware-provided memory map as * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the * kernel resource tree. * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are * not initialized (only for reserved regions). * @MEMBLOCK_RSRV_KERN: memory region that is reserved for kernel use, * either explictitly with memblock_reserve_kern() or via memblock * allocation APIs. All memblock allocations set this flag. * @MEMBLOCK_KHO_SCRATCH: memory region that kexec can pass to the next * kernel in handover mode. During early boot, we do not know about all * memory reservations yet, so we get scratch memory from the previous * kernel that we know is good to use. It is the only memory that * allocations may happen from in this phase. */ enum memblock_flags { MEMBLOCK_NONE = 0x0, /* No special request */ MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ MEMBLOCK_MIRROR = 0x2, /* mirrored region */ MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */ MEMBLOCK_RSRV_NOINIT = 0x10, /* don't initialize struct pages */ MEMBLOCK_RSRV_KERN = 0x20, /* memory reserved for kernel use */ MEMBLOCK_KHO_SCRATCH = 0x40, /* scratch memory for kexec handover */ }; /** * struct memblock_region - represents a memory region * @base: base address of the region * @size: size of the region * @flags: memory region attributes * @nid: NUMA node id */ struct memblock_region { phys_addr_t base; phys_addr_t size; enum memblock_flags flags; #ifdef CONFIG_NUMA int nid; #endif }; /** * struct memblock_type - collection of memory regions of certain type * @cnt: number of regions * @max: size of the allocated array * @total_size: size of all regions * @regions: array of regions * @name: the memory type symbolic name */ struct memblock_type { unsigned long cnt; unsigned long max; phys_addr_t total_size; struct memblock_region *regions; char *name; }; /** * struct memblock - memblock allocator metadata * @bottom_up: is bottom up direction? * @current_limit: physical address of the current allocation limit * @memory: usable memory regions * @reserved: reserved memory regions */ struct memblock { bool bottom_up; /* is bottom up direction? */ phys_addr_t current_limit; struct memblock_type memory; struct memblock_type reserved; }; extern struct memblock memblock; #ifndef CONFIG_ARCH_KEEP_MEMBLOCK #define __init_memblock __meminit #define __initdata_memblock __meminitdata void memblock_discard(void); #else #define __init_memblock #define __initdata_memblock static inline void memblock_discard(void) {} #endif void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid, enum memblock_flags flags); int memblock_add(phys_addr_t base, phys_addr_t size); int memblock_remove(phys_addr_t base, phys_addr_t size); int memblock_phys_free(phys_addr_t base, phys_addr_t size); int __memblock_reserve(phys_addr_t base, phys_addr_t size, int nid, enum memblock_flags flags); static __always_inline int memblock_reserve(phys_addr_t base, phys_addr_t size) { return __memblock_reserve(base, size, NUMA_NO_NODE, 0); } static __always_inline int memblock_reserve_kern(phys_addr_t base, phys_addr_t size) { return __memblock_reserve(base, size, NUMA_NO_NODE, MEMBLOCK_RSRV_KERN); } #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP int memblock_physmem_add(phys_addr_t base, phys_addr_t size); #endif void memblock_trim_memory(phys_addr_t align); unsigned long memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2, phys_addr_t size2); bool memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); bool memblock_validate_numa_coverage(unsigned long threshold_bytes); int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size); int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size); int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size); void memblock_free(void *ptr, size_t size); void reset_all_zones_managed_pages(void); /* Low level functions */ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); void memblock_free_late(phys_addr_t base, phys_addr_t size); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, phys_addr_t *out_start, phys_addr_t *out_end) { extern struct memblock_type physmem; __next_mem_range(idx, NUMA_NO_NODE, MEMBLOCK_NONE, &physmem, type, out_start, out_end, NULL); } /** * for_each_physmem_range - iterate through physmem areas not included in type. * @i: u64 used as loop variable * @type: ptr to memblock_type which excludes from the iteration, can be %NULL * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL */ #define for_each_physmem_range(i, type, p_start, p_end) \ for (i = 0, __next_physmem_range(&i, type, p_start, p_end); \ i != (u64)ULLONG_MAX; \ __next_physmem_range(&i, type, p_start, p_end)) #endif /* CONFIG_HAVE_MEMBLOCK_PHYS_MAP */ /** * __for_each_mem_range - iterate through memblock areas from type_a and not * included in type_b. Or just type_a if type_b is NULL. * @i: u64 used as loop variable * @type_a: ptr to memblock_type to iterate * @type_b: ptr to memblock_type which excludes from the iteration * @nid: node selector, %NUMA_NO_NODE for all nodes * @flags: pick from blocks based on memory attributes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL */ #define __for_each_mem_range(i, type_a, type_b, nid, flags, \ p_start, p_end, p_nid) \ for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid); \ i != (u64)ULLONG_MAX; \ __next_mem_range(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) /** * __for_each_mem_range_rev - reverse iterate through memblock areas from * type_a and not included in type_b. Or just type_a if type_b is NULL. * @i: u64 used as loop variable * @type_a: ptr to memblock_type to iterate * @type_b: ptr to memblock_type which excludes from the iteration * @nid: node selector, %NUMA_NO_NODE for all nodes * @flags: pick from blocks based on memory attributes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL */ #define __for_each_mem_range_rev(i, type_a, type_b, nid, flags, \ p_start, p_end, p_nid) \ for (i = (u64)ULLONG_MAX, \ __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid); \ i != (u64)ULLONG_MAX; \ __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ p_start, p_end, p_nid)) /** * for_each_mem_range - iterate through memory areas. * @i: u64 used as loop variable * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL */ #define for_each_mem_range(i, p_start, p_end) \ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, \ MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED, \ p_start, p_end, NULL) /** * for_each_mem_range_rev - reverse iterate through memblock areas from * type_a and not included in type_b. Or just type_a if type_b is NULL. * @i: u64 used as loop variable * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL */ #define for_each_mem_range_rev(i, p_start, p_end) \ __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \ MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED,\ p_start, p_end, NULL) /** * for_each_reserved_mem_range - iterate over all reserved memblock areas * @i: u64 used as loop variable * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * * Walks over reserved areas of memblock. Available as soon as memblock * is initialized. */ #define for_each_reserved_mem_range(i, p_start, p_end) \ __for_each_mem_range(i, &memblock.reserved, NULL, NUMA_NO_NODE, \ MEMBLOCK_NONE, p_start, p_end, NULL) static inline bool memblock_is_hotpluggable(struct memblock_region *m) { return m->flags & MEMBLOCK_HOTPLUG; } static inline bool memblock_is_mirror(struct memblock_region *m) { return m->flags & MEMBLOCK_MIRROR; } static inline bool memblock_is_nomap(struct memblock_region *m) { return m->flags & MEMBLOCK_NOMAP; } static inline bool memblock_is_reserved_noinit(struct memblock_region *m) { return m->flags & MEMBLOCK_RSRV_NOINIT; } static inline bool memblock_is_driver_managed(struct memblock_region *m) { return m->flags & MEMBLOCK_DRIVER_MANAGED; } static inline bool memblock_is_kho_scratch(struct memblock_region *m) { return m->flags & MEMBLOCK_KHO_SCRATCH; } int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); /** * for_each_mem_pfn_range - early memory pfn range iterator * @i: an integer used as loop variable * @nid: node selector, %MAX_NUMNODES for all nodes * @p_start: ptr to ulong for start pfn of the range, can be %NULL * @p_end: ptr to ulong for end pfn of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL * * Walks over configured memory ranges. */ #define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \ for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, unsigned long *out_spfn, unsigned long *out_epfn); /** * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific * free memblock areas from a given point * @i: u64 used as loop variable * @zone: zone in which all of the memory blocks reside * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * * Walks over free (memory && !reserved) areas of memblock in a specific * zone, continuing from current position. Available as soon as memblock is * initialized. */ #define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \ for (; i != U64_MAX; \ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** * for_each_free_mem_range - iterate through free memblock areas * @i: u64 used as loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes * @flags: pick from blocks based on memory attributes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL * * Walks over free (memory && !reserved) areas of memblock. Available as * soon as memblock is initialized. */ #define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \ __for_each_mem_range(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) /** * for_each_free_mem_range_reverse - rev-iterate through free memblock areas * @i: u64 used as loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes * @flags: pick from blocks based on memory attributes * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL * @p_nid: ptr to int for nid of the range, can be %NULL * * Walks over free (memory && !reserved) areas of memblock in reverse * order. Available as soon as memblock is initialized. */ #define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \ p_nid) \ __for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) int memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid); #ifdef CONFIG_NUMA static inline void memblock_set_region_node(struct memblock_region *r, int nid) { r->nid = nid; } static inline int memblock_get_region_node(const struct memblock_region *r) { return r->nid; } #else static inline void memblock_set_region_node(struct memblock_region *r, int nid) { } static inline int memblock_get_region_node(const struct memblock_region *r) { return 0; } #endif /* CONFIG_NUMA */ /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) #define MEMBLOCK_ALLOC_ACCESSIBLE 0 /* * MEMBLOCK_ALLOC_NOLEAKTRACE avoids kmemleak tracing. It implies * MEMBLOCK_ALLOC_ACCESSIBLE */ #define MEMBLOCK_ALLOC_NOLEAKTRACE 1 /* We are using top down, so it is safe to use 0 here */ #define MEMBLOCK_LOW_LIMIT 0 #ifndef ARCH_LOW_ADDRESS_LIMIT #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end); phys_addr_t memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, int nid, bool exact_nid); phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid); static __always_inline phys_addr_t memblock_phys_alloc(phys_addr_t size, phys_addr_t align) { return memblock_phys_alloc_range(size, align, 0, MEMBLOCK_ALLOC_ACCESSIBLE); } void *memblock_alloc_exact_nid_raw(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align) { return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT, MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } void *__memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, const char *func); #define memblock_alloc_or_panic(size, align) \ __memblock_alloc_or_panic(size, align, __func__) static inline void *memblock_alloc_raw(phys_addr_t size, phys_addr_t align) { return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT, MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } static inline void *memblock_alloc_from(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr) { return memblock_alloc_try_nid(size, align, min_addr, MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } static inline void *memblock_alloc_low(phys_addr_t size, phys_addr_t align) { return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT, ARCH_LOW_ADDRESS_LIMIT, NUMA_NO_NODE); } static inline void *memblock_alloc_node(phys_addr_t size, phys_addr_t align, int nid) { return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT, MEMBLOCK_ALLOC_ACCESSIBLE, nid); } /* * Set the allocation direction to bottom-up or top-down. */ static inline __init_memblock void memblock_set_bottom_up(bool enable) { memblock.bottom_up = enable; } /* * Check if the allocation direction is bottom-up or not. * if this is true, that said, memblock will allocate memory * in bottom-up direction. */ static inline __init_memblock bool memblock_bottom_up(void) { return memblock.bottom_up; } phys_addr_t memblock_phys_mem_size(void); phys_addr_t memblock_reserved_size(void); phys_addr_t memblock_reserved_kern_size(phys_addr_t limit, int nid); unsigned long memblock_estimated_nr_free_pages(void); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); void memblock_cap_memory_range(phys_addr_t base, phys_addr_t size); void memblock_mem_limit_remove_map(phys_addr_t limit); bool memblock_is_memory(phys_addr_t addr); bool memblock_is_map_memory(phys_addr_t addr); bool memblock_is_region_memory(phys_addr_t base, phys_addr_t size); bool memblock_is_reserved(phys_addr_t addr); bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size); void memblock_dump_all(void); /** * memblock_set_current_limit - Set the current allocation limit to allow * limiting allocations to what is currently * accessible during boot * @limit: New limit value (physical address) */ void memblock_set_current_limit(phys_addr_t limit); phys_addr_t memblock_get_current_limit(void); /* * pfn conversion functions * * While the memory MEMBLOCKs should always be page aligned, the reserved * MEMBLOCKs may not be. This accessor attempt to provide a very clear * idea of what they return for such non aligned MEMBLOCKs. */ /** * memblock_region_memory_base_pfn - get the lowest pfn of the memory region * @reg: memblock_region structure * * Return: the lowest pfn intersecting with the memory region */ static inline unsigned long memblock_region_memory_base_pfn(const struct memblock_region *reg) { return PFN_UP(reg->base); } /** * memblock_region_memory_end_pfn - get the end pfn of the memory region * @reg: memblock_region structure * * Return: the end_pfn of the reserved region */ static inline unsigned long memblock_region_memory_end_pfn(const struct memblock_region *reg) { return PFN_DOWN(reg->base + reg->size); } /** * memblock_region_reserved_base_pfn - get the lowest pfn of the reserved region * @reg: memblock_region structure * * Return: the lowest pfn intersecting with the reserved region */ static inline unsigned long memblock_region_reserved_base_pfn(const struct memblock_region *reg) { return PFN_DOWN(reg->base); } /** * memblock_region_reserved_end_pfn - get the end pfn of the reserved region * @reg: memblock_region structure * * Return: the end_pfn of the reserved region */ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblock_region *reg) { return PFN_UP(reg->base + reg->size); } /** * for_each_mem_region - iterate over memory regions * @region: loop variable */ #define for_each_mem_region(region) \ for (region = memblock.memory.regions; \ region < (memblock.memory.regions + memblock.memory.cnt); \ region++) /** * for_each_reserved_mem_region - itereate over reserved memory regions * @region: loop variable */ #define for_each_reserved_mem_region(region) \ for (region = memblock.reserved.regions; \ region < (memblock.reserved.regions + memblock.reserved.cnt); \ region++) extern void *alloc_large_system_hash(const char *tablename, unsigned long bucketsize, unsigned long numentries, int scale, int flags, unsigned int *_hash_shift, unsigned int *_hash_mask, unsigned long low_limit, unsigned long high_limit); #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ #define HASH_ZERO 0x00000002 /* Zero allocated hash table */ /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. */ #ifdef CONFIG_NUMA #define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT) extern int hashdist; /* Distribute hashes across NUMA nodes? */ #else #define hashdist (0) #endif #ifdef CONFIG_MEMTEST void early_memtest(phys_addr_t start, phys_addr_t end); void memtest_report_meminfo(struct seq_file *m); #else static inline void early_memtest(phys_addr_t start, phys_addr_t end) { } static inline void memtest_report_meminfo(struct seq_file *m) { } #endif #ifdef CONFIG_MEMBLOCK_KHO_SCRATCH void memblock_set_kho_scratch_only(void); void memblock_clear_kho_scratch_only(void); void memmap_init_kho_scratch_pages(void); #else static inline void memblock_set_kho_scratch_only(void) { } static inline void memblock_clear_kho_scratch_only(void) { } static inline void memmap_init_kho_scratch_pages(void) {} #endif #endif /* _LINUX_MEMBLOCK_H */
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io */ /* Devmaps primary use is as a backend map for XDP BPF helper call * bpf_redirect_map(). Because XDP is mostly concerned with performance we * spent some effort to ensure the datapath with redirect maps does not use * any locking. This is a quick note on the details. * * We have three possible paths to get into the devmap control plane bpf * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall * will invoke an update, delete, or lookup operation. To ensure updates and * deletes appear atomic from the datapath side xchg() is used to modify the * netdev_map array. Then because the datapath does a lookup into the netdev_map * array (read-only) from an RCU critical section we use call_rcu() to wait for * an rcu grace period before free'ing the old data structures. This ensures the * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. As noted above the xchg() operation also keep the * netdev_map consistent in this case. From the devmap side BPF programs * calling into these operations are the same as multiple user space threads * making system calls. * * Finally, any of the above may race with a netdev_unregister notifier. The * unregister notifier must search for net devices in the map structure that * contain a reference to the net device and remove them. This is a two step * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) * check to see if the ifindex is the same as the net_device being removed. * When removing the dev a cmpxchg() is used to ensure the correct dev is * removed, in the case of a concurrent update or delete operation it is * possible that the initially referenced dev is no longer in the map. As the * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. * * The devmap_hash type is a map type which interprets keys as ifindexes and * indexes these using a hashmap. This allows maps that use ifindex as key to be * densely packed instead of having holes in the lookup array for unused * ifindexes. The setup and packet enqueue/send code is shared between the two * types of devmap; only the lookup and insertion is different. */ #include <linux/bpf.h> #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; struct net_device *dev; struct net_device *dev_rx; struct bpf_prog *xdp_prog; unsigned int count; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; }; struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */ struct list_head list; /* these are only used for DEVMAP_HASH type maps */ struct hlist_head *dev_index_head; spinlock_t index_lock; unsigned int items; u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); static struct hlist_head *dev_map_create_hash(unsigned int entries, int numa_node) { int i; struct hlist_head *hash; hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); return hash; } static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, int idx) { return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; } static int dev_map_alloc_check(union bpf_attr *attr) { u32 valsize = attr->value_size; /* check sanity of attributes. 2 value sizes supported: * 4 bytes: ifindex * 8 bytes: ifindex + prog fd */ if (attr->max_entries == 0 || attr->key_size != 4 || (valsize != offsetofend(struct bpf_devmap_val, ifindex) && valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Hash table size must be power of 2; roundup_pow_of_two() * can overflow into UB on 32-bit arches */ if (attr->max_entries > 1UL << 31) return -EINVAL; } return 0; } static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ attr->map_flags |= BPF_F_RDONLY_PROG; bpf_map_init_from_attr(&dtab->map, attr); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Hash table size must be power of 2 */ dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) return -ENOMEM; spin_lock_init(&dtab->index_lock); } else { dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) return -ENOMEM; } return 0; } static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; int err; dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); err = dev_map_init_map(dtab, attr); if (err) { bpf_map_area_free(dtab); return ERR_PTR(err); } spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; } static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were * disconnected from events. The following synchronize_rcu() guarantees * both rcu read critical sections complete and waits for * preempt-disable regions (NAPI being the relevant context here) so we * are certain there will be no further reads against the netdev_map and * all flush operations are complete. Flush operations can only be done * from NAPI context for this reason. */ spin_lock(&dev_map_lock); list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map() * during NAPI callback and cleared after the XDP redirect. There is no * explicit RCU read section which protects bpf_redirect_info->map but * local_bh_disable() also marks the beginning an RCU section. This * makes the complete softirq callback RCU protected. Thus after * following synchronize_rcu() there no bpf_redirect_info->map == map * assignment. */ synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ rcu_barrier(); if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; struct hlist_head *head; struct hlist_node *next; head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } } bpf_map_area_free(dtab->dev_index_head); } else { for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; dev = rcu_dereference_raw(dtab->netdev_map[i]); if (!dev) continue; if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } bpf_map_area_free(dtab->netdev_map); } bpf_map_area_free(dtab); } static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = next_key; if (index >= dtab->map.max_entries) { *next = 0; return 0; } if (index == dtab->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); struct bpf_dtab_netdev *dev; hlist_for_each_entry_rcu(dev, head, index_hlist, lockdep_is_held(&dtab->index_lock)) if (dev->idx == key) return dev; return NULL; } static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 idx, *next = next_key; struct bpf_dtab_netdev *dev, *next_dev; struct hlist_head *head; int i = 0; if (!key) goto find_first; idx = *(u32 *)key; dev = __dev_map_hash_lookup_elem(map, idx); if (!dev) goto find_first; next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), struct bpf_dtab_netdev, index_hlist); if (next_dev) { *next = next_dev->idx; return 0; } i = idx & (dtab->n_buckets - 1); i++; find_first: for (; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), struct bpf_dtab_netdev, index_hlist); if (next_dev) { *next = next_dev->idx; return 0; } } return -ENOENT; } static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, struct net_device *tx_dev, struct net_device *rx_dev) { struct xdp_txq_info txq = { .dev = tx_dev }; struct xdp_rxq_info rxq = { .dev = rx_dev }; struct xdp_buff xdp; int i, nframes = 0; for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; u32 act; int err; xdp_convert_frame_to_buff(xdpf, &xdp); xdp.txq = &txq; xdp.rxq = &rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { case XDP_PASS: err = xdp_update_frame_from_buff(&xdp, xdpf); if (unlikely(err < 0)) xdp_return_frame_rx_napi(xdpf); else frames[nframes++] = xdpf; break; default: bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tx_dev, xdp_prog, act); fallthrough; case XDP_DROP: xdp_return_frame_rx_napi(xdpf); break; } } return nframes; /* sent frames count */ } static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; unsigned int cnt = bq->count; int sent = 0, err = 0; int to_send = cnt; int i; if (unlikely(!cnt)) return; for (i = 0; i < cnt; i++) { struct xdp_frame *xdpf = bq->q[i]; prefetch(xdpf); } if (bq->xdp_prog) { to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx); if (!to_send) goto out; } sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags); if (sent < 0) { /* If ndo_xdp_xmit fails with an errno, no frames have * been xmit'ed. */ err = sent; sent = 0; } /* If not all frames have been transmitted, it is our * responsibility to free them */ for (i = sent; unlikely(i < to_send); i++) xdp_return_frame_rx_napi(bq->q[i]); out: bq->count = 0; trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err); } /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the * driver before returning from its napi->poll() routine. See the comment above * xdp_do_flush() in filter.c. */ void __dev_flush(struct list_head *flush_list) { struct xdp_dev_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { bq_xmit_all(bq, XDP_XMIT_FLUSH); bq->dev_rx = NULL; bq->xdp_prog = NULL; __list_del_clearprev(&bq->flush_node); } } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; if (key >= map->max_entries) return NULL; obj = rcu_dereference_check(dtab->netdev_map[key], rcu_read_lock_bh_held()); return obj; } /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu * variable access, and map elements stick around. See comment above * xdp_do_flush() in filter.c. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed * from net_device drivers NAPI func end. * * Do the same with xdp_prog and flush_list since these fields * are only ever modified together. */ if (!bq->dev_rx) { struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list(); bq->dev_rx = dev_rx; bq->xdp_prog = xdp_prog; list_add(&bq->flush_node, flush_list); } bq->q[bq->count++] = xdpf; } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { int err; if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) return -EOPNOTSUPP; if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && xdp_frame_has_frags(xdpf))) return -EOPNOTSUPP; err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf)); if (unlikely(err)) return err; bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst) { struct xdp_txq_info txq = { .dev = dst->dev }; struct xdp_buff xdp; u32 act; if (!dst->xdp_prog) return XDP_PASS; __skb_pull(skb, skb->mac_len); xdp.txq = &txq; act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog); switch (act) { case XDP_PASS: __skb_push(skb, skb->mac_len); break; default: bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dst->dev, dst->xdp_prog, act); fallthrough; case XDP_DROP: kfree_skb(skb); break; } return act; } int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return __xdp_enqueue(dev, xdpf, dev_rx, NULL); } int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct net_device *dev = dst->dev; return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog); } static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) { if (!obj) return false; if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) return false; if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && xdp_frame_has_frags(xdpf))) return false; if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf))) return false; return true; } static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, struct net_device *dev_rx, struct xdp_frame *xdpf) { struct xdp_frame *nxdpf; nxdpf = xdpf_clone(xdpf); if (!nxdpf) return -ENOMEM; bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog); return 0; } static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex) { while (num_excluded--) { if (ifindex == excluded[num_excluded]) return true; } return false; } /* Get ifindex of each upper device. 'indexes' must be able to hold at * least MAX_NEST_DEV elements. * Returns the number of ifindexes added. */ static int get_upper_ifindexes(struct net_device *dev, int *indexes) { struct net_device *upper; struct list_head *iter; int n = 0; netdev_for_each_upper_dev_rcu(dev, upper, iter) { indexes[n++] = upper->ifindex; } return n; } int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { num_excluded = get_upper_ifindexes(dev_rx, excluded_devices); excluded_devices[num_excluded++] = dev_rx->ifindex; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); if (err) return err; last_dst = dst; } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, lockdep_is_held(&dtab->index_lock)) { if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); if (err) return err; last_dst = dst; } } } /* consume the last copy of the frame */ if (last_dst) bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog); else xdp_return_frame_rx_napi(xdpf); /* dtab is empty */ return 0; } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog) { int err; err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; /* Redirect has already succeeded semantically at this point, so we just * return 0 even if packet is dropped. Helper below takes care of * freeing skb. */ if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS) return 0; skb->dev = dst->dev; generic_xdp_tx(skb, xdp_prog); return 0; } static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct sk_buff *nskb; int err; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; err = dev_map_generic_redirect(dst, nskb, xdp_prog); if (unlikely(err)) { consume_skb(nskb); return err; } return 0; } int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, const struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; struct hlist_node *next; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { num_excluded = get_upper_ifindexes(dev, excluded_devices); excluded_devices[num_excluded++] = dev->ifindex; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); if (!dst) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_redirect_clone(last_dst, skb, xdp_prog); if (err) return err; last_dst = dst; } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dst, next, head, index_hlist) { if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_redirect_clone(last_dst, skb, xdp_prog); if (err) return err; last_dst = dst; } } } /* consume the first skb and return */ if (last_dst) return dev_map_generic_redirect(last_dst, skb, xdp_prog); /* dtab is empty */ consume_skb(skb); return 0; } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); return obj ? &obj->val : NULL; } static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, *(u32 *)key); return obj ? &obj->val : NULL; } static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } static long dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; u32 k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL)); if (old_dev) { call_rcu(&old_dev->rcu, __dev_map_entry_free); atomic_dec((atomic_t *)&dtab->items); } return 0; } static long dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; u32 k = *(u32 *)key; unsigned long flags; int ret = -ENOENT; spin_lock_irqsave(&dtab->index_lock, flags); old_dev = __dev_map_hash_lookup_elem(map, k); if (old_dev) { dtab->items--; hlist_del_init_rcu(&old_dev->index_hlist); call_rcu(&old_dev->rcu, __dev_map_entry_free); ret = 0; } spin_unlock_irqrestore(&dtab->index_lock, flags); return ret; } static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, struct bpf_devmap_val *val, unsigned int idx) { struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), GFP_NOWAIT | __GFP_NOWARN, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); dev->dev = dev_get_by_index(net, val->ifindex); if (!dev->dev) goto err_out; if (val->bpf_prog.fd > 0) { prog = bpf_prog_get_type_dev(val->bpf_prog.fd, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) goto err_put_dev; if (prog->expected_attach_type != BPF_XDP_DEVMAP || !bpf_prog_map_compatible(&dtab->map, prog)) goto err_put_prog; } dev->idx = idx; if (prog) { dev->xdp_prog = prog; dev->val.bpf_prog.id = prog->aux->id; } else { dev->xdp_prog = NULL; dev->val.bpf_prog.id = 0; } dev->val.ifindex = val->ifindex; return dev; err_put_prog: bpf_prog_put(prog); err_put_dev: dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); } static long __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; if (unlikely(i >= dtab->map.max_entries)) return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; /* already verified value_size <= sizeof val */ memcpy(&val, value, map->value_size); if (!val.ifindex) { dev = NULL; /* can not specify fd if ifindex is 0 */ if (val.bpf_prog.fd > 0) return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } /* Use call_rcu() here to ensure rcu critical sections have completed * Remembering the driver side flush operation will happen before the * net device is removed. */ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev))); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); else atomic_inc((atomic_t *)&dtab->items); return 0; } static long dev_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __dev_map_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; /* already verified value_size <= sizeof val */ memcpy(&val, value, map->value_size); if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); old_dev = __dev_map_hash_lookup_elem(map, idx); if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; dev = __dev_map_alloc_node(net, dtab, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; } if (old_dev) { hlist_del_rcu(&old_dev->index_hlist); } else { if (dtab->items >= dtab->map.max_entries) { spin_unlock_irqrestore(&dtab->index_lock, flags); call_rcu(&dev->rcu, __dev_map_entry_free); return -E2BIG; } dtab->items++; } hlist_add_head_rcu(&dev->index_hlist, dev_map_index_hash(dtab, idx)); spin_unlock_irqrestore(&dtab->index_lock, flags); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; out_err: spin_unlock_irqrestore(&dtab->index_lock, flags); return err; } static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __dev_map_hash_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_hash_lookup_elem); } static u64 dev_map_mem_usage(const struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u64 usage = sizeof(struct bpf_dtab); if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) usage += (u64)dtab->n_buckets * sizeof(struct hlist_head); else usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *); usage += atomic_read((atomic_t *)&dtab->items) * (u64)sizeof(struct bpf_dtab_netdev); return usage; } BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = dev_map_alloc_check, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_get_next_key, .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_map_redirect, }; const struct bpf_map_ops dev_map_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = dev_map_alloc_check, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_hash_get_next_key, .map_lookup_elem = dev_map_hash_lookup_elem, .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, struct net_device *netdev) { unsigned long flags; u32 i; spin_lock_irqsave(&dtab->index_lock, flags); for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; struct hlist_head *head; struct hlist_node *next; head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dev, next, head, index_hlist) { if (netdev != dev->dev) continue; dtab->items--; hlist_del_rcu(&dev->index_hlist); call_rcu(&dev->rcu, __dev_map_entry_free); } } spin_unlock_irqrestore(&dtab->index_lock, flags); } static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct bpf_dtab *dtab; int i, cpu; switch (event) { case NETDEV_REGISTER: if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq) break; /* will be freed in free_netdev() */ netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); if (!netdev->xdp_bulkq) return NOTIFY_BAD; for_each_possible_cpu(cpu) per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because * dev_map_list is an RCU list AND to ensure a delete * operation does not free a netdev_map entry while we * are comparing it against the netdev being unregistered. */ rcu_read_lock(); list_for_each_entry_rcu(dtab, &dev_map_list, list) { if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dev_map_hash_remove_netdev(dtab, netdev); continue; } for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; dev = rcu_dereference(dtab->netdev_map[i]); if (!dev || netdev != dev->dev) continue; odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL)); if (dev == odev) { call_rcu(&dev->rcu, __dev_map_entry_free); atomic_dec((atomic_t *)&dtab->items); } } } rcu_read_unlock(); break; default: break; } return NOTIFY_OK; } static struct notifier_block dev_map_notifier = { .notifier_call = dev_map_notification, }; static int __init dev_map_init(void) { /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); return 0; } subsys_initcall(dev_map_init);
26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/setup.c * * Copyright (C) 1995-2001 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/acpi.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/ioport.h> #include <linux/delay.h> #include <linux/initrd.h> #include <linux/console.h> #include <linux/cache.h> #include <linux/screen_info.h> #include <linux/init.h> #include <linux/kexec.h> #include <linux/root_dev.h> #include <linux/cpu.h> #include <linux/interrupt.h> #include <linux/smp.h> #include <linux/fs.h> #include <linux/panic_notifier.h> #include <linux/proc_fs.h> #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/efi.h> #include <linux/psci.h> #include <linux/sched/task.h> #include <linux/scs.h> #include <linux/mm.h> #include <asm/acpi.h> #include <asm/fixmap.h> #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/daifflags.h> #include <asm/elf.h> #include <asm/cpufeature.h> #include <asm/cpu_ops.h> #include <asm/kasan.h> #include <asm/numa.h> #include <asm/rsi.h> #include <asm/scs.h> #include <asm/sections.h> #include <asm/setup.h> #include <asm/smp_plat.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/traps.h> #include <asm/efi.h> #include <asm/xen/hypervisor.h> #include <asm/mmu_context.h> static int num_standard_resources; static struct resource *standard_resources; phys_addr_t __fdt_pointer __initdata; u64 mmu_enabled_at_boot __initdata; /* * Standard memory resources */ static struct resource mem_res[] = { { .name = "Kernel code", .start = 0, .end = 0, .flags = IORESOURCE_SYSTEM_RAM }, { .name = "Kernel data", .start = 0, .end = 0, .flags = IORESOURCE_SYSTEM_RAM } }; #define kernel_code mem_res[0] #define kernel_data mem_res[1] /* * The recorded values of x0 .. x3 upon kernel entry. */ u64 __cacheline_aligned boot_args[4]; void __init smp_setup_processor_id(void) { u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; set_cpu_logical_map(0, mpidr); pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n", (unsigned long)mpidr, read_cpuid_id()); } bool arch_match_cpu_phys_id(int cpu, u64 phys_id) { return phys_id == cpu_logical_map(cpu); } struct mpidr_hash mpidr_hash; /** * smp_build_mpidr_hash - Pre-compute shifts required at each affinity * level in order to build a linear index from an * MPIDR value. Resulting algorithm is a collision * free hash carried out through shifting and ORing */ static void __init smp_build_mpidr_hash(void) { u32 i, affinity, fs[4], bits[4], ls; u64 mask = 0; /* * Pre-scan the list of MPIDRS and filter out bits that do * not contribute to affinity levels, ie they never toggle. */ for_each_possible_cpu(i) mask |= (cpu_logical_map(i) ^ cpu_logical_map(0)); pr_debug("mask of set bits %#llx\n", mask); /* * Find and stash the last and first bit set at all affinity levels to * check how many bits are required to represent them. */ for (i = 0; i < 4; i++) { affinity = MPIDR_AFFINITY_LEVEL(mask, i); /* * Find the MSB bit and LSB bits position * to determine how many bits are required * to express the affinity level. */ ls = fls(affinity); fs[i] = affinity ? ffs(affinity) - 1 : 0; bits[i] = ls - fs[i]; } /* * An index can be created from the MPIDR_EL1 by isolating the * significant bits at each affinity level and by shifting * them in order to compress the 32 bits values space to a * compressed set of values. This is equivalent to hashing * the MPIDR_EL1 through shifting and ORing. It is a collision free * hash though not minimal since some levels might contain a number * of CPUs that is not an exact power of 2 and their bit * representation might contain holes, eg MPIDR_EL1[7:0] = {0x2, 0x80}. */ mpidr_hash.shift_aff[0] = MPIDR_LEVEL_SHIFT(0) + fs[0]; mpidr_hash.shift_aff[1] = MPIDR_LEVEL_SHIFT(1) + fs[1] - bits[0]; mpidr_hash.shift_aff[2] = MPIDR_LEVEL_SHIFT(2) + fs[2] - (bits[1] + bits[0]); mpidr_hash.shift_aff[3] = MPIDR_LEVEL_SHIFT(3) + fs[3] - (bits[2] + bits[1] + bits[0]); mpidr_hash.mask = mask; mpidr_hash.bits = bits[3] + bits[2] + bits[1] + bits[0]; pr_debug("MPIDR hash: aff0[%u] aff1[%u] aff2[%u] aff3[%u] mask[%#llx] bits[%u]\n", mpidr_hash.shift_aff[0], mpidr_hash.shift_aff[1], mpidr_hash.shift_aff[2], mpidr_hash.shift_aff[3], mpidr_hash.mask, mpidr_hash.bits); /* * 4x is an arbitrary value used to warn on a hash table much bigger * than expected on most systems. */ if (mpidr_hash_size() > 4 * num_possible_cpus()) pr_warn("Large number of MPIDR hash buckets detected\n"); } static void __init setup_machine_fdt(phys_addr_t dt_phys) { int size = 0; void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); const char *name; if (dt_virt) memblock_reserve(dt_phys, size); /* * dt_virt is a fixmap address, hence __pa(dt_virt) can't be used. * Pass dt_phys directly. */ if (!early_init_dt_scan(dt_virt, dt_phys)) { pr_crit("\n" "Error: invalid device tree blob: PA=%pa, VA=%px, size=%d bytes\n" "The dtb must be 8-byte aligned and must not exceed 2 MB in size.\n" "\nPlease check your bootloader.\n", &dt_phys, dt_virt, size); /* * Note that in this _really_ early stage we cannot even BUG() * or oops, so the least terrible thing to do is cpu_relax(), * or else we could end-up printing non-initialized data, etc. */ while (true) cpu_relax(); } /* Early fixups are done, map the FDT as read-only now */ fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO); name = of_flat_dt_get_machine_name(); if (!name) return; pr_info("Machine model: %s\n", name); dump_stack_set_arch_desc("%s (DT)", name); } static void __init request_standard_resources(void) { struct memblock_region *region; struct resource *res; unsigned long i = 0; size_t res_size; kernel_code.start = __pa_symbol(_stext); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); insert_resource(&iomem_resource, &kernel_code); insert_resource(&iomem_resource, &kernel_data); num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); for_each_mem_region(region) { res = &standard_resources[i++]; if (memblock_is_nomap(region)) { res->name = "reserved"; res->flags = IORESOURCE_MEM; res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region)); res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1; } else { res->name = "System RAM"; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; } insert_resource(&iomem_resource, res); } } static int __init reserve_memblock_reserved_regions(void) { u64 i, j; for (i = 0; i < num_standard_resources; ++i) { struct resource *mem = &standard_resources[i]; phys_addr_t r_start, r_end, mem_size = resource_size(mem); if (!memblock_is_region_reserved(mem->start, mem_size)) continue; for_each_reserved_mem_range(j, &r_start, &r_end) { resource_size_t start, end; start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start); end = min(PFN_PHYS(PFN_UP(r_end)) - 1, mem->end); if (start > mem->end || end < mem->start) continue; reserve_region_with_split(mem, start, end, "reserved"); } } return 0; } arch_initcall(reserve_memblock_reserved_regions); u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; u64 cpu_logical_map(unsigned int cpu) { return __cpu_logical_map[cpu]; } void __init __no_sanitize_address setup_arch(char **cmdline_p) { setup_initial_init_mm(_stext, _etext, _edata, _end); *cmdline_p = boot_command_line; kaslr_init(); early_fixmap_init(); early_ioremap_init(); setup_machine_fdt(__fdt_pointer); /* * Initialise the static keys early as they may be enabled by the * cpufeature code and early parameters. */ jump_label_init(); parse_early_param(); dynamic_scs_init(); /* * The primary CPU enters the kernel with all DAIF exceptions masked. * * We must unmask Debug and SError before preemption or scheduling is * possible to ensure that these are consistently unmasked across * threads, and we want to unmask SError as soon as possible after * initializing earlycon so that we can report any SErrors immediately. * * IRQ and FIQ will be unmasked after the root irqchip has been * detected and initialized. */ local_daif_restore(DAIF_PROCCTX_NOIRQ); /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. */ cpu_uninstall_idmap(); xen_early_init(); efi_init(); if (!efi_enabled(EFI_BOOT)) { if ((u64)_text % MIN_KIMG_ALIGN) pr_warn(FW_BUG "Kernel image misaligned at boot, please fix your bootloader!"); WARN_TAINT(mmu_enabled_at_boot, TAINT_FIRMWARE_WORKAROUND, FW_BUG "Booted with MMU enabled!"); } arm64_memblock_init(); paging_init(); acpi_table_upgrade(); /* Parse the ACPI tables for possible boot-time configuration */ acpi_boot_table_init(); if (acpi_disabled) unflatten_device_tree(); bootmem_init(); kasan_init(); request_standard_resources(); early_ioremap_reset(); if (acpi_disabled) psci_dt_init(); else psci_acpi_init(); arm64_rsi_init(); init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Make sure init_thread_info.ttbr0 always generates translation * faults in case uaccess_enable() is inadvertently called by the init * thread. */ init_task.thread_info.ttbr0 = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); #endif if (boot_args[1] || boot_args[2] || boot_args[3]) { pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n" "\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n" "This indicates a broken bootloader or old kernel\n", boot_args[1], boot_args[2], boot_args[3]); } } static inline bool cpu_can_disable(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU const struct cpu_operations *ops = get_cpu_ops(cpu); if (ops && ops->cpu_can_disable) return ops->cpu_can_disable(cpu); #endif return false; } bool arch_cpu_is_hotpluggable(int num) { return cpu_can_disable(num); } static void dump_kernel_offset(void) { const unsigned long offset = kaslr_offset(); if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && offset > 0) { pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n", offset, KIMAGE_VADDR); pr_emerg("PHYS_OFFSET: 0x%llx\n", PHYS_OFFSET); } else { pr_emerg("Kernel Offset: disabled\n"); } } static int arm64_panic_block_dump(struct notifier_block *self, unsigned long v, void *p) { dump_kernel_offset(); dump_cpu_features(); dump_mem_limit(); return 0; } static struct notifier_block arm64_panic_block = { .notifier_call = arm64_panic_block_dump }; static int __init register_arm64_panic_block(void) { atomic_notifier_chain_register(&panic_notifier_list, &arm64_panic_block); return 0; } device_initcall(register_arm64_panic_block); static int __init check_mmu_enabled_at_boot(void) { if (!efi_enabled(EFI_BOOT) && mmu_enabled_at_boot) panic("Non-EFI boot detected with MMU and caches enabled"); return 0; } device_initcall_sync(check_mmu_enabled_at_boot);
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/if.h> #include <linux/if_vlan.h> #include <net/udp_tunnel.h> #include <net/sch_generic.h> #include <linux/netfilter.h> #include <rdma/ib_addr.h> #include "rxe.h" #include "rxe_net.h" #include "rxe_loc.h" static struct rxe_recv_sockets recv_sockets; static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, struct net_device *ndev, struct in_addr *saddr, struct in_addr *daddr) { struct rtable *rt; struct flowi4 fl = { { 0 } }; memset(&fl, 0, sizeof(fl)); fl.flowi4_oif = ndev->ifindex; memcpy(&fl.saddr, saddr, sizeof(*saddr)); memcpy(&fl.daddr, daddr, sizeof(*daddr)); fl.flowi4_proto = IPPROTO_UDP; rt = ip_route_output_key(&init_net, &fl); if (IS_ERR(rt)) { rxe_dbg_qp(qp, "no route to %pI4\n", &daddr->s_addr); return NULL; } return &rt->dst; } #if IS_ENABLED(CONFIG_IPV6) static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, struct net_device *ndev, struct in6_addr *saddr, struct in6_addr *daddr) { struct dst_entry *ndst; struct flowi6 fl6 = { { 0 } }; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = ndev->ifindex; memcpy(&fl6.saddr, saddr, sizeof(*saddr)); memcpy(&fl6.daddr, daddr, sizeof(*daddr)); fl6.flowi6_proto = IPPROTO_UDP; ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk), recv_sockets.sk6->sk, &fl6, NULL); if (IS_ERR(ndst)) { rxe_dbg_qp(qp, "no route to %pI6\n", daddr); return NULL; } if (unlikely(ndst->error)) { rxe_dbg_qp(qp, "no route to %pI6\n", daddr); goto put; } return ndst; put: dst_release(ndst); return NULL; } #else static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, struct net_device *ndev, struct in6_addr *saddr, struct in6_addr *daddr) { return NULL; } #endif static struct dst_entry *rxe_find_route(struct net_device *ndev, struct rxe_qp *qp, struct rxe_av *av) { struct dst_entry *dst = NULL; if (qp_type(qp) == IB_QPT_RC) dst = sk_dst_get(qp->sk->sk); if (!dst || !dst_check(dst, qp->dst_cookie)) { if (dst) dst_release(dst); if (av->network_type == RXE_NETWORK_TYPE_IPV4) { struct in_addr *saddr; struct in_addr *daddr; saddr = &av->sgid_addr._sockaddr_in.sin_addr; daddr = &av->dgid_addr._sockaddr_in.sin_addr; dst = rxe_find_route4(qp, ndev, saddr, daddr); } else if (av->network_type == RXE_NETWORK_TYPE_IPV6) { struct in6_addr *saddr6; struct in6_addr *daddr6; saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; dst = rxe_find_route6(qp, ndev, saddr6, daddr6); #if IS_ENABLED(CONFIG_IPV6) if (dst) qp->dst_cookie = rt6_get_cookie((struct rt6_info *)dst); #endif } if (dst && (qp_type(qp) == IB_QPT_RC)) { dst_hold(dst); sk_dst_set(qp->sk->sk, dst); } } return dst; } static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct udphdr *udph; struct rxe_dev *rxe; struct net_device *ndev = skb->dev; struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); /* takes a reference on rxe->ib_dev * drop when skb is freed */ rxe = rxe_get_dev_from_net(ndev); if (!rxe && is_vlan_dev(ndev)) rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev)); if (!rxe) goto drop; if (skb_linearize(skb)) { ib_device_put(&rxe->ib_dev); goto drop; } udph = udp_hdr(skb); pkt->rxe = rxe; pkt->port_num = 1; pkt->hdr = (u8 *)(udph + 1); pkt->mask = RXE_GRH_MASK; pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); /* remove udp header */ skb_pull(skb, sizeof(struct udphdr)); rxe_rcv(skb); return 0; drop: kfree_skb(skb); return 0; } static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, bool ipv6) { int err; struct socket *sock; struct udp_port_cfg udp_cfg = { }; struct udp_tunnel_sock_cfg tnl_cfg = { }; if (ipv6) { udp_cfg.family = AF_INET6; udp_cfg.ipv6_v6only = 1; } else { udp_cfg.family = AF_INET; } udp_cfg.local_udp_port = port; /* Create UDP socket */ err = udp_sock_create(net, &udp_cfg, &sock); if (err < 0) return ERR_PTR(err); tnl_cfg.encap_type = 1; tnl_cfg.encap_rcv = rxe_udp_encap_recv; /* Setup UDP tunnel */ setup_udp_tunnel_sock(net, sock, &tnl_cfg); return sock; } static void rxe_release_udp_tunnel(struct socket *sk) { if (sk) udp_tunnel_sock_release(sk); } static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port, __be16 dst_port) { struct udphdr *udph; __skb_push(skb, sizeof(*udph)); skb_reset_transport_header(skb); udph = udp_hdr(skb); udph->dest = dst_port; udph->source = src_port; udph->len = htons(skb->len); udph->check = 0; } static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb, __be32 saddr, __be32 daddr, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { struct iphdr *iph; skb_scrub_packet(skb, xnet); skb_clear_hash(skb); skb_dst_set(skb, dst_clone(dst)); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = IPVERSION; iph->ihl = sizeof(struct iphdr) >> 2; iph->tot_len = htons(skb->len); iph->frag_off = df; iph->protocol = proto; iph->tos = tos; iph->daddr = daddr; iph->saddr = saddr; iph->ttl = ttl; __ip_select_ident(dev_net(dst->dev), iph, skb_shinfo(skb)->gso_segs ?: 1); } static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, struct in6_addr *saddr, struct in6_addr *daddr, __u8 proto, __u8 prio, __u8 ttl) { struct ipv6hdr *ip6h; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); skb_dst_set(skb, dst_clone(dst)); __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); ip6_flow_hdr(ip6h, prio, htonl(0)); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = proto; ip6h->hop_limit = ttl; ip6h->daddr = *daddr; ip6h->saddr = *saddr; ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); } static int prepare4(struct rxe_av *av, struct rxe_pkt_info *pkt, struct sk_buff *skb) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; bool xnet = false; __be16 df = htons(IP_DF); struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; dst = rxe_find_route(skb->dev, qp, av); if (!dst) { rxe_dbg_qp(qp, "Host not reachable\n"); return -EHOSTUNREACH; } prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), cpu_to_be16(ROCE_V2_UDP_DPORT)); prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, av->grh.traffic_class, av->grh.hop_limit, df, xnet); dst_release(dst); return 0; } static int prepare6(struct rxe_av *av, struct rxe_pkt_info *pkt, struct sk_buff *skb) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; dst = rxe_find_route(skb->dev, qp, av); if (!dst) { rxe_dbg_qp(qp, "Host not reachable\n"); return -EHOSTUNREACH; } prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), cpu_to_be16(ROCE_V2_UDP_DPORT)); prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP, av->grh.traffic_class, av->grh.hop_limit); dst_release(dst); return 0; } int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt, struct sk_buff *skb) { int err = 0; if (skb->protocol == htons(ETH_P_IP)) err = prepare4(av, pkt, skb); else if (skb->protocol == htons(ETH_P_IPV6)) err = prepare6(av, pkt, skb); if (ether_addr_equal(skb->dev->dev_addr, av->dmac)) pkt->mask |= RXE_LOOPBACK_MASK; return err; } static void rxe_skb_tx_dtor(struct sk_buff *skb) { struct net_device *ndev = skb->dev; struct rxe_dev *rxe; unsigned int qp_index; struct rxe_qp *qp; int skb_out; rxe = rxe_get_dev_from_net(ndev); if (!rxe && is_vlan_dev(ndev)) rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev)); if (WARN_ON(!rxe)) return; qp_index = (int)(uintptr_t)skb->sk->sk_user_data; if (!qp_index) return; qp = rxe_pool_get_index(&rxe->qp_pool, qp_index); if (!qp) goto put_dev; skb_out = atomic_dec_return(&qp->skb_out); if (qp->need_req_skb && skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW) rxe_sched_task(&qp->send_task); rxe_put(qp); put_dev: ib_device_put(&rxe->ib_dev); sock_put(skb->sk); } static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) { int err; struct sock *sk = pkt->qp->sk->sk; sock_hold(sk); skb->sk = sk; skb->destructor = rxe_skb_tx_dtor; atomic_inc(&pkt->qp->skb_out); if (skb->protocol == htons(ETH_P_IP)) err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); else err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); return err; } /* fix up a send packet to match the packets * received from UDP before looping them back */ static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt) { struct sock *sk = pkt->qp->sk->sk; memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); sock_hold(sk); skb->sk = sk; skb->destructor = rxe_skb_tx_dtor; atomic_inc(&pkt->qp->skb_out); if (skb->protocol == htons(ETH_P_IP)) skb_pull(skb, sizeof(struct iphdr)); else skb_pull(skb, sizeof(struct ipv6hdr)); if (WARN_ON(!ib_device_try_get(&pkt->rxe->ib_dev))) { kfree_skb(skb); return -EIO; } /* remove udp header */ skb_pull(skb, sizeof(struct udphdr)); rxe_rcv(skb); return 0; } int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, struct sk_buff *skb) { int err; int is_request = pkt->mask & RXE_REQ_MASK; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); if ((is_request && (qp_state(qp) < IB_QPS_RTS)) || (!is_request && (qp_state(qp) < IB_QPS_RTR))) { spin_unlock_irqrestore(&qp->state_lock, flags); rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n"); goto drop; } spin_unlock_irqrestore(&qp->state_lock, flags); rxe_icrc_generate(skb, pkt); if (pkt->mask & RXE_LOOPBACK_MASK) err = rxe_loopback(skb, pkt); else err = rxe_send(skb, pkt); if (err) { rxe_counter_inc(rxe, RXE_CNT_SEND_ERR); return err; } rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS); goto done; drop: kfree_skb(skb); err = 0; done: return err; } struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, int paylen, struct rxe_pkt_info *pkt) { unsigned int hdr_len; struct sk_buff *skb = NULL; struct net_device *ndev; const struct ib_gid_attr *attr; const int port_num = 1; attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index); if (IS_ERR(attr)) return NULL; if (av->network_type == RXE_NETWORK_TYPE_IPV4) hdr_len = ETH_HLEN + sizeof(struct udphdr) + sizeof(struct iphdr); else hdr_len = ETH_HLEN + sizeof(struct udphdr) + sizeof(struct ipv6hdr); rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(attr); if (IS_ERR(ndev)) { rcu_read_unlock(); goto out; } skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev), GFP_ATOMIC); if (unlikely(!skb)) { rcu_read_unlock(); goto out; } skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev)); /* FIXME: hold reference to this netdev until life of this skb. */ skb->dev = ndev; rcu_read_unlock(); if (av->network_type == RXE_NETWORK_TYPE_IPV4) skb->protocol = htons(ETH_P_IP); else skb->protocol = htons(ETH_P_IPV6); pkt->rxe = rxe; pkt->port_num = port_num; pkt->hdr = skb_put(skb, paylen); pkt->mask |= RXE_GRH_MASK; out: rdma_put_gid_attr(attr); return skb; } /* * this is required by rxe_cfg to match rxe devices in * /sys/class/infiniband up with their underlying ethernet devices */ const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) { struct net_device *ndev; char *ndev_name; ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); if (!ndev) return NULL; ndev_name = ndev->name; dev_put(ndev); return ndev_name; } int rxe_net_add(const char *ibdev_name, struct net_device *ndev) { int err; struct rxe_dev *rxe = NULL; rxe = ib_alloc_device(rxe_dev, ib_dev); if (!rxe) return -ENOMEM; ib_mark_name_assigned_by_user(&rxe->ib_dev); err = rxe_add(rxe, ndev->mtu, ibdev_name, ndev); if (err) { ib_dealloc_device(&rxe->ib_dev); return err; } return 0; } static void rxe_port_event(struct rxe_dev *rxe, enum ib_event_type event) { struct ib_event ev; ev.device = &rxe->ib_dev; ev.element.port_num = 1; ev.event = event; ib_dispatch_event(&ev); } /* Caller must hold net_info_lock */ void rxe_port_up(struct rxe_dev *rxe) { rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); dev_info(&rxe->ib_dev.dev, "set active\n"); } /* Caller must hold net_info_lock */ void rxe_port_down(struct rxe_dev *rxe) { rxe_port_event(rxe, IB_EVENT_PORT_ERR); rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); dev_info(&rxe->ib_dev.dev, "set down\n"); } void rxe_set_port_state(struct rxe_dev *rxe) { struct net_device *ndev; ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); if (!ndev) return; if (ib_get_curr_port_state(ndev) == IB_PORT_ACTIVE) rxe_port_up(rxe); else rxe_port_down(rxe); dev_put(ndev); } static int rxe_notify(struct notifier_block *not_blk, unsigned long event, void *arg) { struct net_device *ndev = netdev_notifier_info_to_dev(arg); struct rxe_dev *rxe = rxe_get_dev_from_net(ndev); if (!rxe) return NOTIFY_OK; switch (event) { case NETDEV_UNREGISTER: ib_unregister_device_queued(&rxe->ib_dev); break; case NETDEV_CHANGEMTU: rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); rxe_set_mtu(rxe, ndev->mtu); break; case NETDEV_DOWN: case NETDEV_CHANGE: if (ib_get_curr_port_state(ndev) == IB_PORT_DOWN) rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); break; case NETDEV_REBOOT: case NETDEV_GOING_DOWN: case NETDEV_CHANGEADDR: case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: default: rxe_dbg_dev(rxe, "ignoring netdev event = %ld for %s\n", event, ndev->name); break; } ib_device_put(&rxe->ib_dev); return NOTIFY_OK; } static struct notifier_block rxe_net_notifier = { .notifier_call = rxe_notify, }; static int rxe_net_ipv4_init(void) { recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), false); if (IS_ERR(recv_sockets.sk4)) { recv_sockets.sk4 = NULL; pr_err("Failed to create IPv4 UDP tunnel\n"); return -1; } return 0; } static int rxe_net_ipv6_init(void) { #if IS_ENABLED(CONFIG_IPV6) recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), true); if (PTR_ERR(recv_sockets.sk6) == -EAFNOSUPPORT) { recv_sockets.sk6 = NULL; pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n"); return 0; } if (IS_ERR(recv_sockets.sk6)) { recv_sockets.sk6 = NULL; pr_err("Failed to create IPv6 UDP tunnel\n"); return -1; } #endif return 0; } void rxe_net_exit(void) { rxe_release_udp_tunnel(recv_sockets.sk6); rxe_release_udp_tunnel(recv_sockets.sk4); unregister_netdevice_notifier(&rxe_net_notifier); } int rxe_net_init(void) { int err; recv_sockets.sk6 = NULL; err = rxe_net_ipv4_init(); if (err) return err; err = rxe_net_ipv6_init(); if (err) goto err_out; err = register_netdevice_notifier(&rxe_net_notifier); if (err) { pr_err("Failed to register netdev notifier\n"); goto err_out; } return 0; err_out: rxe_net_exit(); return err; }
6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" #define DEVLINK_PORT_FN_CAPS_VALID_MASK \ (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1) static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, [DEVLINK_PORT_FN_ATTR_STATE] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, DEVLINK_PORT_FN_STATE_ACTIVE), [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), [DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] = { .type = NLA_U32 }, }; #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ WARN_ON_ONCE(!(devlink_port)->registered) #define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ WARN_ON_ONCE((devlink_port)->registered) struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, unsigned int port_index) { return xa_load(&devlink->ports, port_index); } struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { if (attrs[DEVLINK_ATTR_PORT_INDEX]) { u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) return ERR_PTR(-ENODEV); return devlink_port; } return ERR_PTR(-EINVAL); } struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_port_get_from_attrs(devlink, info->attrs); } static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, u32 cap, bool is_enable) { caps->selector |= cap; if (is_enable) caps->value |= cap; } static int devlink_port_fn_roce_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_roce_get) return 0; err = devlink_port->ops->port_fn_roce_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable); return 0; } static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_migratable_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_migratable_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable); return 0; } static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_ipsec_crypto_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_ipsec_crypto_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, is_enable); return 0; } static int devlink_port_fn_ipsec_packet_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_ipsec_packet_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_ipsec_packet_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_PACKET, is_enable); return 0; } static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { struct nla_bitfield32 caps = {}; int err; err = devlink_port_fn_roce_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_migratable_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_ipsec_crypto_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_ipsec_packet_fill(devlink_port, &caps, extack); if (err) return err; if (!caps.selector) return 0; err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, caps.selector); if (err) return err; *msg_updated = true; return 0; } static int devlink_port_fn_max_io_eqs_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { u32 max_io_eqs; int err; if (!port->ops->port_fn_max_io_eqs_get) return 0; err = port->ops->port_fn_max_io_eqs_get(port, &max_io_eqs, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } err = nla_put_u32(msg, DEVLINK_PORT_FN_ATTR_MAX_IO_EQS, max_io_eqs); if (err) return err; *msg_updated = true; return 0; } int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) { if (devlink_nl_put_handle(msg, devlink_port->devlink)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) return -EMSGSIZE; return 0; } size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ } static int devlink_nl_port_attrs_put(struct sk_buff *msg, struct devlink_port *devlink_port) { struct devlink_port_attrs *attrs = &devlink_port->attrs; if (!devlink_port->attrs_set) return 0; if (attrs->lanes) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) return -EMSGSIZE; } if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) return -EMSGSIZE; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { case DEVLINK_PORT_FLAVOUR_PCI_PF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_pf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_vf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_SF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_sf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_sf.pf) || nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, attrs->pci_sf.sf)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->phys.port_number)) return -EMSGSIZE; if (!attrs->split) return 0; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->phys.port_number)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, attrs->phys.split_subport_number)) return -EMSGSIZE; break; default: break; } return 0; } static int devlink_port_fn_hw_addr_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { u8 hw_addr[MAX_ADDR_LEN]; int hw_addr_len; int err; if (!port->ops->port_fn_hw_addr_get) return 0; err = port->ops->port_fn_hw_addr_get(port, hw_addr, &hw_addr_len, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); if (err) return err; *msg_updated = true; return 0; } static bool devlink_port_fn_state_valid(enum devlink_port_fn_state state) { return state == DEVLINK_PORT_FN_STATE_INACTIVE || state == DEVLINK_PORT_FN_STATE_ACTIVE; } static bool devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) { return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED || opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; } static int devlink_port_fn_state_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { enum devlink_port_fn_opstate opstate; enum devlink_port_fn_state state; int err; if (!port->ops->port_fn_state_get) return 0; err = port->ops->port_fn_state_get(port, &state, &opstate, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (!devlink_port_fn_state_valid(state)) { WARN_ON_ONCE(1); NL_SET_ERR_MSG(extack, "Invalid state read from driver"); return -EINVAL; } if (!devlink_port_fn_opstate_valid(opstate)) { WARN_ON_ONCE(1); NL_SET_ERR_MSG(extack, "Invalid operational state read from driver"); return -EINVAL; } if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate)) return -EMSGSIZE; *msg_updated = true; return 0; } static int devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_migratable_set(devlink_port, enable, extack); } static int devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_roce_set(devlink_port, enable, extack); } static int devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack); } static int devlink_port_fn_ipsec_packet_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_ipsec_packet_set(devlink_port, enable, extack); } static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nla_bitfield32 caps; u32 caps_value; int err; caps = nla_get_bitfield32(attr); caps_value = caps.value & caps.selector; if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) { err = devlink_port_fn_roce_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_ROCE, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { err = devlink_port_fn_mig_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_MIGRATABLE, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { err = devlink_port_fn_ipsec_crypto_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { err = devlink_port_fn_ipsec_packet_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_IPSEC_PACKET, extack); if (err) return err; } return 0; } static int devlink_port_fn_max_io_eqs_set(struct devlink_port *devlink_port, const struct nlattr *attr, struct netlink_ext_ack *extack) { u32 max_io_eqs; max_io_eqs = nla_get_u32(attr); return devlink_port->ops->port_fn_max_io_eqs_set(devlink_port, max_io_eqs, extack); } static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { struct nlattr *function_attr; bool msg_updated = false; int err; function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); if (!function_attr) return -EMSGSIZE; err = devlink_port_fn_hw_addr_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_caps_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_max_io_eqs_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_rel_devlink_handle_put(msg, port->devlink, port->rel_index, DEVLINK_PORT_FN_ATTR_DEVLINK, &msg_updated); out: if (err || !msg_updated) nla_nest_cancel(msg, function_attr); else nla_nest_end(msg, function_attr); return err; } static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_port->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, devlink_port->desired_type)) goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { if (devlink_port->type_eth.netdev && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, devlink_port->type_eth.ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, devlink_port->type_eth.ifname))) goto nla_put_failure_type_locked; } if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { struct ib_device *ibdev = devlink_port->type_ib.ibdev; if (ibdev && nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, ibdev->name)) goto nla_put_failure_type_locked; } spin_unlock_bh(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) goto nla_put_failure; if (devlink_port->linecard && nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, devlink_linecard_index(devlink_port->linecard))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure_type_locked: spin_unlock_bh(&devlink_port->type_lock); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_port_notify(struct devlink_port *devlink_port, enum devlink_command cmd) { struct devlink *devlink = devlink_port->devlink; struct devlink_obj_desc desc; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } devlink_nl_obj_desc_init(&desc, devlink); devlink_nl_obj_desc_port_set(&desc, devlink_port); devlink_nl_notify_send_desc(devlink, msg, &desc); } static void devlink_ports_notify(struct devlink *devlink, enum devlink_command cmd) { struct devlink_port *devlink_port; unsigned long port_index; xa_for_each(&devlink->ports, port_index, devlink_port) devlink_port_notify(devlink_port, cmd); } void devlink_ports_notify_register(struct devlink *devlink) { devlink_ports_notify(devlink, DEVLINK_CMD_PORT_NEW); } void devlink_ports_notify_unregister(struct devlink *devlink) { devlink_ports_notify(devlink, DEVLINK_CMD_PORT_DEL); } int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_port *devlink_port; unsigned long port_index; int err = 0; xa_for_each_start(&devlink->ports, port_index, devlink_port, state->idx) { err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, cb->extack); if (err) { state->idx = port_index; break; } } return err; } int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one); } static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) { int err; if (!devlink_port->ops->port_type_set) return -EOPNOTSUPP; if (port_type == devlink_port->type) return 0; err = devlink_port->ops->port_type_set(devlink_port, port_type); if (err) return err; devlink_port->desired_type = port_type; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } static int devlink_port_function_hw_addr_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { const u8 *hw_addr; int hw_addr_len; hw_addr = nla_data(attr); hw_addr_len = nla_len(attr); if (hw_addr_len > MAX_ADDR_LEN) { NL_SET_ERR_MSG(extack, "Port function hardware address too long"); return -EINVAL; } if (port->type == DEVLINK_PORT_TYPE_ETH) { if (hw_addr_len != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Address must be 6 bytes for Ethernet device"); return -EINVAL; } if (!is_unicast_ether_addr(hw_addr)) { NL_SET_ERR_MSG(extack, "Non-unicast hardware address unsupported"); return -EINVAL; } } return port->ops->port_fn_hw_addr_set(port, hw_addr, hw_addr_len, extack); } static int devlink_port_fn_state_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { enum devlink_port_fn_state state; state = nla_get_u8(attr); return port->ops->port_fn_state_set(port, state, extack); } static int devlink_port_function_validate(struct devlink_port *devlink_port, struct nlattr **tb, struct netlink_ext_ack *extack) { const struct devlink_port_ops *ops = devlink_port->ops; struct nlattr *attr; if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && !ops->port_fn_hw_addr_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], "Port doesn't support function attributes"); return -EOPNOTSUPP; } if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_STATE], "Function does not support state setting"); return -EOPNOTSUPP; } attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; if (attr) { struct nla_bitfield32 caps; caps = nla_get_bitfield32(attr); if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE && !ops->port_fn_roce_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support RoCE function attribute"); return -EOPNOTSUPP; } if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { if (!ops->port_fn_migratable_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support migratable function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "migratable function attribute supported for VFs only"); return -EOPNOTSUPP; } } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { if (!ops->port_fn_ipsec_crypto_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support ipsec_crypto function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "ipsec_crypto function attribute supported for VFs only"); return -EOPNOTSUPP; } } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { if (!ops->port_fn_ipsec_packet_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support ipsec_packet function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "ipsec_packet function attribute supported for VFs only"); return -EOPNOTSUPP; } } } if (tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] && !ops->port_fn_max_io_eqs_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS], "Function does not support max_io_eqs setting"); return -EOPNOTSUPP; } return 0; } static int devlink_port_function_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; int err; err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr, devlink_function_nl_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Fail to parse port function attributes"); return err; } err = devlink_port_function_validate(port, tb, extack); if (err) return err; attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; if (attr) { err = devlink_port_function_hw_addr_set(port, attr, extack); if (err) return err; } attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; if (attr) { err = devlink_port_fn_caps_set(port, attr, extack); if (err) return err; } attr = tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS]; if (attr) { err = devlink_port_fn_max_io_eqs_set(port, attr, extack); if (err) return err; } /* Keep this as the last function attribute set, so that when * multiple port function attributes are set along with state, * Those can be applied first before activating the state. */ attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; if (attr) err = devlink_port_fn_state_set(port, attr, extack); if (!err) devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); return err; } int devlink_nl_port_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; int err; if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { enum devlink_port_type port_type; port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); err = devlink_port_type_set(devlink_port, port_type); if (err) return err; } if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) { struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; struct netlink_ext_ack *extack = info->extack; err = devlink_port_function_set(devlink_port, attr, extack); if (err) return err; } return 0; } int devlink_nl_port_split_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; u32 count; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT)) return -EINVAL; if (!devlink_port->ops->port_split) return -EOPNOTSUPP; count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); if (!devlink_port->attrs.splittable) { /* Split ports cannot be split. */ if (devlink_port->attrs.split) NL_SET_ERR_MSG(info->extack, "Port cannot be split further"); else NL_SET_ERR_MSG(info->extack, "Port cannot be split"); return -EINVAL; } if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) { NL_SET_ERR_MSG(info->extack, "Invalid split count"); return -EINVAL; } return devlink_port->ops->port_split(devlink, devlink_port, count, info->extack); } int devlink_nl_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; if (!devlink_port->ops->port_unsplit) return -EOPNOTSUPP; return devlink_port->ops->port_unsplit(devlink, devlink_port, info->extack); } int devlink_nl_port_new_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink_port_new_attrs new_attrs = {}; struct devlink *devlink = info->user_ptr[0]; struct devlink_port *devlink_port; struct sk_buff *msg; int err; if (!devlink->ops->port_new) return -EOPNOTSUPP; if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { NL_SET_ERR_MSG(extack, "Port flavour or PCI PF are not specified"); return -EINVAL; } new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); new_attrs.pfnum = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]); if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { /* Port index of the new port being created by driver. */ new_attrs.port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); new_attrs.port_index_valid = true; } if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) { new_attrs.controller = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]); new_attrs.controller_valid = true; } if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF && info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); new_attrs.sfnum_valid = true; } err = devlink->ops->port_new(devlink, &new_attrs, extack, &devlink_port); if (err) return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto err_out_port_del; } err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, NULL); if (WARN_ON_ONCE(err)) goto err_out_msg_free; err = genlmsg_reply(msg, info); if (err) goto err_out_port_del; return 0; err_out_msg_free: nlmsg_free(msg); err_out_port_del: devlink_port->ops->port_del(devlink, devlink_port, NULL); return err; } int devlink_nl_port_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; if (!devlink_port->ops->port_del) return -EOPNOTSUPP; return devlink_port->ops->port_del(devlink, devlink_port, extack); } static void devlink_port_type_warn(struct work_struct *work) { struct devlink_port *port = container_of(to_delayed_work(work), struct devlink_port, type_warn_dw); dev_warn(port->devlink->dev, "Type was not set for devlink port."); } static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) { /* Ignore CPU and DSA flavours. */ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED; } #define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) { if (!devlink_port_type_should_warn(devlink_port)) return; /* Schedule a work to WARN in case driver does not set port * type within timeout. */ schedule_delayed_work(&devlink_port->type_warn_dw, DEVLINK_PORT_TYPE_WARN_TIMEOUT); } static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) { if (!devlink_port_type_should_warn(devlink_port)) return; cancel_delayed_work_sync(&devlink_port->type_warn_dw); } /** * devlink_port_init() - Init devlink port * * @devlink: devlink * @devlink_port: devlink port * * Initialize essential stuff that is needed for functions * that may be called before devlink port registration. * Call to this function is optional and not needed * in case the driver does not use such functions. */ void devlink_port_init(struct devlink *devlink, struct devlink_port *devlink_port) { if (devlink_port->initialized) return; devlink_port->devlink = devlink; INIT_LIST_HEAD(&devlink_port->region_list); devlink_port->initialized = true; } EXPORT_SYMBOL_GPL(devlink_port_init); /** * devlink_port_fini() - Deinitialize devlink port * * @devlink_port: devlink port * * Deinitialize essential stuff that is in use for functions * that may be called after devlink port unregistration. * Call to this function is optional and not needed * in case the driver does not use such functions. */ void devlink_port_fini(struct devlink_port *devlink_port) { WARN_ON(!list_empty(&devlink_port->region_list)); } EXPORT_SYMBOL_GPL(devlink_port_fini); static const struct devlink_port_ops devlink_port_dummy_ops = {}; /** * devl_port_register_with_ops() - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. */ int devl_port_register_with_ops(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index, const struct devlink_port_ops *ops) { int err; devl_assert_locked(devlink); ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port_init(devlink, devlink_port); devlink_port->registered = true; devlink_port->index = port_index; devlink_port->ops = ops ? ops : &devlink_port_dummy_ops; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); if (err) { devlink_port->registered = false; return err; } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } EXPORT_SYMBOL_GPL(devl_port_register_with_ops); /** * devlink_port_register_with_ops - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. * * Context: Takes and release devlink->lock <mutex>. */ int devlink_port_register_with_ops(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index, const struct devlink_port_ops *ops) { int err; devl_lock(devlink); err = devl_port_register_with_ops(devlink, devlink_port, port_index, ops); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_port_register_with_ops); /** * devl_port_unregister() - Unregister devlink port * * @devlink_port: devlink port */ void devl_port_unregister(struct devlink_port *devlink_port) { lockdep_assert_held(&devlink_port->devlink->lock); WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET); devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); xa_erase(&devlink_port->devlink->ports, devlink_port->index); WARN_ON(!list_empty(&devlink_port->reporter_list)); devlink_port->registered = false; } EXPORT_SYMBOL_GPL(devl_port_unregister); /** * devlink_port_unregister - Unregister devlink port * * @devlink_port: devlink port * * Context: Takes and release devlink->lock <mutex>. */ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; devl_lock(devlink); devl_port_unregister(devlink_port); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_port_unregister); static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, struct net_device *netdev) { const struct net_device_ops *ops = netdev->netdev_ops; /* If driver registers devlink port, it should set devlink port * attributes accordingly so the compat functions are called * and the original ops are not used. */ if (ops->ndo_get_phys_port_name) { /* Some drivers use the same set of ndos for netdevs * that have devlink_port registered and also for * those who don't. Make sure that ndo_get_phys_port_name * returns -EOPNOTSUPP here in case it is defined. * Warn if not. */ char name[IFNAMSIZ]; int err; err = ops->ndo_get_phys_port_name(netdev, name, sizeof(name)); WARN_ON(err != -EOPNOTSUPP); } if (ops->ndo_get_port_parent_id) { /* Some drivers use the same set of ndos for netdevs * that have devlink_port registered and also for * those who don't. Make sure that ndo_get_port_parent_id * returns -EOPNOTSUPP here in case it is defined. * Warn if not. */ struct netdev_phys_item_id ppid; int err; err = ops->ndo_get_port_parent_id(netdev, &ppid); WARN_ON(err != -EOPNOTSUPP); } } static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { struct net_device *netdev = type_dev; ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); if (type == DEVLINK_PORT_TYPE_NOTSET) { devlink_port_type_warn_schedule(devlink_port); } else { devlink_port_type_warn_cancel(devlink_port); if (type == DEVLINK_PORT_TYPE_ETH && netdev) devlink_port_type_netdev_checks(devlink_port, netdev); } spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; switch (type) { case DEVLINK_PORT_TYPE_ETH: devlink_port->type_eth.netdev = netdev; if (netdev) { ASSERT_RTNL(); devlink_port->type_eth.ifindex = netdev->ifindex; BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) != sizeof(netdev->name)); strcpy(devlink_port->type_eth.ifname, netdev->name); } break; case DEVLINK_PORT_TYPE_IB: devlink_port->type_ib.ibdev = type_dev; break; default: break; } spin_unlock_bh(&devlink_port->type_lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } /** * devlink_port_type_eth_set - Set port type to Ethernet * * @devlink_port: devlink port * * If driver is calling this, most likely it is doing something wrong. */ void devlink_port_type_eth_set(struct devlink_port *devlink_port) { dev_warn(devlink_port->devlink->dev, "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); /** * devlink_port_type_ib_set - Set port type to InfiniBand * * @devlink_port: devlink port * @ibdev: related IB device */ void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev) { __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev); } EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); /** * devlink_port_type_clear - Clear port type * * @devlink_port: devlink port * * If driver is calling this for clearing Ethernet type, most likely * it is doing something wrong. */ void devlink_port_type_clear(struct devlink_port *devlink_port) { if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) dev_warn(devlink_port->devlink->dev, "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); int devlink_port_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct devlink_port *devlink_port = netdev->devlink_port; struct devlink *devlink; if (!devlink_port) return NOTIFY_OK; devlink = devlink_port->devlink; switch (event) { case NETDEV_POST_INIT: /* Set the type but not netdev pointer. It is going to be set * later on by NETDEV_REGISTER event. Happens once during * netdevice register */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); break; case NETDEV_REGISTER: case NETDEV_CHANGENAME: if (devlink_net(devlink) != dev_net(netdev)) return NOTIFY_OK; /* Set the netdev on top of previously set type. Note this * event happens also during net namespace change so here * we take into account netdev pointer appearing in this * namespace. */ __devlink_port_type_set(devlink_port, devlink_port->type, netdev); break; case NETDEV_UNREGISTER: if (devlink_net(devlink) != dev_net(netdev)) return NOTIFY_OK; /* Clear netdev pointer, but not the type. This event happens * also during net namespace change so we need to clear * pointer to netdev that is going to another net namespace. */ __devlink_port_type_set(devlink_port, devlink_port->type, NULL); break; case NETDEV_PRE_UNINIT: /* Clear the type and the netdev pointer. Happens one during * netdevice unregister. */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); break; } return NOTIFY_OK; } static int __devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour) { struct devlink_port_attrs *attrs = &devlink_port->attrs; devlink_port->attrs_set = true; attrs->flavour = flavour; if (attrs->switch_id.id_len) { devlink_port->switch_port = true; if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN)) attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN; } else { devlink_port->switch_port = false; } return 0; } /** * devlink_port_attrs_set - Set port attributes * * @devlink_port: devlink port * @attrs: devlink port attrs */ void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *attrs) { int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) return; WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); /** * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PCI function number for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) return; attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; attrs->pci_pf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); /** * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PCI function number for the devlink port instance * @vf: associated PCI VF number of a PF for the devlink port instance; * VF number starts from 0 for the first PCI virtual function * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) return; attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; attrs->pci_vf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); /** * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PCI function number for the devlink port instance * @sf: associated SF number of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); if (ret) return; attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; attrs->pci_sf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); static void devlink_port_rel_notify_cb(struct devlink *devlink, u32 port_index) { struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) return; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } static void devlink_port_rel_cleanup_cb(struct devlink *devlink, u32 port_index, u32 rel_index) { struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (devlink_port && devlink_port->rel_index == rel_index) devlink_port->rel_index = 0; } /** * devl_port_fn_devlink_set - Attach peer devlink * instance to port function. * @devlink_port: devlink port * @fn_devlink: devlink instance to attach */ int devl_port_fn_devlink_set(struct devlink_port *devlink_port, struct devlink *fn_devlink) { ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); if (WARN_ON(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_SF || devlink_port->attrs.pci_sf.external)) return -EINVAL; return devlink_rel_nested_in_add(&devlink_port->rel_index, devlink_port->devlink->index, devlink_port->index, devlink_port_rel_notify_cb, devlink_port_rel_cleanup_cb, fn_devlink); } EXPORT_SYMBOL_GPL(devl_port_fn_devlink_set); /** * devlink_port_linecard_set - Link port with a linecard * * @devlink_port: devlink port * @linecard: devlink linecard */ void devlink_port_linecard_set(struct devlink_port *devlink_port, struct devlink_linecard *linecard) { ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->linecard = linecard; } EXPORT_SYMBOL_GPL(devlink_port_linecard_set); static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int n = 0; if (!devlink_port->attrs_set) return -EOPNOTSUPP; switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: if (devlink_port->linecard) n = snprintf(name, len, "l%u", devlink_linecard_index(devlink_port->linecard)); if (n < len) n += snprintf(name + n, len - n, "p%u", attrs->phys.port_number); if (n < len && attrs->split) n += snprintf(name + n, len - n, "s%u", attrs->phys.split_subport_number); break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: case DEVLINK_PORT_FLAVOUR_UNUSED: /* As CPU and DSA ports do not have a netdevice associated * case should not ever happen. */ WARN_ON(1); return -EINVAL; case DEVLINK_PORT_FLAVOUR_PCI_PF: if (attrs->pci_pf.external) { n = snprintf(name, len, "c%u", attrs->pci_pf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (attrs->pci_vf.external) { n = snprintf(name, len, "c%u", attrs->pci_vf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%uvf%u", attrs->pci_vf.pf, attrs->pci_vf.vf); break; case DEVLINK_PORT_FLAVOUR_PCI_SF: if (attrs->pci_sf.external) { n = snprintf(name, len, "c%u", attrs->pci_sf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; case DEVLINK_PORT_FLAVOUR_VIRTUAL: return -EOPNOTSUPP; } if (n >= len) return -EINVAL; return 0; } int devlink_compat_phys_port_name_get(struct net_device *dev, char *name, size_t len) { struct devlink_port *devl