18 13 5 2 2 2 5 5 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 1 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 18 1215 428 1201 336 1199 1214 1213 335 221 2131 912 1478 362 19 168 427 3 1215 724 285 251 484 482 225 627 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 | /* * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/in.h> #include <linux/in6.h> /* For in6_dev_get/in6_dev_put */ #include <net/addrconf.h> #include <net/bonding.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> static struct workqueue_struct *gid_cache_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct update_gid_event_work { struct work_struct work; union ib_gid gid; struct ib_gid_attr gid_attr; enum gid_op_type gid_op; }; #define ROCE_NETDEV_CALLBACK_SZ 3 struct netdev_event_work_cmd { roce_netdev_callback cb; roce_netdev_filter filter; struct net_device *ndev; struct net_device *filter_ndev; }; struct netdev_event_work { struct work_struct work; struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; static const struct { bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); for (i = 0; i < IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr->gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, gid_attr); break; } } } } enum bonding_slave_state { BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, /* No primary slave or the device isn't a slave in bonding */ BONDING_SLAVE_STATE_NA = 1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When rdma netdevice is used in bonding, bonding master netdevice * should be considered for default GIDs. Therefore, ignore slave rdma * netdevices when bonding is considered. * Additionally when event(cookie) netdevice is bond master device, * make sure that it the upper netdevice of rdma netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev * is bond master device and rdma_ndev is its lower netdevice. It might * not have been established as slave device yet. */ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs does not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); /** * rdma_roce_rescan_port - Rescan all of the network devices in the system * and add their gids if relevant to the port of the RoCE device. * * @ib_dev: IB device * @port: Port number */ void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) { struct net_device *ndev = NULL; if (rdma_protocol_roce(ib_dev, port)) { ndev = ib_device_get_netdev(ib_dev, port); if (!ndev) return; enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); dev_put(ndev); } } EXPORT_SYMBOL(rdma_roce_rescan_port); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } EXPORT_SYMBOL(roce_del_all_netdev_gids); static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. */ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. */ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, &work->work); return NOTIFY_DONE; } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in in; struct net_device *ndev; struct in_ifaddr *ifa = ptr; in.sin_family = AF_INET; in.sin_addr.s_addr = ifa->ifa_address; ndev = ifa->ifa_dev->dev; return addr_event(this, event, (struct sockaddr *)&in, ndev); } static int inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in6 in6; struct net_device *ndev; struct inet6_ifaddr *ifa6 = ptr; in6.sin6_family = AF_INET6; in6.sin6_addr = ifa6->addr; ndev = ifa6->idev->dev; return addr_event(this, event, (struct sockaddr *)&in6, ndev); } static struct notifier_block nb_netdevice = { .notifier_call = netdevice_event }; static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static struct notifier_block nb_inet6addr = { .notifier_call = inet6addr_event }; int __init roce_gid_mgmt_init(void) { gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); if (!gid_cache_wq) return -ENOMEM; register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); /* We relay on the netdevice notifier to enumerate all * existing devices in the system. Register to this notifier * last to make sure we will not miss any IP add/del * callbacks. */ register_netdevice_notifier(&nb_netdevice); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (IS_ENABLED(CONFIG_IPV6)) unregister_inet6addr_notifier(&nb_inet6addr); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_netdevice); /* Ensure all gid deletion tasks complete before we go down, * to avoid any reference to free'd memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ destroy_workqueue(gid_cache_wq); } |
1 1 1 1 3 2 4 4 4 4 4 2 2 2 2 3 3 1 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2022 Bobby Eshleman <bobby.eshleman@bytedance.com> * * Based off of net/unix/unix_bpf.c */ #include <linux/bpf.h> #include <linux/module.h> #include <linux/skmsg.h> #include <linux/socket.h> #include <linux/wait.h> #include <net/af_vsock.h> #include <net/sock.h> #define vsock_sk_has_data(__sk, __psock) \ ({ !skb_queue_empty(&(__sk)->sk_receive_queue) || \ !skb_queue_empty(&(__psock)->ingress_skb) || \ !list_empty(&(__psock)->ingress_msg); \ }) static struct proto *vsock_prot_saved __read_mostly; static DEFINE_SPINLOCK(vsock_prot_lock); static struct proto vsock_bpf_prot; static bool vsock_has_data(struct sock *sk, struct sk_psock *psock) { struct vsock_sock *vsk = vsock_sk(sk); s64 ret; ret = vsock_connectible_has_data(vsk); if (ret > 0) return true; return vsock_sk_has_data(sk, psock); } static bool vsock_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { bool ret; DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk->sk_shutdown & RCV_SHUTDOWN) return true; if (!timeo) return false; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = vsock_has_data(sk, psock); if (!ret) { wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); ret = vsock_has_data(sk, psock); } sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static int __vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct socket *sock = sk->sk_socket; int err; if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) err = __vsock_connectible_recvmsg(sock, msg, len, flags); else if (sk->sk_type == SOCK_DGRAM) err = __vsock_dgram_recvmsg(sock, msg, len, flags); else err = -EPROTOTYPE; return err; } static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_psock *psock; struct vsock_sock *vsk; int copied; psock = sk_psock_get(sk); if (unlikely(!psock)) return __vsock_recvmsg(sk, msg, len, flags); lock_sock(sk); vsk = vsock_sk(sk); if (WARN_ON_ONCE(!vsk->transport)) { copied = -ENODEV; goto out; } if (vsock_has_data(sk, psock) && sk_psock_queue_empty(psock)) { release_sock(sk); sk_psock_put(sk, psock); return __vsock_recvmsg(sk, msg, len, flags); } copied = sk_msg_recvmsg(sk, psock, msg, len, flags); while (copied == 0) { long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); if (!vsock_msg_wait_data(sk, psock, timeo)) { copied = -EAGAIN; break; } if (sk_psock_queue_empty(psock)) { release_sock(sk); sk_psock_put(sk, psock); return __vsock_recvmsg(sk, msg, len, flags); } copied = sk_msg_recvmsg(sk, psock, msg, len, flags); } out: release_sock(sk); sk_psock_put(sk, psock); return copied; } static void vsock_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = vsock_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; } static void vsock_bpf_check_needs_rebuild(struct proto *ops) { /* Paired with the smp_store_release() below. */ if (unlikely(ops != smp_load_acquire(&vsock_prot_saved))) { spin_lock_bh(&vsock_prot_lock); if (likely(ops != vsock_prot_saved)) { vsock_bpf_rebuild_protos(&vsock_bpf_prot, ops); /* Make sure proto function pointers are updated before publishing the * pointer to the struct. */ smp_store_release(&vsock_prot_saved, ops); } spin_unlock_bh(&vsock_prot_lock); } } int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { struct vsock_sock *vsk; if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } vsk = vsock_sk(sk); if (!vsk->transport) return -ENODEV; if (!vsk->transport->read_skb) return -EOPNOTSUPP; vsock_bpf_check_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &vsock_bpf_prot); return 0; } void __init vsock_bpf_build_proto(void) { vsock_bpf_rebuild_protos(&vsock_bpf_prot, &vsock_proto); } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_JIFFIES_H #define _LINUX_JIFFIES_H #include <linux/cache.h> #include <linux/limits.h> #include <linux/math64.h> #include <linux/minmax.h> #include <linux/types.h> #include <linux/time.h> #include <linux/timex.h> #include <vdso/jiffies.h> #include <asm/param.h> /* for HZ */ #include <generated/timeconst.h> /* * The following defines establish the engineering parameters of the PLL * model. The HZ variable establishes the timer interrupt frequency, 100 Hz * for the SunOS kernel, 256 Hz for the Ultrix kernel and 1024 Hz for the * OSF/1 kernel. The SHIFT_HZ define expresses the same value as the * nearest power of two in order to avoid hardware multiply operations. */ #if HZ >= 12 && HZ < 24 # define SHIFT_HZ 4 #elif HZ >= 24 && HZ < 48 # define SHIFT_HZ 5 #elif HZ >= 48 && HZ < 96 # define SHIFT_HZ 6 #elif HZ >= 96 && HZ < 192 # define SHIFT_HZ 7 #elif HZ >= 192 && HZ < 384 # define SHIFT_HZ 8 #elif HZ >= 384 && HZ < 768 # define SHIFT_HZ 9 #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #elif HZ >= 1536 && HZ < 3072 # define SHIFT_HZ 11 #elif HZ >= 3072 && HZ < 6144 # define SHIFT_HZ 12 #elif HZ >= 6144 && HZ < 12288 # define SHIFT_HZ 13 #else # error Invalid value of HZ. #endif /* Suppose we want to divide two numbers NOM and DEN: NOM/DEN, then we can * improve accuracy by shifting LSH bits, hence calculating: * (NOM << LSH) / DEN * This however means trouble for large NOM, because (NOM << LSH) may no * longer fit in 32 bits. The following way of calculating this gives us * some slack, under the following conditions: * - (NOM / DEN) fits in (32 - LSH) bits. * - (NOM % DEN) fits in (32 - LSH) bits. */ #define SH_DIV(NOM,DEN,LSH) ( (((NOM) / (DEN)) << (LSH)) \ + ((((NOM) % (DEN)) << (LSH)) + (DEN) / 2) / (DEN)) /* LATCH is used in the interval timer and ftape setup. */ #define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ extern int register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */ #define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) #ifndef __jiffy_arch_data #define __jiffy_arch_data #endif /* * The 64-bit value is not atomic on 32-bit systems - you MUST NOT read it * without sampling the sequence number in jiffies_lock. * get_jiffies_64() will do this for you as appropriate. * * jiffies and jiffies_64 are at the same address for little-endian systems * and for 64-bit big-endian systems. * On 32-bit big-endian systems, jiffies is the lower 32 bits of jiffies_64 * (i.e., at address @jiffies_64 + 4). * See arch/ARCH/kernel/vmlinux.lds.S */ extern u64 __cacheline_aligned_in_smp jiffies_64; extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); #else /** * get_jiffies_64 - read the 64-bit non-atomic jiffies_64 value * * When BITS_PER_LONG < 64, this uses sequence number sampling using * jiffies_lock to protect the 64-bit read. * * Return: current 64-bit jiffies value */ static inline u64 get_jiffies_64(void) { return (u64)jiffies; } #endif /** * DOC: General information about time_* inlines * * These inlines deal with timer wrapping correctly. You are strongly encouraged * to use them: * * #. Because people otherwise forget * #. Because if the timer wrap changes in future you won't have to alter your * driver code. */ /** * time_after - returns true if the time a is after time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Do this with "<0" and ">=0" to only test the sign of the result. A * good compiler would generate better code (and a really good compiler * wouldn't care). Gcc is currently neither. * * Return: %true is time a is after time b, otherwise %false. */ #define time_after(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((b) - (a)) < 0)) /** * time_before - returns true if the time a is before time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Return: %true is time a is before time b, otherwise %false. */ #define time_before(a,b) time_after(b,a) /** * time_after_eq - returns true if the time a is after or the same as time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Return: %true is time a is after or the same as time b, otherwise %false. */ #define time_after_eq(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((a) - (b)) >= 0)) /** * time_before_eq - returns true if the time a is before or the same as time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Return: %true is time a is before or the same as time b, otherwise %false. */ #define time_before_eq(a,b) time_after_eq(b,a) /** * time_in_range - Calculate whether a is in the range of [b, c]. * @a: time to test * @b: beginning of the range * @c: end of the range * * Return: %true is time a is in the range [b, c], otherwise %false. */ #define time_in_range(a,b,c) \ (time_after_eq(a,b) && \ time_before_eq(a,c)) /** * time_in_range_open - Calculate whether a is in the range of [b, c). * @a: time to test * @b: beginning of the range * @c: end of the range * * Return: %true is time a is in the range [b, c), otherwise %false. */ #define time_in_range_open(a,b,c) \ (time_after_eq(a,b) && \ time_before(a,c)) /* Same as above, but does so with platform independent 64bit types. * These must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). */ /** * time_after64 - returns true if the time a is after time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is after time b, otherwise %false. */ #define time_after64(a,b) \ (typecheck(__u64, a) && \ typecheck(__u64, b) && \ ((__s64)((b) - (a)) < 0)) /** * time_before64 - returns true if the time a is before time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is before time b, otherwise %false. */ #define time_before64(a,b) time_after64(b,a) /** * time_after_eq64 - returns true if the time a is after or the same as time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is after or the same as time b, otherwise %false. */ #define time_after_eq64(a,b) \ (typecheck(__u64, a) && \ typecheck(__u64, b) && \ ((__s64)((a) - (b)) >= 0)) /** * time_before_eq64 - returns true if the time a is before or the same as time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is before or the same as time b, otherwise %false. */ #define time_before_eq64(a,b) time_after_eq64(b,a) /** * time_in_range64 - Calculate whether a is in the range of [b, c]. * @a: time to test * @b: beginning of the range * @c: end of the range * * Return: %true is time a is in the range [b, c], otherwise %false. */ #define time_in_range64(a, b, c) \ (time_after_eq64(a, b) && \ time_before_eq64(a, c)) /* * These eight macros compare jiffies[_64] and 'a' for convenience. */ /** * time_is_before_jiffies - return true if a is before jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is before jiffies, otherwise %false. */ #define time_is_before_jiffies(a) time_after(jiffies, a) /** * time_is_before_jiffies64 - return true if a is before jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is before jiffies_64, otherwise %false. */ #define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a) /** * time_is_after_jiffies - return true if a is after jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is after jiffies, otherwise %false. */ #define time_is_after_jiffies(a) time_before(jiffies, a) /** * time_is_after_jiffies64 - return true if a is after jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is after jiffies_64, otherwise %false. */ #define time_is_after_jiffies64(a) time_before64(get_jiffies_64(), a) /** * time_is_before_eq_jiffies - return true if a is before or equal to jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is before or the same as jiffies, otherwise %false. */ #define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a) /** * time_is_before_eq_jiffies64 - return true if a is before or equal to jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is before or the same jiffies_64, otherwise %false. */ #define time_is_before_eq_jiffies64(a) time_after_eq64(get_jiffies_64(), a) /** * time_is_after_eq_jiffies - return true if a is after or equal to jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is after or the same as jiffies, otherwise %false. */ #define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a) /** * time_is_after_eq_jiffies64 - return true if a is after or equal to jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is after or the same as jiffies_64, otherwise %false. */ #define time_is_after_eq_jiffies64(a) time_before_eq64(get_jiffies_64(), a) /* * Have the 32-bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. */ #define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) /* * Change timeval to jiffies, trying to avoid the * most obvious overflows.. * * And some not so obvious. * * Note that we don't want to return LONG_MAX, because * for various timeout reasons we often end up having * to wait "jiffies+1" in order to guarantee that we wait * at _least_ "jiffies" - so "jiffies+1" had better still * be positive. */ #define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1) extern unsigned long preset_lpj; /* * We want to do realistic conversions of time so we need to use the same * values the update wall clock code uses as the jiffies size. This value * is: TICK_NSEC (which is defined in timex.h). This * is a constant and is in nanoseconds. We will use scaled math * with a set of scales defined here as SEC_JIFFIE_SC, USEC_JIFFIE_SC and * NSEC_JIFFIE_SC. Note that these defines contain nothing but * constants and so are computed at compile time. SHIFT_HZ (computed in * timex.h) adjusts the scaling for different HZ values. * Scaled math??? What is that? * * Scaled math is a way to do integer math on values that would, * otherwise, either overflow, underflow, or cause undesired div * instructions to appear in the execution path. In short, we "scale" * up the operands so they take more bits (more precision, less * underflow), do the desired operation and then "scale" the result back * by the same amount. If we do the scaling by shifting we avoid the * costly mpy and the dastardly div instructions. * Suppose, for example, we want to convert from seconds to jiffies * where jiffies is defined in nanoseconds as NSEC_PER_JIFFIE. The * simple math is: jiff = (sec * NSEC_PER_SEC) / NSEC_PER_JIFFIE; We * observe that (NSEC_PER_SEC / NSEC_PER_JIFFIE) is a constant which we * might calculate at compile time, however, the result will only have * about 3-4 bits of precision (less for smaller values of HZ). * * So, we scale as follows: * jiff = (sec) * (NSEC_PER_SEC / NSEC_PER_JIFFIE); * jiff = ((sec) * ((NSEC_PER_SEC * SCALE)/ NSEC_PER_JIFFIE)) / SCALE; * Then we make SCALE a power of two so: * jiff = ((sec) * ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) >> SCALE; * Now we define: * #define SEC_CONV = ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) * jiff = (sec * SEC_CONV) >> SCALE; * * Often the math we use will expand beyond 32-bits so we tell C how to * do this and pass the 64-bit result of the mpy through the ">> SCALE" * which should take the result back to 32-bits. We want this expansion * to capture as much precision as possible. At the same time we don't * want to overflow so we pick the SCALE to avoid this. In this file, * that means using a different scale for each range of HZ values (as * defined in timex.h). * * For those who want to know, gcc will give a 64-bit result from a "*" * operator if the result is a long long AND at least one of the * operands is cast to long long (usually just prior to the "*" so as * not to confuse it into thinking it really has a 64-bit operand, * which, buy the way, it can do, but it takes more code and at least 2 * mpys). * We also need to be aware that one second in nanoseconds is only a * couple of bits away from overflowing a 32-bit word, so we MUST use * 64-bits to get the full range time in nanoseconds. */ /* * Here are the scales we will use. One for seconds, nanoseconds and * microseconds. * * Within the limits of cpp we do a rough cut at the SEC_JIFFIE_SC and * check if the sign bit is set. If not, we bump the shift count by 1. * (Gets an extra bit of precision where we can use it.) * We know it is set for HZ = 1024 and HZ = 100 not for 1000. * Haven't tested others. * Limits of cpp (for #if expressions) only long (no long long), but * then we only need the most signicant bit. */ #define SEC_JIFFIE_SC (31 - SHIFT_HZ) #if !((((NSEC_PER_SEC << 2) / TICK_NSEC) << (SEC_JIFFIE_SC - 2)) & 0x80000000) #undef SEC_JIFFIE_SC #define SEC_JIFFIE_SC (32 - SHIFT_HZ) #endif #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29) #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) /* * The maximum jiffy value is (MAX_INT >> 1). Here we translate that * into seconds. The 64-bit case will overflow if we are not careful, * so use the messy SH_DIV macro to do it. Still all constants. */ #if BITS_PER_LONG < 64 # define MAX_SEC_IN_JIFFIES \ (long)((u64)((u64)MAX_JIFFY_OFFSET * TICK_NSEC) / NSEC_PER_SEC) #else /* take care of overflow on 64-bit machines */ # define MAX_SEC_IN_JIFFIES \ (SH_DIV((MAX_JIFFY_OFFSET >> SEC_JIFFIE_SC) * TICK_NSEC, NSEC_PER_SEC, 1) - 1) #endif /* * Convert various time units to each other: */ extern unsigned int jiffies_to_msecs(const unsigned long j); extern unsigned int jiffies_to_usecs(const unsigned long j); /** * jiffies_to_nsecs - Convert jiffies to nanoseconds * @j: jiffies value * * Return: nanoseconds value */ static inline u64 jiffies_to_nsecs(const unsigned long j) { return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC; } extern u64 jiffies64_to_nsecs(u64 j); extern u64 jiffies64_to_msecs(u64 j); extern unsigned long __msecs_to_jiffies(const unsigned int m); #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) /* * HZ is equal to or smaller than 1000, and 1000 is a nice round * multiple of HZ, divide with the factor between them, but round * upwards: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); } #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) /* * HZ is larger than 1000, and HZ is a nice round multiple of 1000 - * simply multiply with the factor between them. * * But first make sure the multiplication result cannot overflow: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return m * (HZ / MSEC_PER_SEC); } #else /* * Generic case - multiply, round and divide. But first check that if * we are doing a net multiplication, that we wouldn't overflow: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32; } #endif /** * msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. * for the details see _msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code. __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The HZ range specific helpers _msecs_to_jiffies() are called both * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. * * Return: jiffies value */ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) { if (__builtin_constant_p(m)) { if ((int)m < 0) return MAX_JIFFY_OFFSET; return _msecs_to_jiffies(m); } else { return __msecs_to_jiffies(m); } } /** * secs_to_jiffies: - convert seconds to jiffies * @_secs: time in seconds * * Conversion is done by simple multiplication with HZ * * secs_to_jiffies() is defined as a macro rather than a static inline * function so it can be used in static initializers. * * Return: jiffies value */ #define secs_to_jiffies(_secs) (unsigned long)((_secs) * HZ) extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) static inline unsigned long _usecs_to_jiffies(const unsigned int u) { return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); } #else static inline unsigned long _usecs_to_jiffies(const unsigned int u) { return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) >> USEC_TO_HZ_SHR32; } #endif /** * usecs_to_jiffies: - convert microseconds to jiffies * @u: time in microseconds * * conversion is done as follows: * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows as for msecs_to_jiffies. * * usecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code. __usecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The HZ range specific helpers _usecs_to_jiffies() are called both * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. * * Return: jiffies value */ static __always_inline unsigned long usecs_to_jiffies(const unsigned int u) { if (__builtin_constant_p(u)) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return _usecs_to_jiffies(u); } else { return __usecs_to_jiffies(u); } } extern unsigned long timespec64_to_jiffies(const struct timespec64 *value); extern void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value); extern clock_t jiffies_to_clock_t(unsigned long x); static inline clock_t jiffies_delta_to_clock_t(long delta) { return jiffies_to_clock_t(max(0L, delta)); } static inline unsigned int jiffies_delta_to_msecs(long delta) { return jiffies_to_msecs(max(0L, delta)); } extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); extern u64 nsecs_to_jiffies64(u64 n); extern unsigned long nsecs_to_jiffies(u64 n); #define TIMESTAMP_SIZE 30 #endif |
3 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/sfp.h> #include "netlink.h" #include "common.h" struct eeprom_req_info { struct ethnl_req_info base; u32 offset; u32 length; u8 page; u8 bank; u8 i2c_address; }; struct eeprom_reply_data { struct ethnl_reply_data base; u32 length; u8 *data; }; #define MODULE_EEPROM_REQINFO(__req_base) \ container_of(__req_base, struct eeprom_req_info, base) #define MODULE_EEPROM_REPDATA(__reply_base) \ container_of(__reply_base, struct eeprom_reply_data, base) static int fallback_set_params(struct eeprom_req_info *request, struct ethtool_modinfo *modinfo, struct ethtool_eeprom *eeprom) { u32 offset = request->offset; u32 length = request->length; if (request->page) offset = request->page * ETH_MODULE_EEPROM_PAGE_LEN + offset; if (modinfo->type == ETH_MODULE_SFF_8472 && request->i2c_address == 0x51) offset += ETH_MODULE_EEPROM_PAGE_LEN * 2; if (offset >= modinfo->eeprom_len) return -EINVAL; eeprom->cmd = ETHTOOL_GMODULEEEPROM; eeprom->len = length; eeprom->offset = offset; return 0; } static int eeprom_fallback(struct eeprom_req_info *request, struct eeprom_reply_data *reply) { struct net_device *dev = reply->base.dev; struct ethtool_modinfo modinfo = {0}; struct ethtool_eeprom eeprom = {0}; u8 *data; int err; modinfo.cmd = ETHTOOL_GMODULEINFO; err = ethtool_get_module_info_call(dev, &modinfo); if (err < 0) return err; err = fallback_set_params(request, &modinfo, &eeprom); if (err < 0) return err; data = kmalloc(eeprom.len, GFP_KERNEL); if (!data) return -ENOMEM; err = ethtool_get_module_eeprom_call(dev, &eeprom, data); if (err < 0) goto err_out; reply->data = data; reply->length = eeprom.len; return 0; err_out: kfree(data); return err; } static int get_module_eeprom_by_page(struct net_device *dev, struct ethtool_module_eeprom *page_data, struct netlink_ext_ack *extack) { const struct ethtool_ops *ops = dev->ethtool_ops; if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Module firmware flashing is in progress"); return -EBUSY; } if (dev->sfp_bus) return sfp_get_module_eeprom_by_page(dev->sfp_bus, page_data, extack); if (ops->get_module_eeprom_by_page) return ops->get_module_eeprom_by_page(dev, page_data, extack); return -EOPNOTSUPP; } static int eeprom_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); struct ethtool_module_eeprom page_data = {0}; struct net_device *dev = reply_base->dev; int ret; page_data.offset = request->offset; page_data.length = request->length; page_data.i2c_address = request->i2c_address; page_data.page = request->page; page_data.bank = request->bank; page_data.data = kmalloc(page_data.length, GFP_KERNEL); if (!page_data.data) return -ENOMEM; ret = ethnl_ops_begin(dev); if (ret) goto err_free; ret = get_module_eeprom_by_page(dev, &page_data, info->extack); if (ret < 0) goto err_ops; reply->length = ret; reply->data = page_data.data; ethnl_ops_complete(dev); return 0; err_ops: ethnl_ops_complete(dev); err_free: kfree(page_data.data); if (ret == -EOPNOTSUPP) return eeprom_fallback(request, reply); return ret; } static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, struct netlink_ext_ack *extack) { struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_info); if (!tb[ETHTOOL_A_MODULE_EEPROM_OFFSET] || !tb[ETHTOOL_A_MODULE_EEPROM_LENGTH] || !tb[ETHTOOL_A_MODULE_EEPROM_PAGE] || !tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]) return -EINVAL; request->i2c_address = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]); request->offset = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_OFFSET]); request->length = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_LENGTH]); /* The following set of conditions limit the API to only dump 1/2 * EEPROM page without crossing low page boundary located at offset 128. * This means user may only request dumps of length limited to 128 from * either low 128 bytes or high 128 bytes. * For pages higher than 0 only high 128 bytes are accessible. */ request->page = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_PAGE]); if (request->page && request->offset < ETH_MODULE_EEPROM_PAGE_LEN) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_PAGE], "reading from lower half page is allowed for page 0 only"); return -EINVAL; } if (request->offset < ETH_MODULE_EEPROM_PAGE_LEN && request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], "reading cross half page boundary is illegal"); return -EINVAL; } else if (request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN * 2) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], "reading cross page boundary is illegal"); return -EINVAL; } if (tb[ETHTOOL_A_MODULE_EEPROM_BANK]) request->bank = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_BANK]); return 0; } static int eeprom_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); return nla_total_size(sizeof(u8) * request->length); /* _EEPROM_DATA */ } static int eeprom_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); return nla_put(skb, ETHTOOL_A_MODULE_EEPROM_DATA, reply->length, reply->data); } static void eeprom_cleanup_data(struct ethnl_reply_data *reply_base) { struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); kfree(reply->data); } const struct ethnl_request_ops ethnl_module_eeprom_request_ops = { .request_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, .reply_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, .hdr_attr = ETHTOOL_A_MODULE_EEPROM_HEADER, .req_info_size = sizeof(struct eeprom_req_info), .reply_data_size = sizeof(struct eeprom_reply_data), .parse_request = eeprom_parse_request, .prepare_data = eeprom_prepare_data, .reply_size = eeprom_reply_size, .fill_reply = eeprom_fill_reply, .cleanup_data = eeprom_cleanup_data, }; const struct nla_policy ethnl_module_eeprom_get_policy[] = { [ETHTOOL_A_MODULE_EEPROM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_MODULE_EEPROM_OFFSET] = NLA_POLICY_MAX(NLA_U32, ETH_MODULE_EEPROM_PAGE_LEN * 2 - 1), [ETHTOOL_A_MODULE_EEPROM_LENGTH] = NLA_POLICY_RANGE(NLA_U32, 1, ETH_MODULE_EEPROM_PAGE_LEN), [ETHTOOL_A_MODULE_EEPROM_PAGE] = { .type = NLA_U8 }, [ETHTOOL_A_MODULE_EEPROM_BANK] = { .type = NLA_U8 }, [ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS] = NLA_POLICY_RANGE(NLA_U8, 0, ETH_MODULE_MAX_I2C_ADDRESS), }; |
7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 | // SPDX-License-Identifier: GPL-2.0-only /* * xt_conntrack - Netfilter module to match connection tracking * information. (Superset of Rusty's minimalistic state match.) * * (C) 2001 Marc Boucher (marc@mbsi.ca). * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <net/ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_conntrack.h> #include <net/netfilter/nf_conntrack.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: connection tracking state match"); MODULE_ALIAS("ipt_conntrack"); MODULE_ALIAS("ip6t_conntrack"); static bool conntrack_addrcmp(const union nf_inet_addr *kaddr, const union nf_inet_addr *uaddr, const union nf_inet_addr *umask, unsigned int l3proto) { if (l3proto == NFPROTO_IPV4) return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; else if (l3proto == NFPROTO_IPV6) return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, &uaddr->in6) == 0; else return false; } static inline bool conntrack_mt_origsrc(const struct nf_conn *ct, const struct xt_conntrack_mtinfo2 *info, u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, &info->origsrc_addr, &info->origsrc_mask, family); } static inline bool conntrack_mt_origdst(const struct nf_conn *ct, const struct xt_conntrack_mtinfo2 *info, u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3, &info->origdst_addr, &info->origdst_mask, family); } static inline bool conntrack_mt_replsrc(const struct nf_conn *ct, const struct xt_conntrack_mtinfo2 *info, u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3, &info->replsrc_addr, &info->replsrc_mask, family); } static inline bool conntrack_mt_repldst(const struct nf_conn *ct, const struct xt_conntrack_mtinfo2 *info, u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3, &info->repldst_addr, &info->repldst_mask, family); } static inline bool ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info, const struct nf_conn *ct) { const struct nf_conntrack_tuple *tuple; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; if ((info->match_flags & XT_CONNTRACK_PROTO) && (nf_ct_protonum(ct) == info->l4proto) ^ !(info->invert_flags & XT_CONNTRACK_PROTO)) return false; /* Shortcut to match all recognized protocols by using ->src.all. */ if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && (tuple->src.u.all == info->origsrc_port) ^ !(info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) return false; if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && (tuple->dst.u.all == info->origdst_port) ^ !(info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) return false; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && (tuple->src.u.all == info->replsrc_port) ^ !(info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) return false; if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && (tuple->dst.u.all == info->repldst_port) ^ !(info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) return false; return true; } static inline bool port_match(u16 min, u16 max, u16 port, bool invert) { return (port >= min && port <= max) ^ invert; } static inline bool ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info, const struct nf_conn *ct) { const struct nf_conntrack_tuple *tuple; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; if ((info->match_flags & XT_CONNTRACK_PROTO) && (nf_ct_protonum(ct) == info->l4proto) ^ !(info->invert_flags & XT_CONNTRACK_PROTO)) return false; /* Shortcut to match all recognized protocols by using ->src.all. */ if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && !port_match(info->origsrc_port, info->origsrc_port_high, ntohs(tuple->src.u.all), info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) return false; if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && !port_match(info->origdst_port, info->origdst_port_high, ntohs(tuple->dst.u.all), info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) return false; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && !port_match(info->replsrc_port, info->replsrc_port_high, ntohs(tuple->src.u.all), info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) return false; if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && !port_match(info->repldst_port, info->repldst_port_high, ntohs(tuple->dst.u.all), info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) return false; return true; } static bool conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 state_mask, u16 status_mask) { const struct xt_conntrack_mtinfo2 *info = par->matchinfo; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int statebit; ct = nf_ct_get(skb, &ctinfo); if (ct) statebit = XT_CONNTRACK_STATE_BIT(ctinfo); else if (ctinfo == IP_CT_UNTRACKED) statebit = XT_CONNTRACK_STATE_UNTRACKED; else statebit = XT_CONNTRACK_STATE_INVALID; if (info->match_flags & XT_CONNTRACK_STATE) { if (ct != NULL) { if (test_bit(IPS_SRC_NAT_BIT, &ct->status)) statebit |= XT_CONNTRACK_STATE_SNAT; if (test_bit(IPS_DST_NAT_BIT, &ct->status)) statebit |= XT_CONNTRACK_STATE_DNAT; } if (!!(state_mask & statebit) ^ !(info->invert_flags & XT_CONNTRACK_STATE)) return false; } if (ct == NULL) return info->match_flags & XT_CONNTRACK_STATE; if ((info->match_flags & XT_CONNTRACK_DIRECTION) && (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ^ !(info->invert_flags & XT_CONNTRACK_DIRECTION)) return false; if (info->match_flags & XT_CONNTRACK_ORIGSRC) if (conntrack_mt_origsrc(ct, info, xt_family(par)) ^ !(info->invert_flags & XT_CONNTRACK_ORIGSRC)) return false; if (info->match_flags & XT_CONNTRACK_ORIGDST) if (conntrack_mt_origdst(ct, info, xt_family(par)) ^ !(info->invert_flags & XT_CONNTRACK_ORIGDST)) return false; if (info->match_flags & XT_CONNTRACK_REPLSRC) if (conntrack_mt_replsrc(ct, info, xt_family(par)) ^ !(info->invert_flags & XT_CONNTRACK_REPLSRC)) return false; if (info->match_flags & XT_CONNTRACK_REPLDST) if (conntrack_mt_repldst(ct, info, xt_family(par)) ^ !(info->invert_flags & XT_CONNTRACK_REPLDST)) return false; if (par->match->revision != 3) { if (!ct_proto_port_check(info, ct)) return false; } else { if (!ct_proto_port_check_v3(par->matchinfo, ct)) return false; } if ((info->match_flags & XT_CONNTRACK_STATUS) && (!!(status_mask & ct->status) ^ !(info->invert_flags & XT_CONNTRACK_STATUS))) return false; if (info->match_flags & XT_CONNTRACK_EXPIRES) { unsigned long expires = nf_ct_expires(ct) / HZ; if ((expires >= info->expires_min && expires <= info->expires_max) ^ !(info->invert_flags & XT_CONNTRACK_EXPIRES)) return false; } return true; } static bool conntrack_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_conntrack_mtinfo1 *info = par->matchinfo; return conntrack_mt(skb, par, info->state_mask, info->status_mask); } static bool conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_conntrack_mtinfo2 *info = par->matchinfo; return conntrack_mt(skb, par, info->state_mask, info->status_mask); } static bool conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_conntrack_mtinfo3 *info = par->matchinfo; return conntrack_mt(skb, par, info->state_mask, info->status_mask); } static int conntrack_mt_check(const struct xt_mtchk_param *par) { int ret; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } static void conntrack_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_match conntrack_mt_reg[] __read_mostly = { { .name = "conntrack", .revision = 1, .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo1), .match = conntrack_mt_v1, .checkentry = conntrack_mt_check, .destroy = conntrack_mt_destroy, .me = THIS_MODULE, }, { .name = "conntrack", .revision = 2, .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo2), .match = conntrack_mt_v2, .checkentry = conntrack_mt_check, .destroy = conntrack_mt_destroy, .me = THIS_MODULE, }, { .name = "conntrack", .revision = 3, .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo3), .match = conntrack_mt_v3, .checkentry = conntrack_mt_check, .destroy = conntrack_mt_destroy, .me = THIS_MODULE, }, }; static int __init conntrack_mt_init(void) { return xt_register_matches(conntrack_mt_reg, ARRAY_SIZE(conntrack_mt_reg)); } static void __exit conntrack_mt_exit(void) { xt_unregister_matches(conntrack_mt_reg, ARRAY_SIZE(conntrack_mt_reg)); } module_init(conntrack_mt_init); module_exit(conntrack_mt_exit); |
3 5 5 2 3 3 3 1 3 2 2 1 3 3 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Glue Code for the AVX assembler implementation of the Cast6 Cipher * * Copyright (C) 2012 Johannes Goetzfried * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> * * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> */ #include <linux/module.h> #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/cast6.h> #include <crypto/internal/simd.h> #include "ecb_cbc_helpers.h" #define CAST6_PARALLEL_BLOCKS 8 asmlinkage void cast6_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void cast6_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void cast6_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src); static int cast6_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return cast6_setkey(&tfm->base, key, keylen); } static int ecb_encrypt(struct skcipher_request *req) { ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS); ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_enc_8way); ECB_BLOCK(1, __cast6_encrypt); ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS); ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_dec_8way); ECB_BLOCK(1, __cast6_decrypt); ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { CBC_WALK_START(req, CAST6_BLOCK_SIZE, -1); CBC_ENC_BLOCK(__cast6_encrypt); CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { CBC_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS); CBC_DEC_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_cbc_dec_8way); CBC_DEC_BLOCK(1, __cast6_decrypt); CBC_WALK_END(); } static struct skcipher_alg cast6_algs[] = { { .base.cra_name = "__ecb(cast6)", .base.cra_driver_name = "__ecb-cast6-avx", .base.cra_priority = 200, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST6_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast6_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CAST6_MIN_KEY_SIZE, .max_keysize = CAST6_MAX_KEY_SIZE, .setkey = cast6_setkey_skcipher, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base.cra_name = "__cbc(cast6)", .base.cra_driver_name = "__cbc-cast6-avx", .base.cra_priority = 200, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST6_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast6_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CAST6_MIN_KEY_SIZE, .max_keysize = CAST6_MAX_KEY_SIZE, .ivsize = CAST6_BLOCK_SIZE, .setkey = cast6_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, }, }; static struct simd_skcipher_alg *cast6_simd_algs[ARRAY_SIZE(cast6_algs)]; static int __init cast6_init(void) { const char *feature_name; if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return simd_register_skciphers_compat(cast6_algs, ARRAY_SIZE(cast6_algs), cast6_simd_algs); } static void __exit cast6_exit(void) { simd_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs), cast6_simd_algs); } module_init(cast6_init); module_exit(cast6_exit); MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("cast6"); |
1 8 17 5 2 25 368 2 2 5 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 | /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ #ifndef _IP_SET_H #define _IP_SET_H #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/stringify.h> #include <linux/vmalloc.h> #include <net/netlink.h> #include <uapi/linux/netfilter/ipset/ip_set.h> #define _IP_SET_MODULE_DESC(a, b, c) \ MODULE_DESCRIPTION(a " type of IP sets, revisions " b "-" c) #define IP_SET_MODULE_DESC(a, b, c) \ _IP_SET_MODULE_DESC(a, __stringify(b), __stringify(c)) /* Set features */ enum ip_set_feature { IPSET_TYPE_IP_FLAG = 0, IPSET_TYPE_IP = (1 << IPSET_TYPE_IP_FLAG), IPSET_TYPE_PORT_FLAG = 1, IPSET_TYPE_PORT = (1 << IPSET_TYPE_PORT_FLAG), IPSET_TYPE_MAC_FLAG = 2, IPSET_TYPE_MAC = (1 << IPSET_TYPE_MAC_FLAG), IPSET_TYPE_IP2_FLAG = 3, IPSET_TYPE_IP2 = (1 << IPSET_TYPE_IP2_FLAG), IPSET_TYPE_NAME_FLAG = 4, IPSET_TYPE_NAME = (1 << IPSET_TYPE_NAME_FLAG), IPSET_TYPE_IFACE_FLAG = 5, IPSET_TYPE_IFACE = (1 << IPSET_TYPE_IFACE_FLAG), IPSET_TYPE_MARK_FLAG = 6, IPSET_TYPE_MARK = (1 << IPSET_TYPE_MARK_FLAG), IPSET_TYPE_NOMATCH_FLAG = 7, IPSET_TYPE_NOMATCH = (1 << IPSET_TYPE_NOMATCH_FLAG), /* Strictly speaking not a feature, but a flag for dumping: * this settype must be dumped last */ IPSET_DUMP_LAST_FLAG = 8, IPSET_DUMP_LAST = (1 << IPSET_DUMP_LAST_FLAG), }; /* Set extensions */ enum ip_set_extension { IPSET_EXT_BIT_TIMEOUT = 0, IPSET_EXT_TIMEOUT = (1 << IPSET_EXT_BIT_TIMEOUT), IPSET_EXT_BIT_COUNTER = 1, IPSET_EXT_COUNTER = (1 << IPSET_EXT_BIT_COUNTER), IPSET_EXT_BIT_COMMENT = 2, IPSET_EXT_COMMENT = (1 << IPSET_EXT_BIT_COMMENT), IPSET_EXT_BIT_SKBINFO = 3, IPSET_EXT_SKBINFO = (1 << IPSET_EXT_BIT_SKBINFO), /* Mark set with an extension which needs to call destroy */ IPSET_EXT_BIT_DESTROY = 7, IPSET_EXT_DESTROY = (1 << IPSET_EXT_BIT_DESTROY), }; #define SET_WITH_TIMEOUT(s) ((s)->extensions & IPSET_EXT_TIMEOUT) #define SET_WITH_COUNTER(s) ((s)->extensions & IPSET_EXT_COUNTER) #define SET_WITH_COMMENT(s) ((s)->extensions & IPSET_EXT_COMMENT) #define SET_WITH_SKBINFO(s) ((s)->extensions & IPSET_EXT_SKBINFO) #define SET_WITH_FORCEADD(s) ((s)->flags & IPSET_CREATE_FLAG_FORCEADD) /* Extension id, in size order */ enum ip_set_ext_id { IPSET_EXT_ID_COUNTER = 0, IPSET_EXT_ID_TIMEOUT, IPSET_EXT_ID_SKBINFO, IPSET_EXT_ID_COMMENT, IPSET_EXT_ID_MAX, }; struct ip_set; /* Extension type */ struct ip_set_ext_type { /* Destroy extension private data (can be NULL) */ void (*destroy)(struct ip_set *set, void *ext); enum ip_set_extension type; enum ipset_cadt_flags flag; /* Size and minimal alignment */ u8 len; u8 align; }; extern const struct ip_set_ext_type ip_set_extensions[]; struct ip_set_counter { atomic64_t bytes; atomic64_t packets; }; struct ip_set_comment_rcu { struct rcu_head rcu; char str[]; }; struct ip_set_comment { struct ip_set_comment_rcu __rcu *c; }; struct ip_set_skbinfo { u32 skbmark; u32 skbmarkmask; u32 skbprio; u16 skbqueue; u16 __pad; }; struct ip_set_ext { struct ip_set_skbinfo skbinfo; u64 packets; u64 bytes; char *comment; u32 timeout; u8 packets_op; u8 bytes_op; bool target; }; #define ext_timeout(e, s) \ ((unsigned long *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_TIMEOUT])) #define ext_counter(e, s) \ ((struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER])) #define ext_comment(e, s) \ ((struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT])) #define ext_skbinfo(e, s) \ ((struct ip_set_skbinfo *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_SKBINFO])) typedef int (*ipset_adtfn)(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 cmdflags); /* Kernel API function options */ struct ip_set_adt_opt { u8 family; /* Actual protocol family */ u8 dim; /* Dimension of match/target */ u8 flags; /* Direction and negation flags */ u32 cmdflags; /* Command-like flags */ struct ip_set_ext ext; /* Extensions */ }; /* Set type, variant-specific part */ struct ip_set_type_variant { /* Kernelspace: test/add/del entries * returns negative error code, * zero for no match/success to add/delete * positive for matching element */ int (*kadt)(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt); /* Userspace: test/add/del entries * returns negative error code, * zero for no match/success to add/delete * positive for matching element */ int (*uadt)(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); /* Low level add/del/test functions */ ipset_adtfn adt[IPSET_ADT_MAX]; /* When adding entries and set is full, try to resize the set */ int (*resize)(struct ip_set *set, bool retried); /* Destroy the set */ void (*destroy)(struct ip_set *set); /* Flush the elements */ void (*flush)(struct ip_set *set); /* Expire entries before listing */ void (*expire)(struct ip_set *set); /* List set header data */ int (*head)(struct ip_set *set, struct sk_buff *skb); /* List elements */ int (*list)(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb); /* Keep listing private when resizing runs parallel */ void (*uref)(struct ip_set *set, struct netlink_callback *cb, bool start); /* Return true if "b" set is the same as "a" * according to the create set parameters */ bool (*same_set)(const struct ip_set *a, const struct ip_set *b); /* Cancel ongoing garbage collectors before destroying the set*/ void (*cancel_gc)(struct ip_set *set); /* Region-locking is used */ bool region_lock; }; struct ip_set_region { spinlock_t lock; /* Region lock */ size_t ext_size; /* Size of the dynamic extensions */ u32 elements; /* Number of elements vs timeout */ }; /* Max range where every element is added/deleted in one step */ #define IPSET_MAX_RANGE (1<<14) /* The max revision number supported by any set type + 1 */ #define IPSET_REVISION_MAX 9 /* The core set type structure */ struct ip_set_type { struct list_head list; /* Typename */ char name[IPSET_MAXNAMELEN]; /* Protocol version */ u8 protocol; /* Set type dimension */ u8 dimension; /* * Supported family: may be NFPROTO_UNSPEC for both * NFPROTO_IPV4/NFPROTO_IPV6. */ u8 family; /* Type revisions */ u8 revision_min, revision_max; /* Revision-specific supported (create) flags */ u8 create_flags[IPSET_REVISION_MAX+1]; /* Set features to control swapping */ u16 features; /* Create set */ int (*create)(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags); /* Attribute policies */ const struct nla_policy create_policy[IPSET_ATTR_CREATE_MAX + 1]; const struct nla_policy adt_policy[IPSET_ATTR_ADT_MAX + 1]; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; }; /* register and unregister set type */ extern int ip_set_type_register(struct ip_set_type *set_type); extern void ip_set_type_unregister(struct ip_set_type *set_type); /* A generic IP set */ struct ip_set { /* For call_cru in destroy */ struct rcu_head rcu; /* The name of the set */ char name[IPSET_MAXNAMELEN]; /* Lock protecting the set data */ spinlock_t lock; /* References to the set */ u32 ref; /* References to the set for netlink events like dump, * ref can be swapped out by ip_set_swap */ u32 ref_netlink; /* The core set type */ struct ip_set_type *type; /* The type variant doing the real job */ const struct ip_set_type_variant *variant; /* The actual INET family of the set */ u8 family; /* The type revision */ u8 revision; /* Extensions */ u8 extensions; /* Create flags */ u8 flags; /* Default timeout value, if enabled */ u32 timeout; /* Number of elements (vs timeout) */ u32 elements; /* Size of the dynamic extensions (vs timeout) */ size_t ext_size; /* Element data size */ size_t dsize; /* Offsets to extensions in elements */ size_t offset[IPSET_EXT_ID_MAX]; /* The type specific data */ void *data; }; static inline void ip_set_ext_destroy(struct ip_set *set, void *data) { /* Check that the extension is enabled for the set and * call it's destroy function for its extension part in data. */ if (SET_WITH_COMMENT(set)) { struct ip_set_comment *c = ext_comment(data, set); ip_set_extensions[IPSET_EXT_ID_COMMENT].destroy(set, c); } } int ip_set_put_flags(struct sk_buff *skb, struct ip_set *set); /* Netlink CB args */ enum { IPSET_CB_NET = 0, /* net namespace */ IPSET_CB_PROTO, /* ipset protocol */ IPSET_CB_DUMP, /* dump single set/all sets */ IPSET_CB_INDEX, /* set index */ IPSET_CB_PRIVATE, /* set private data */ IPSET_CB_ARG0, /* type specific */ }; /* register and unregister set references */ extern ip_set_id_t ip_set_get_byname(struct net *net, const char *name, struct ip_set **set); extern void ip_set_put_byindex(struct net *net, ip_set_id_t index); extern void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name); extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index); extern void ip_set_nfnl_put(struct net *net, ip_set_id_t index); /* API for iptables set match, and SET target */ extern int ip_set_add(ip_set_id_t id, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt); extern int ip_set_del(ip_set_id_t id, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt); extern int ip_set_test(ip_set_id_t id, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt); /* Utility functions */ extern void *ip_set_alloc(size_t size); extern void ip_set_free(void *members); extern int ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr); extern int ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr); extern size_t ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len, size_t align); extern int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext); extern int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, const void *e, bool active); extern bool ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags, void *data); static inline int ip_set_get_hostipaddr4(struct nlattr *nla, u32 *ipaddr) { __be32 ip; int ret = ip_set_get_ipaddr4(nla, &ip); if (ret) return ret; *ipaddr = ntohl(ip); return 0; } /* Ignore IPSET_ERR_EXIST errors if asked to do so? */ static inline bool ip_set_eexist(int ret, u32 flags) { return ret == -IPSET_ERR_EXIST && (flags & IPSET_FLAG_EXIST); } /* Match elements marked with nomatch */ static inline bool ip_set_enomatch(int ret, u32 flags, enum ipset_adt adt, struct ip_set *set) { return adt == IPSET_TEST && (set->type->features & IPSET_TYPE_NOMATCH) && ((flags >> 16) & IPSET_FLAG_NOMATCH) && (ret > 0 || ret == -ENOTEMPTY); } /* Check the NLA_F_NET_BYTEORDER flag */ static inline bool ip_set_attr_netorder(struct nlattr *tb[], int type) { return tb[type] && (tb[type]->nla_type & NLA_F_NET_BYTEORDER); } static inline bool ip_set_optattr_netorder(struct nlattr *tb[], int type) { return !tb[type] || (tb[type]->nla_type & NLA_F_NET_BYTEORDER); } /* Useful converters */ static inline u32 ip_set_get_h32(const struct nlattr *attr) { return ntohl(nla_get_be32(attr)); } static inline u16 ip_set_get_h16(const struct nlattr *attr) { return ntohs(nla_get_be16(attr)); } static inline int nla_put_ipaddr4(struct sk_buff *skb, int type, __be32 ipaddr) { struct nlattr *__nested = nla_nest_start(skb, type); int ret; if (!__nested) return -EMSGSIZE; ret = nla_put_in_addr(skb, IPSET_ATTR_IPADDR_IPV4, ipaddr); if (!ret) nla_nest_end(skb, __nested); return ret; } static inline int nla_put_ipaddr6(struct sk_buff *skb, int type, const struct in6_addr *ipaddrptr) { struct nlattr *__nested = nla_nest_start(skb, type); int ret; if (!__nested) return -EMSGSIZE; ret = nla_put_in6_addr(skb, IPSET_ATTR_IPADDR_IPV6, ipaddrptr); if (!ret) nla_nest_end(skb, __nested); return ret; } /* Get address from skbuff */ static inline __be32 ip4addr(const struct sk_buff *skb, bool src) { return src ? ip_hdr(skb)->saddr : ip_hdr(skb)->daddr; } static inline void ip4addrptr(const struct sk_buff *skb, bool src, __be32 *addr) { *addr = src ? ip_hdr(skb)->saddr : ip_hdr(skb)->daddr; } static inline void ip6addrptr(const struct sk_buff *skb, bool src, struct in6_addr *addr) { memcpy(addr, src ? &ipv6_hdr(skb)->saddr : &ipv6_hdr(skb)->daddr, sizeof(*addr)); } /* How often should the gc be run by default */ #define IPSET_GC_TIME (3 * 60) /* Timeout period depending on the timeout value of the given set */ #define IPSET_GC_PERIOD(timeout) \ ((timeout/3) ? min_t(u32, (timeout)/3, IPSET_GC_TIME) : 1) /* Entry is set with no timeout value */ #define IPSET_ELEM_PERMANENT 0 /* Set is defined with timeout support: timeout value may be 0 */ #define IPSET_NO_TIMEOUT UINT_MAX /* Max timeout value, see msecs_to_jiffies() in jiffies.h */ #define IPSET_MAX_TIMEOUT (UINT_MAX >> 1)/MSEC_PER_SEC #define ip_set_adt_opt_timeout(opt, set) \ ((opt)->ext.timeout != IPSET_NO_TIMEOUT ? (opt)->ext.timeout : (set)->timeout) static inline unsigned int ip_set_timeout_uget(struct nlattr *tb) { unsigned int timeout = ip_set_get_h32(tb); /* Normalize to fit into jiffies */ if (timeout > IPSET_MAX_TIMEOUT) timeout = IPSET_MAX_TIMEOUT; return timeout; } static inline bool ip_set_timeout_expired(const unsigned long *t) { return *t != IPSET_ELEM_PERMANENT && time_is_before_jiffies(*t); } static inline void ip_set_timeout_set(unsigned long *timeout, u32 value) { unsigned long t; if (!value) { *timeout = IPSET_ELEM_PERMANENT; return; } t = msecs_to_jiffies(value * MSEC_PER_SEC) + jiffies; if (t == IPSET_ELEM_PERMANENT) /* Bingo! :-) */ t--; *timeout = t; } void ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, const struct ip_set_ext *ext); static inline void ip_set_init_counter(struct ip_set_counter *counter, const struct ip_set_ext *ext) { if (ext->bytes != ULLONG_MAX) atomic64_set(&(counter)->bytes, (long long)(ext->bytes)); if (ext->packets != ULLONG_MAX) atomic64_set(&(counter)->packets, (long long)(ext->packets)); } static inline void ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, const struct ip_set_ext *ext) { *skbinfo = ext->skbinfo; } static inline void nf_inet_addr_mask_inplace(union nf_inet_addr *a1, const union nf_inet_addr *mask) { a1->all[0] &= mask->all[0]; a1->all[1] &= mask->all[1]; a1->all[2] &= mask->all[2]; a1->all[3] &= mask->all[3]; } #define IP_SET_INIT_KEXT(skb, opt, set) \ { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } #define IP_SET_INIT_UEXT(set) \ { .bytes = ULLONG_MAX, .packets = ULLONG_MAX, \ .timeout = (set)->timeout } #define IPSET_CONCAT(a, b) a##b #define IPSET_TOKEN(a, b) IPSET_CONCAT(a, b) #endif /*_IP_SET_H */ |
5 108 49 45 44 42 257 76 37 1 55 8 148 148 76 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 | /* * net/tipc/trace.h: TIPC tracepoints * * Copyright (c) 2018, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tipc #if !defined(_TIPC_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TIPC_TRACE_H #include <linux/tracepoint.h> #include "core.h" #include "link.h" #include "socket.h" #include "node.h" #define SKB_LMIN (100) #define SKB_LMAX (SKB_LMIN * 2) #define LIST_LMIN (SKB_LMIN * 3) #define LIST_LMAX (SKB_LMIN * 11) #define SK_LMIN (SKB_LMIN * 2) #define SK_LMAX (SKB_LMIN * 11) #define LINK_LMIN (SKB_LMIN) #define LINK_LMAX (SKB_LMIN * 16) #define NODE_LMIN (SKB_LMIN) #define NODE_LMAX (SKB_LMIN * 11) #ifndef __TIPC_TRACE_ENUM #define __TIPC_TRACE_ENUM enum { TIPC_DUMP_NONE = 0, TIPC_DUMP_TRANSMQ = 1, TIPC_DUMP_BACKLOGQ = (1 << 1), TIPC_DUMP_DEFERDQ = (1 << 2), TIPC_DUMP_INPUTQ = (1 << 3), TIPC_DUMP_WAKEUP = (1 << 4), TIPC_DUMP_SK_SNDQ = (1 << 8), TIPC_DUMP_SK_RCVQ = (1 << 9), TIPC_DUMP_SK_BKLGQ = (1 << 10), TIPC_DUMP_ALL = 0xffffu }; #endif /* Link & Node FSM states: */ #define state_sym(val) \ __print_symbolic(val, \ {(0xe), "ESTABLISHED" },\ {(0xe << 4), "ESTABLISHING" },\ {(0x1 << 8), "RESET" },\ {(0x2 << 12), "RESETTING" },\ {(0xd << 16), "PEER_RESET" },\ {(0xf << 20), "FAILINGOVER" },\ {(0xc << 24), "SYNCHING" },\ {(0xdd), "SELF_DOWN_PEER_DOWN" },\ {(0xaa), "SELF_UP_PEER_UP" },\ {(0xd1), "SELF_DOWN_PEER_LEAVING" },\ {(0xac), "SELF_UP_PEER_COMING" },\ {(0xca), "SELF_COMING_PEER_UP" },\ {(0x1d), "SELF_LEAVING_PEER_DOWN" },\ {(0xf0), "FAILINGOVER" },\ {(0xcc), "SYNCHING" }) /* Link & Node FSM events: */ #define evt_sym(val) \ __print_symbolic(val, \ {(0xec1ab1e), "ESTABLISH_EVT" },\ {(0x9eed0e), "PEER_RESET_EVT" },\ {(0xfa110e), "FAILURE_EVT" },\ {(0x10ca1d0e), "RESET_EVT" },\ {(0xfa110bee), "FAILOVER_BEGIN_EVT" },\ {(0xfa110ede), "FAILOVER_END_EVT" },\ {(0xc1ccbee), "SYNCH_BEGIN_EVT" },\ {(0xc1ccede), "SYNCH_END_EVT" },\ {(0xece), "SELF_ESTABL_CONTACT_EVT" },\ {(0x1ce), "SELF_LOST_CONTACT_EVT" },\ {(0x9ece), "PEER_ESTABL_CONTACT_EVT" },\ {(0x91ce), "PEER_LOST_CONTACT_EVT" },\ {(0xfbe), "FAILOVER_BEGIN_EVT" },\ {(0xfee), "FAILOVER_END_EVT" },\ {(0xcbe), "SYNCH_BEGIN_EVT" },\ {(0xcee), "SYNCH_END_EVT" }) /* Bearer, net device events: */ #define dev_evt_sym(val) \ __print_symbolic(val, \ {(NETDEV_CHANGE), "NETDEV_CHANGE" },\ {(NETDEV_GOING_DOWN), "NETDEV_GOING_DOWN" },\ {(NETDEV_UP), "NETDEV_UP" },\ {(NETDEV_CHANGEMTU), "NETDEV_CHANGEMTU" },\ {(NETDEV_CHANGEADDR), "NETDEV_CHANGEADDR" },\ {(NETDEV_UNREGISTER), "NETDEV_UNREGISTER" },\ {(NETDEV_CHANGENAME), "NETDEV_CHANGENAME" }) extern unsigned long sysctl_tipc_sk_filter[5] __read_mostly; int tipc_skb_dump(struct sk_buff *skb, bool more, char *buf); int tipc_list_dump(struct sk_buff_head *list, bool more, char *buf); int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf); int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf); int tipc_node_dump(struct tipc_node *n, bool more, char *buf); bool tipc_sk_filtering(struct sock *sk); DECLARE_EVENT_CLASS(tipc_skb_class, TP_PROTO(struct sk_buff *skb, bool more, const char *header), TP_ARGS(skb, more, header), TP_STRUCT__entry( __string(header, header) __dynamic_array(char, buf, (more) ? SKB_LMAX : SKB_LMIN) ), TP_fast_assign( __assign_str(header); tipc_skb_dump(skb, more, __get_str(buf)); ), TP_printk("%s\n%s", __get_str(header), __get_str(buf)) ) #define DEFINE_SKB_EVENT(name) \ DEFINE_EVENT(tipc_skb_class, name, \ TP_PROTO(struct sk_buff *skb, bool more, const char *header), \ TP_ARGS(skb, more, header)) DEFINE_SKB_EVENT(tipc_skb_dump); DEFINE_SKB_EVENT(tipc_proto_build); DEFINE_SKB_EVENT(tipc_proto_rcv); DECLARE_EVENT_CLASS(tipc_list_class, TP_PROTO(struct sk_buff_head *list, bool more, const char *header), TP_ARGS(list, more, header), TP_STRUCT__entry( __string(header, header) __dynamic_array(char, buf, (more) ? LIST_LMAX : LIST_LMIN) ), TP_fast_assign( __assign_str(header); tipc_list_dump(list, more, __get_str(buf)); ), TP_printk("%s\n%s", __get_str(header), __get_str(buf)) ); #define DEFINE_LIST_EVENT(name) \ DEFINE_EVENT(tipc_list_class, name, \ TP_PROTO(struct sk_buff_head *list, bool more, const char *header), \ TP_ARGS(list, more, header)) DEFINE_LIST_EVENT(tipc_list_dump); DECLARE_EVENT_CLASS(tipc_sk_class, TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, const char *header), TP_ARGS(sk, skb, dqueues, header), TP_STRUCT__entry( __string(header, header) __field(u32, portid) __dynamic_array(char, buf, (dqueues) ? SK_LMAX : SK_LMIN) __dynamic_array(char, skb_buf, (skb) ? SKB_LMIN : 1) ), TP_fast_assign( __assign_str(header); __entry->portid = tipc_sock_get_portid(sk); tipc_sk_dump(sk, dqueues, __get_str(buf)); if (skb) tipc_skb_dump(skb, false, __get_str(skb_buf)); else *(__get_str(skb_buf)) = '\0'; ), TP_printk("<%u> %s\n%s%s", __entry->portid, __get_str(header), __get_str(skb_buf), __get_str(buf)) ); #define DEFINE_SK_EVENT_FILTER(name) \ DEFINE_EVENT_CONDITION(tipc_sk_class, name, \ TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \ const char *header), \ TP_ARGS(sk, skb, dqueues, header), \ TP_CONDITION(tipc_sk_filtering(sk))) DEFINE_SK_EVENT_FILTER(tipc_sk_dump); DEFINE_SK_EVENT_FILTER(tipc_sk_create); DEFINE_SK_EVENT_FILTER(tipc_sk_sendmcast); DEFINE_SK_EVENT_FILTER(tipc_sk_sendmsg); DEFINE_SK_EVENT_FILTER(tipc_sk_sendstream); DEFINE_SK_EVENT_FILTER(tipc_sk_poll); DEFINE_SK_EVENT_FILTER(tipc_sk_filter_rcv); DEFINE_SK_EVENT_FILTER(tipc_sk_advance_rx); DEFINE_SK_EVENT_FILTER(tipc_sk_rej_msg); DEFINE_SK_EVENT_FILTER(tipc_sk_drop_msg); DEFINE_SK_EVENT_FILTER(tipc_sk_release); DEFINE_SK_EVENT_FILTER(tipc_sk_shutdown); #define DEFINE_SK_EVENT_FILTER_COND(name, cond) \ DEFINE_EVENT_CONDITION(tipc_sk_class, name, \ TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \ const char *header), \ TP_ARGS(sk, skb, dqueues, header), \ TP_CONDITION(tipc_sk_filtering(sk) && (cond))) DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit1, tipc_sk_overlimit1(sk, skb)); DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit2, tipc_sk_overlimit2(sk, skb)); DECLARE_EVENT_CLASS(tipc_link_class, TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), TP_ARGS(l, dqueues, header), TP_STRUCT__entry( __string(header, header) __array(char, name, TIPC_MAX_LINK_NAME) __dynamic_array(char, buf, (dqueues) ? LINK_LMAX : LINK_LMIN) ), TP_fast_assign( __assign_str(header); memcpy(__entry->name, tipc_link_name(l), TIPC_MAX_LINK_NAME); tipc_link_dump(l, dqueues, __get_str(buf)); ), TP_printk("<%s> %s\n%s", __entry->name, __get_str(header), __get_str(buf)) ); #define DEFINE_LINK_EVENT(name) \ DEFINE_EVENT(tipc_link_class, name, \ TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \ TP_ARGS(l, dqueues, header)) DEFINE_LINK_EVENT(tipc_link_dump); DEFINE_LINK_EVENT(tipc_link_conges); DEFINE_LINK_EVENT(tipc_link_timeout); DEFINE_LINK_EVENT(tipc_link_reset); #define DEFINE_LINK_EVENT_COND(name, cond) \ DEFINE_EVENT_CONDITION(tipc_link_class, name, \ TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \ TP_ARGS(l, dqueues, header), \ TP_CONDITION(cond)) DEFINE_LINK_EVENT_COND(tipc_link_too_silent, tipc_link_too_silent(l)); DECLARE_EVENT_CLASS(tipc_link_transmq_class, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), TP_STRUCT__entry( __array(char, name, TIPC_MAX_LINK_NAME) __field(u16, from) __field(u16, to) __field(u32, len) __field(u16, fseqno) __field(u16, lseqno) ), TP_fast_assign( memcpy(__entry->name, tipc_link_name(r), TIPC_MAX_LINK_NAME); __entry->from = f; __entry->to = t; __entry->len = skb_queue_len(tq); __entry->fseqno = __entry->len ? msg_seqno(buf_msg(skb_peek(tq))) : 0; __entry->lseqno = __entry->len ? msg_seqno(buf_msg(skb_peek_tail(tq))) : 0; ), TP_printk("<%s> retrans req: [%u-%u] transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); DEFINE_EVENT_CONDITION(tipc_link_transmq_class, tipc_link_retrans, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), TP_CONDITION(less_eq(f, t)) ); DEFINE_EVENT_PRINT(tipc_link_transmq_class, tipc_link_bc_ack, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), TP_printk("<%s> acked: %u gap: %u transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); DECLARE_EVENT_CLASS(tipc_node_class, TP_PROTO(struct tipc_node *n, bool more, const char *header), TP_ARGS(n, more, header), TP_STRUCT__entry( __string(header, header) __field(u32, addr) __dynamic_array(char, buf, (more) ? NODE_LMAX : NODE_LMIN) ), TP_fast_assign( __assign_str(header); __entry->addr = tipc_node_get_addr(n); tipc_node_dump(n, more, __get_str(buf)); ), TP_printk("<%x> %s\n%s", __entry->addr, __get_str(header), __get_str(buf)) ); #define DEFINE_NODE_EVENT(name) \ DEFINE_EVENT(tipc_node_class, name, \ TP_PROTO(struct tipc_node *n, bool more, const char *header), \ TP_ARGS(n, more, header)) DEFINE_NODE_EVENT(tipc_node_dump); DEFINE_NODE_EVENT(tipc_node_create); DEFINE_NODE_EVENT(tipc_node_delete); DEFINE_NODE_EVENT(tipc_node_lost_contact); DEFINE_NODE_EVENT(tipc_node_timeout); DEFINE_NODE_EVENT(tipc_node_link_up); DEFINE_NODE_EVENT(tipc_node_link_down); DEFINE_NODE_EVENT(tipc_node_reset_links); DEFINE_NODE_EVENT(tipc_node_check_state); DECLARE_EVENT_CLASS(tipc_fsm_class, TP_PROTO(const char *name, u32 os, u32 ns, int evt), TP_ARGS(name, os, ns, evt), TP_STRUCT__entry( __string(name, name) __field(u32, os) __field(u32, ns) __field(u32, evt) ), TP_fast_assign( __assign_str(name); __entry->os = os; __entry->ns = ns; __entry->evt = evt; ), TP_printk("<%s> %s--(%s)->%s\n", __get_str(name), state_sym(__entry->os), evt_sym(__entry->evt), state_sym(__entry->ns)) ); #define DEFINE_FSM_EVENT(fsm_name) \ DEFINE_EVENT(tipc_fsm_class, fsm_name, \ TP_PROTO(const char *name, u32 os, u32 ns, int evt), \ TP_ARGS(name, os, ns, evt)) DEFINE_FSM_EVENT(tipc_link_fsm); DEFINE_FSM_EVENT(tipc_node_fsm); TRACE_EVENT(tipc_l2_device_event, TP_PROTO(struct net_device *dev, struct tipc_bearer *b, unsigned long evt), TP_ARGS(dev, b, evt), TP_STRUCT__entry( __string(dev_name, dev->name) __string(b_name, b->name) __field(unsigned long, evt) __field(u8, b_up) __field(u8, carrier) __field(u8, oper) ), TP_fast_assign( __assign_str(dev_name); __assign_str(b_name); __entry->evt = evt; __entry->b_up = test_bit(0, &b->up); __entry->carrier = netif_carrier_ok(dev); __entry->oper = netif_oper_up(dev); ), TP_printk("%s on: <%s>/<%s> oper: %s carrier: %s bearer: %s\n", dev_evt_sym(__entry->evt), __get_str(dev_name), __get_str(b_name), (__entry->oper) ? "up" : "down", (__entry->carrier) ? "ok" : "notok", (__entry->b_up) ? "up" : "down") ); #endif /* _TIPC_TRACE_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> |
35292 10 35291 35289 19524 812 811 811 31357 1411 1413 1414 8200 31356 4438 4405 4907 29284 29109 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor contexts used to associate "labels" to objects. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #ifndef __AA_CONTEXT_H #define __AA_CONTEXT_H #include <linux/cred.h> #include <linux/slab.h> #include <linux/sched.h> #include "label.h" #include "policy_ns.h" #include "task.h" static inline struct aa_label *cred_label(const struct cred *cred) { struct aa_label **blob = cred->security + apparmor_blob_sizes.lbs_cred; AA_BUG(!blob); return *blob; } static inline void set_cred_label(const struct cred *cred, struct aa_label *label) { struct aa_label **blob = cred->security + apparmor_blob_sizes.lbs_cred; AA_BUG(!blob); *blob = label; } /** * aa_cred_raw_label - obtain cred's label * @cred: cred to obtain label from (NOT NULL) * * Returns: confining label * * does NOT increment reference count */ static inline struct aa_label *aa_cred_raw_label(const struct cred *cred) { struct aa_label *label = cred_label(cred); AA_BUG(!label); return label; } /** * aa_get_newest_cred_label - obtain the newest label on a cred * @cred: cred to obtain label from (NOT NULL) * * Returns: newest version of confining label */ static inline struct aa_label *aa_get_newest_cred_label(const struct cred *cred) { return aa_get_newest_label(aa_cred_raw_label(cred)); } static inline struct aa_label *aa_get_newest_cred_label_condref(const struct cred *cred, bool *needput) { struct aa_label *l = aa_cred_raw_label(cred); if (unlikely(label_is_stale(l))) { *needput = true; return aa_get_newest_label(l); } *needput = false; return l; } static inline void aa_put_label_condref(struct aa_label *l, bool needput) { if (unlikely(needput)) aa_put_label(l); } /** * aa_current_raw_label - find the current tasks confining label * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * This fn will not update the tasks cred to the most up to date version * of the label so it is safe to call when inside of locks. */ static inline struct aa_label *aa_current_raw_label(void) { return aa_cred_raw_label(current_cred()); } /** * aa_get_current_label - get the newest version of the current tasks label * * Returns: newest version of confining label (NOT NULL) * * This fn will not update the tasks cred, so it is safe inside of locks * * The returned reference must be put with aa_put_label() */ static inline struct aa_label *aa_get_current_label(void) { struct aa_label *l = aa_current_raw_label(); if (label_is_stale(l)) return aa_get_newest_label(l); return aa_get_label(l); } #define __end_current_label_crit_section(X) end_current_label_crit_section(X) /** * end_label_crit_section - put a reference found with begin_current_label.. * @label: label reference to put * * Should only be used with a reference obtained with * begin_current_label_crit_section and never used in situations where the * task cred may be updated */ static inline void end_current_label_crit_section(struct aa_label *label) { if (label != aa_current_raw_label()) aa_put_label(label); } /** * __begin_current_label_crit_section - current's confining label * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * safe to call inside locks * * The returned reference must be put with __end_current_label_crit_section() * This must NOT be used if the task cred could be updated within the * critical section between __begin_current_label_crit_section() .. * __end_current_label_crit_section() */ static inline struct aa_label *__begin_current_label_crit_section(void) { struct aa_label *label = aa_current_raw_label(); if (label_is_stale(label)) label = aa_get_newest_label(label); return label; } /** * begin_current_label_crit_section - current's confining label and update it * * Returns: up to date confining label or the ns unconfined label (NOT NULL) * * Not safe to call inside locks * * The returned reference must be put with end_current_label_crit_section() * This must NOT be used if the task cred could be updated within the * critical section between begin_current_label_crit_section() .. * end_current_label_crit_section() */ static inline struct aa_label *begin_current_label_crit_section(void) { struct aa_label *label = aa_current_raw_label(); might_sleep(); if (label_is_stale(label)) { label = aa_get_newest_label(label); if (aa_replace_current_label(label) == 0) /* task cred will keep the reference */ aa_put_label(label); } return label; } static inline struct aa_ns *aa_get_current_ns(void) { struct aa_label *label; struct aa_ns *ns; label = __begin_current_label_crit_section(); ns = aa_get_ns(labels_ns(label)); __end_current_label_crit_section(label); return ns; } #endif /* __AA_CONTEXT_H */ |
2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 | /* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ /* * aoenet.c * Ethernet portion of AoE driver */ #include <linux/gfp.h> #include <linux/hdreg.h> #include <linux/blkdev.h> #include <linux/netdevice.h> #include <linux/moduleparam.h> #include <net/net_namespace.h> #include <linux/unaligned.h> #include "aoe.h" #define NECODES 5 static char *aoe_errlist[] = { "no such error", "unrecognized command code", "bad argument parameter", "device unavailable", "config string present", "unsupported version" }; enum { IFLISTSZ = 1024, }; static char aoe_iflist[IFLISTSZ]; module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600); MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=dev1[,dev2...]"); static wait_queue_head_t txwq; static struct ktstate kts; #ifndef MODULE static int __init aoe_iflist_setup(char *str) { strscpy(aoe_iflist, str, IFLISTSZ); return 1; } __setup("aoe_iflist=", aoe_iflist_setup); #endif static spinlock_t txlock; static struct sk_buff_head skbtxq; /* enters with txlock held */ static int tx(int id) __must_hold(&txlock) { struct sk_buff *skb; struct net_device *ifp; while ((skb = skb_dequeue(&skbtxq))) { spin_unlock_irq(&txlock); ifp = skb->dev; if (dev_queue_xmit(skb) == NET_XMIT_DROP && net_ratelimit()) pr_warn("aoe: packet could not be sent on %s. %s\n", ifp ? ifp->name : "netif", "consider increasing tx_queue_len"); dev_put(ifp); spin_lock_irq(&txlock); } return 0; } int is_aoe_netif(struct net_device *ifp) { register char *p, *q; register int len; if (aoe_iflist[0] == '\0') return 1; p = aoe_iflist + strspn(aoe_iflist, WHITESPACE); for (; *p; p = q + strspn(q, WHITESPACE)) { q = p + strcspn(p, WHITESPACE); if (q != p) len = q - p; else len = strlen(p); /* last token in aoe_iflist */ if (strlen(ifp->name) == len && !strncmp(ifp->name, p, len)) return 1; if (q == p) break; } return 0; } int set_aoe_iflist(const char __user *user_str, size_t size) { if (size >= IFLISTSZ) return -EINVAL; if (copy_from_user(aoe_iflist, user_str, size)) { printk(KERN_INFO "aoe: copy from user failed\n"); return -EFAULT; } aoe_iflist[size] = 0x00; return 0; } void aoenet_xmit(struct sk_buff_head *queue) { struct sk_buff *skb, *tmp; ulong flags; skb_queue_walk_safe(queue, skb, tmp) { __skb_unlink(skb, queue); spin_lock_irqsave(&txlock, flags); skb_queue_tail(&skbtxq, skb); spin_unlock_irqrestore(&txlock, flags); wake_up(&txwq); } } /* * (1) len doesn't include the header by default. I want this. */ static int aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) { struct aoe_hdr *h; struct aoe_atahdr *ah; u32 n; int sn; if (dev_net(ifp) != &init_net) goto exit; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) return 0; if (!is_aoe_netif(ifp)) goto exit; skb_push(skb, ETH_HLEN); /* (1) */ sn = sizeof(*h) + sizeof(*ah); if (skb->len >= sn) { sn -= skb_headlen(skb); if (sn > 0 && !__pskb_pull_tail(skb, sn)) goto exit; } h = (struct aoe_hdr *) skb->data; n = get_unaligned_be32(&h->tag); if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31)) goto exit; if (h->verfl & AOEFL_ERR) { n = h->err; if (n > NECODES) n = 0; if (net_ratelimit()) printk(KERN_ERR "%s%d.%d@%s; ecode=%d '%s'\n", "aoe: error packet from ", get_unaligned_be16(&h->major), h->minor, skb->dev->name, h->err, aoe_errlist[n]); goto exit; } switch (h->cmd) { case AOECMD_ATA: /* ata_rsp may keep skb for later processing or give it back */ skb = aoecmd_ata_rsp(skb); break; case AOECMD_CFG: aoecmd_cfg_rsp(skb); break; default: if (h->cmd >= AOECMD_VEND_MIN) break; /* don't complain about vendor commands */ pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd); break; } if (!skb) return 0; exit: dev_kfree_skb(skb); return 0; } static struct packet_type aoe_pt __read_mostly = { .type = __constant_htons(ETH_P_AOE), .func = aoenet_rcv, }; int __init aoenet_init(void) { skb_queue_head_init(&skbtxq); init_waitqueue_head(&txwq); spin_lock_init(&txlock); kts.lock = &txlock; kts.fn = tx; kts.waitq = &txwq; kts.id = 0; snprintf(kts.name, sizeof(kts.name), "aoe_tx%d", kts.id); if (aoe_ktstart(&kts)) return -EAGAIN; dev_add_pack(&aoe_pt); return 0; } void aoenet_exit(void) { aoe_ktstop(&kts); skb_queue_purge(&skbtxq); dev_remove_pack(&aoe_pt); } |
3 3 3 3 3 3 3 3 3 3 59 59 57 59 58 59 59 58 59 3 3 3 59 59 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 | // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic TIME_WAIT sockets functions * * From code orinally in TCP */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/module.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash * @tw: timewait socket * @hashinfo: hashinfo pointer * * unhash a timewait socket from bind hash, if hashed. * bind hash lock must be held by caller. * Returns 1 if caller should call inet_twsk_put() after lock release. */ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { struct inet_bind2_bucket *tb2 = tw->tw_tb2; struct inet_bind_bucket *tb = tw->tw_tb; if (!tb) return; __sk_del_bind_node((struct sock *)tw); tw->tw_tb = NULL; tw->tw_tb2 = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); __sock_put((struct sock *)tw); } /* Must be called with locally disabled BHs. */ static void inet_twsk_kill(struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); struct inet_bind_hashbucket *bhead, *bhead2; spin_lock(lock); sk_nulls_del_node_init_rcu((struct sock *)tw); spin_unlock(lock); /* Disassociate with bind bucket. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, hashinfo->bhash_size)]; bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw, twsk_net(tw), tw->tw_num); spin_lock(&bhead->lock); spin_lock(&bhead2->lock); inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); refcount_dec(&tw->tw_dr->tw_refcount); inet_twsk_put(tw); } void inet_twsk_free(struct inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; twsk_destructor((struct sock *)tw); kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } void inet_twsk_put(struct inet_timewait_sock *tw) { if (refcount_dec_and_test(&tw->tw_refcnt)) inet_twsk_free(tw); } EXPORT_SYMBOL_GPL(inet_twsk_put); static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, struct hlist_nulls_head *list) { hlist_nulls_add_head_rcu(&tw->tw_node, list); } static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) { __inet_twsk_schedule(tw, timeo, false); } /* * Enter the time wait state. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. * * The caller must not access @tw anymore after this function returns. */ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo, int timeo) { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); struct inet_bind_hashbucket *bhead, *bhead2; /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num); local_bh_disable(); spin_lock(&bhead->lock); spin_lock(&bhead2->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); tw->tw_tb2 = icsk->icsk_bind2_hash; WARN_ON(!icsk->icsk_bind2_hash); sk_add_bind_node((struct sock *)tw, &tw->tw_tb2->owners); spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); spin_lock(lock); /* Step 2: Hash TW into tcp ehash chain */ inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); /* Ensure above writes are committed into memory before updating the * refcount. * Provides ordering vs later refcount_inc(). */ smp_wmb(); /* tw_refcnt is set to 3 because we have : * - one reference for bhash chain. * - one reference for ehash chain. * - one reference for timer. * Also note that after this point, we lost our implicit reference * so we are not allowed to use tw anymore. */ refcount_set(&tw->tw_refcnt, 3); inet_twsk_schedule(tw, timeo); spin_unlock(lock); local_bh_enable(); } EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule); static void tw_timer_handler(struct timer_list *t) { struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer); inet_twsk_kill(tw); } struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, struct inet_timewait_death_row *dr, const int state) { struct inet_timewait_sock *tw; if (refcount_read(&dr->tw_refcount) - 1 >= READ_ONCE(dr->sysctl_max_tw_buckets)) return NULL; tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, GFP_ATOMIC); if (tw) { const struct inet_sock *inet = inet_sk(sk); tw->tw_dr = dr; /* Give us an identity. */ tw->tw_daddr = inet->inet_daddr; tw->tw_rcv_saddr = inet->inet_rcv_saddr; tw->tw_bound_dev_if = sk->sk_bound_dev_if; tw->tw_tos = inet->tos; tw->tw_num = inet->inet_num; tw->tw_state = TCP_TIME_WAIT; tw->tw_substate = state; tw->tw_sport = inet->inet_sport; tw->tw_dport = inet->inet_dport; tw->tw_family = sk->sk_family; tw->tw_reuse = sk->sk_reuse; tw->tw_reuseport = sk->sk_reuseport; tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); timer_setup(&tw->tw_timer, tw_timer_handler, 0); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this * timewait socket. */ refcount_set(&tw->tw_refcnt, 0); __module_get(tw->tw_prot->owner); } return tw; } EXPORT_SYMBOL_GPL(inet_twsk_alloc); /* These are always called from BH context. See callers in * tcp_input.c to verify this. */ /* This is for handling early-kills of TIME_WAIT sockets. * Warning : consume reference. * Caller should not access tw anymore. */ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); /* inet_twsk_purge() walks over all sockets, including tw ones, * and removes them via inet_twsk_deschedule_put() after a * refcount_inc_not_zero(). * * inet_twsk_hashdance_schedule() must (re)init the refcount before * arming the timer, i.e. inet_twsk_purge can obtain a reference to * a twsk that did not yet schedule the timer. * * The ehash lock synchronizes these two: * After acquiring the lock, the timer is always scheduled (else * timer_shutdown returns false), because hashdance_schedule releases * the ehash lock only after completing the timer initialization. * * Without grabbing the ehash lock, we get: * 1) cpu x sets twsk refcount to 3 * 2) cpu y bumps refcount to 4 * 3) cpu y calls inet_twsk_deschedule_put() and shuts timer down * 4) cpu x tries to start timer, but mod_timer is a noop post-shutdown * -> timer refcount is never decremented. */ spin_lock(lock); /* Makes sure hashdance_schedule() has completed */ spin_unlock(lock); if (timer_shutdown_sync(&tw->tw_timer)) inet_twsk_kill(tw); inet_twsk_put(tw); } EXPORT_SYMBOL(inet_twsk_deschedule_put); void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) { /* timeout := RTO * 3.5 * * 3.5 = 1+2+0.5 to wait for two retransmits. * * RATIONALE: if FIN arrived and we entered TIME-WAIT state, * our ACK acking that FIN can be lost. If N subsequent retransmitted * FINs (or previous seqments) are lost (probability of such event * is p^(N+1), where p is probability to lose single packet and * time to detect the loss is about RTO*(2^N - 1) with exponential * backoff). Normal timewait length is calculated so, that we * waited at least for one retransmitted FIN (maximal RTO is 120sec). * [ BTW Linux. following BSD, violates this requirement waiting * only for 60sec, we should wait at least for 240 secs. * Well, 240 consumes too much of resources 8) * ] * This interval is not reduced to catch old duplicate and * responces to our wandering segments living for two MSLs. * However, if we use PAWS to detect * old duplicates, we can reduce the interval to bounds required * by RTO, rather than MSL. So, if peer understands PAWS, we * kill tw bucket after 3.5*RTO (it is important that this number * is greater than TS tick!) and detect old duplicates with help * of PAWS. */ if (!rearm) { bool kill = timeo <= 4*HZ; __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED : LINUX_MIB_TIMEWAITED); BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); refcount_inc(&tw->tw_dr->tw_refcount); } else { mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ void inet_twsk_purge(struct inet_hashinfo *hashinfo) { struct inet_ehash_bucket *head = &hashinfo->ehash[0]; unsigned int ehash_mask = hashinfo->ehash_mask; struct hlist_nulls_node *node; unsigned int slot; struct sock *sk; for (slot = 0; slot <= ehash_mask; slot++, head++) { if (hlist_nulls_empty(&head->chain)) continue; restart_rcu: cond_resched(); rcu_read_lock(); restart: sk_nulls_for_each_rcu(sk, node, &head->chain) { int state = inet_sk_state_load(sk); if ((1 << state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV)) continue; if (refcount_read(&sock_net(sk)->ns.count)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; if (refcount_read(&sock_net(sk)->ns.count)) { sock_gen_put(sk); goto restart; } rcu_read_unlock(); local_bh_disable(); if (state == TCP_TIME_WAIT) { inet_twsk_deschedule_put(inet_twsk(sk)); } else { struct request_sock *req = inet_reqsk(sk); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); } local_bh_enable(); goto restart_rcu; } /* If the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (get_nulls_value(node) != slot) goto restart; rcu_read_unlock(); } } EXPORT_SYMBOL_GPL(inet_twsk_purge); |
2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/rose.h> static void rose_heartbeat_expiry(struct timer_list *t); static void rose_timer_expiry(struct timer_list *); static void rose_idletimer_expiry(struct timer_list *); void rose_start_heartbeat(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); sk->sk_timer.function = rose_heartbeat_expiry; sk->sk_timer.expires = jiffies + 5 * HZ; sk_reset_timer(sk, &sk->sk_timer, sk->sk_timer.expires); } void rose_start_t1timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t1; sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_t2timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t2; sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_t3timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t3; sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_hbtimer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->hb; sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_idletimer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); sk_stop_timer(sk, &rose->idletimer); if (rose->idle > 0) { rose->idletimer.function = rose_idletimer_expiry; rose->idletimer.expires = jiffies + rose->idle; sk_reset_timer(sk, &rose->idletimer, rose->idletimer.expires); } } void rose_stop_heartbeat(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); } void rose_stop_timer(struct sock *sk) { sk_stop_timer(sk, &rose_sk(sk)->timer); } void rose_stop_idletimer(struct sock *sk) { sk_stop_timer(sk, &rose_sk(sk)->idletimer); } static void rose_heartbeat_expiry(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); struct rose_sock *rose = rose_sk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ/20); goto out; } switch (rose->state) { case ROSE_STATE_0: /* Magic here: If we listen() and a new link dies before it is accepted() it isn't 'dead' so doesn't get removed. */ if (sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { bh_unlock_sock(sk); rose_destroy_socket(sk); sock_put(sk); return; } break; case ROSE_STATE_3: /* * Check for the state of the receive buffer. */ if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf / 2) && (rose->condition & ROSE_COND_OWN_RX_BUSY)) { rose->condition &= ~ROSE_COND_OWN_RX_BUSY; rose->condition &= ~ROSE_COND_ACK_PENDING; rose->vl = rose->vr; rose_write_internal(sk, ROSE_RR); rose_stop_timer(sk); /* HB */ break; } break; } rose_start_heartbeat(sk); out: bh_unlock_sock(sk); sock_put(sk); } static void rose_timer_expiry(struct timer_list *t) { struct rose_sock *rose = from_timer(rose, t, timer); struct sock *sk = &rose->sock; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &rose->timer, jiffies + HZ/20); goto out; } switch (rose->state) { case ROSE_STATE_1: /* T1 */ case ROSE_STATE_4: /* T2 */ rose_write_internal(sk, ROSE_CLEAR_REQUEST); rose->state = ROSE_STATE_2; rose_start_t3timer(sk); break; case ROSE_STATE_2: /* T3 */ rose->neighbour->use--; rose_disconnect(sk, ETIMEDOUT, -1, -1); break; case ROSE_STATE_3: /* HB */ if (rose->condition & ROSE_COND_ACK_PENDING) { rose->condition &= ~ROSE_COND_ACK_PENDING; rose_enquiry_response(sk); } break; } out: bh_unlock_sock(sk); sock_put(sk); } static void rose_idletimer_expiry(struct timer_list *t) { struct rose_sock *rose = from_timer(rose, t, idletimer); struct sock *sk = &rose->sock; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &rose->idletimer, jiffies + HZ/20); goto out; } rose_clear_queues(sk); rose_write_internal(sk, ROSE_CLEAR_REQUEST); rose_sk(sk)->state = ROSE_STATE_2; rose_start_t3timer(sk); sk->sk_state = TCP_CLOSE; sk->sk_err = 0; sk->sk_shutdown |= SEND_SHUTDOWN; if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } out: bh_unlock_sock(sk); sock_put(sk); } |
20 18 3 14 3 3 3 5 5 5 5 5 5 5 2 1 1 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/slab.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/spinlock.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/init.h> static LIST_HEAD(ax25_dev_list); DEFINE_SPINLOCK(ax25_dev_lock); ax25_dev *ax25_addr_ax25dev(ax25_address *addr) { ax25_dev *ax25_dev, *res = NULL; spin_lock_bh(&ax25_dev_lock); list_for_each_entry(ax25_dev, &ax25_dev_list, list) if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; ax25_dev_hold(ax25_dev); break; } spin_unlock_bh(&ax25_dev_lock); return res; } /* * This is called when an interface is brought up. These are * reasonable defaults. */ void ax25_dev_device_up(struct net_device *dev) { ax25_dev *ax25_dev; ax25_dev = kzalloc(sizeof(*ax25_dev), GFP_KERNEL); if (!ax25_dev) { printk(KERN_ERR "AX.25: ax25_dev_device_up - out of memory\n"); return; } refcount_set(&ax25_dev->refcount, 1); ax25_dev->dev = dev; netdev_hold(dev, &ax25_dev->dev_tracker, GFP_KERNEL); ax25_dev->forward = NULL; ax25_dev->device_up = true; ax25_dev->values[AX25_VALUES_IPDEFMODE] = AX25_DEF_IPDEFMODE; ax25_dev->values[AX25_VALUES_AXDEFMODE] = AX25_DEF_AXDEFMODE; ax25_dev->values[AX25_VALUES_BACKOFF] = AX25_DEF_BACKOFF; ax25_dev->values[AX25_VALUES_CONMODE] = AX25_DEF_CONMODE; ax25_dev->values[AX25_VALUES_WINDOW] = AX25_DEF_WINDOW; ax25_dev->values[AX25_VALUES_EWINDOW] = AX25_DEF_EWINDOW; ax25_dev->values[AX25_VALUES_T1] = AX25_DEF_T1; ax25_dev->values[AX25_VALUES_T2] = AX25_DEF_T2; ax25_dev->values[AX25_VALUES_T3] = AX25_DEF_T3; ax25_dev->values[AX25_VALUES_IDLE] = AX25_DEF_IDLE; ax25_dev->values[AX25_VALUES_N2] = AX25_DEF_N2; ax25_dev->values[AX25_VALUES_PACLEN] = AX25_DEF_PACLEN; ax25_dev->values[AX25_VALUES_PROTOCOL] = AX25_DEF_PROTOCOL; #ifdef CONFIG_AX25_DAMA_SLAVE ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT; #endif #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_ds_setup_timer(ax25_dev); #endif spin_lock_bh(&ax25_dev_lock); list_add(&ax25_dev->list, &ax25_dev_list); rcu_assign_pointer(dev->ax25_ptr, ax25_dev); spin_unlock_bh(&ax25_dev_lock); ax25_register_dev_sysctl(ax25_dev); } void ax25_dev_device_down(struct net_device *dev) { ax25_dev *s, *ax25_dev; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return; ax25_unregister_dev_sysctl(ax25_dev); spin_lock_bh(&ax25_dev_lock); #ifdef CONFIG_AX25_DAMA_SLAVE timer_shutdown_sync(&ax25_dev->dama.slave_timer); #endif /* * Remove any packet forwarding that points to this device. */ list_for_each_entry(s, &ax25_dev_list, list) if (s->forward == dev) s->forward = NULL; list_for_each_entry(s, &ax25_dev_list, list) { if (s == ax25_dev) { list_del(&s->list); break; } } RCU_INIT_POINTER(dev->ax25_ptr, NULL); spin_unlock_bh(&ax25_dev_lock); netdev_put(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); } int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) { ax25_dev *ax25_dev, *fwd_dev; if ((ax25_dev = ax25_addr_ax25dev(&fwd->port_from)) == NULL) return -EINVAL; switch (cmd) { case SIOCAX25ADDFWD: fwd_dev = ax25_addr_ax25dev(&fwd->port_to); if (!fwd_dev) { ax25_dev_put(ax25_dev); return -EINVAL; } if (ax25_dev->forward) { ax25_dev_put(fwd_dev); ax25_dev_put(ax25_dev); return -EINVAL; } ax25_dev->forward = fwd_dev->dev; ax25_dev_put(fwd_dev); ax25_dev_put(ax25_dev); break; case SIOCAX25DELFWD: if (!ax25_dev->forward) { ax25_dev_put(ax25_dev); return -EINVAL; } ax25_dev->forward = NULL; ax25_dev_put(ax25_dev); break; default: ax25_dev_put(ax25_dev); return -EINVAL; } return 0; } struct net_device *ax25_fwd_dev(struct net_device *dev) { ax25_dev *ax25_dev; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return dev; if (ax25_dev->forward == NULL) return dev; return ax25_dev->forward; } /* * Free all memory associated with device structures. */ void __exit ax25_dev_free(void) { ax25_dev *s, *n; spin_lock_bh(&ax25_dev_lock); list_for_each_entry_safe(s, n, &ax25_dev_list, list) { netdev_put(s->dev, &s->dev_tracker); list_del(&s->list); ax25_dev_put(s); } spin_unlock_bh(&ax25_dev_lock); } |
13 2 3 5 5 1 3 1 1 10 10 11 9 2 10 8 8 8 10 10 9 1 16 1 1 2 1 7 7 13 12 2 11 13 1 3 1 6 6 6 2 1 1 4 2 1 1 1 2 2 24 2 9 15 3 21 22 22 8 22 14 10 2 8 2 8 14 22 3 8 1 2 2 4 1 5 5 2 3 2 3 8 6 1 1 76 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 | // SPDX-License-Identifier: GPL-2.0-or-later /* L2TPv3 IP encapsulation support * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/ioctls.h> #include <linux/icmp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/socket.h> #include <linux/l2tp.h> #include <linux/in.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/udp.h> #include <net/inet_common.h> #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include "l2tp_core.h" /* per-net private data for this module */ static unsigned int l2tp_ip_net_id; struct l2tp_ip_net { rwlock_t l2tp_ip_lock; struct hlist_head l2tp_ip_table; struct hlist_head l2tp_ip_bind_table; }; struct l2tp_ip_sock { /* inet_sock has to be the first member of l2tp_ip_sock */ struct inet_sock inet; u32 conn_id; u32 peer_conn_id; }; static struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk) { return (struct l2tp_ip_sock *)sk; } static struct l2tp_ip_net *l2tp_ip_pernet(const struct net *net) { return net_generic(net, l2tp_ip_net_id); } static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr, __be32 raddr, int dif, u32 tunnel_id) { struct l2tp_ip_net *pn = l2tp_ip_pernet(net); struct sock *sk; sk_for_each_bound(sk, &pn->l2tp_ip_bind_table) { const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); const struct inet_sock *inet = inet_sk(sk); int bound_dev_if; if (!net_eq(sock_net(sk), net)) continue; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && dif && bound_dev_if != dif) continue; if (inet->inet_rcv_saddr && laddr && inet->inet_rcv_saddr != laddr) continue; if (inet->inet_daddr && raddr && inet->inet_daddr != raddr) continue; if (l2tp->conn_id != tunnel_id) continue; goto found; } sk = NULL; found: return sk; } /* When processing receive frames, there are two cases to * consider. Data frames consist of a non-zero session-id and an * optional cookie. Control frames consist of a regular L2TP header * preceded by 32-bits of zeros. * * L2TPv3 Session Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cookie (optional, maximum 64 bits)... * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * L2TPv3 Control Message Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | (32 bits of zeros) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |T|L|x|x|S|x|x|x|x|x|x|x| Ver | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Control Connection ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ns | Nr | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * All control frames are passed to userspace. */ static int l2tp_ip_recv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct l2tp_ip_net *pn; struct sock *sk; u32 session_id; u32 tunnel_id; unsigned char *ptr, *optr; struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; struct iphdr *iph; pn = l2tp_ip_pernet(net); if (!pskb_may_pull(skb, 4)) goto discard; /* Point to L2TP header */ optr = skb->data; ptr = skb->data; session_id = ntohl(*((__be32 *)ptr)); ptr += 4; /* RFC3931: L2TP/IP packets have the first 4 bytes containing * the session_id. If it is 0, the packet is a L2TP control * frame and the session_id value can be discarded. */ if (session_id == 0) { __skb_pull(skb, 4); goto pass_up; } /* Ok, this is a data packet. Lookup the session. */ session = l2tp_v3_session_get(net, NULL, session_id); if (!session) goto discard; tunnel = session->tunnel; if (!tunnel) goto discard_sess; if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) goto discard_sess; l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_put(session); return 0; pass_up: /* Get the tunnel_id from the L2TP header */ if (!pskb_may_pull(skb, 12)) goto discard; if ((skb->data[0] & 0xc0) != 0xc0) goto discard; tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = (struct iphdr *)skb_network_header(skb); read_lock_bh(&pn->l2tp_ip_lock); sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr, inet_iif(skb), tunnel_id); if (!sk) { read_unlock_bh(&pn->l2tp_ip_lock); goto discard; } sock_hold(sk); read_unlock_bh(&pn->l2tp_ip_lock); if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); discard_sess: l2tp_session_put(session); goto discard; discard_put: sock_put(sk); discard: kfree_skb(skb); return 0; } static int l2tp_ip_hash(struct sock *sk) { struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk)); if (sk_unhashed(sk)) { write_lock_bh(&pn->l2tp_ip_lock); sk_add_node(sk, &pn->l2tp_ip_table); write_unlock_bh(&pn->l2tp_ip_lock); } return 0; } static void l2tp_ip_unhash(struct sock *sk) { struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk)); if (sk_unhashed(sk)) return; write_lock_bh(&pn->l2tp_ip_lock); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip_lock); } static int l2tp_ip_open(struct sock *sk) { /* Prevent autobind. We don't have ports. */ inet_sk(sk)->inet_num = IPPROTO_L2TP; l2tp_ip_hash(sk); return 0; } static void l2tp_ip_close(struct sock *sk, long timeout) { struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk)); write_lock_bh(&pn->l2tp_ip_lock); hlist_del_init(&sk->sk_bind_node); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip_lock); sk_common_release(sk); } static void l2tp_ip_destroy_sock(struct sock *sk) { struct l2tp_tunnel *tunnel; __skb_queue_purge(&sk->sk_write_queue); tunnel = l2tp_sk_to_tunnel(sk); if (tunnel) { l2tp_tunnel_delete(tunnel); l2tp_tunnel_put(tunnel); } } static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *)uaddr; struct net *net = sock_net(sk); struct l2tp_ip_net *pn; int ret; int chk_addr_ret; if (addr_len < sizeof(struct sockaddr_l2tpip)) return -EINVAL; if (addr->l2tp_family != AF_INET) return -EINVAL; lock_sock(sk); ret = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out; if (sk->sk_state != TCP_CLOSE) goto out; chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr); ret = -EADDRNOTAVAIL; if (addr->l2tp_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) goto out; if (addr->l2tp_addr.s_addr) { inet->inet_rcv_saddr = addr->l2tp_addr.s_addr; inet->inet_saddr = addr->l2tp_addr.s_addr; } if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ pn = l2tp_ip_pernet(net); write_lock_bh(&pn->l2tp_ip_lock); if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, 0, sk->sk_bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&pn->l2tp_ip_lock); ret = -EADDRINUSE; goto out; } sk_dst_reset(sk); l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id; sk_add_bind_node(sk, &pn->l2tp_ip_bind_table); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip_lock); ret = 0; sock_reset_flag(sk, SOCK_ZAPPED); out: release_sock(sk); return ret; } static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk)); int rc; if (addr_len < sizeof(*lsa)) return -EINVAL; if (ipv4_is_multicast(lsa->l2tp_addr.s_addr)) return -EINVAL; lock_sock(sk); /* Must bind first - autobinding does not work */ if (sock_flag(sk, SOCK_ZAPPED)) { rc = -EINVAL; goto out_sk; } rc = __ip4_datagram_connect(sk, uaddr, addr_len); if (rc < 0) goto out_sk; l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; write_lock_bh(&pn->l2tp_ip_lock); hlist_del_init(&sk->sk_bind_node); sk_add_bind_node(sk, &pn->l2tp_ip_bind_table); write_unlock_bh(&pn->l2tp_ip_lock); out_sk: release_sock(sk); return rc; } static int l2tp_ip_disconnect(struct sock *sk, int flags) { if (sock_flag(sk, SOCK_ZAPPED)) return 0; return __udp_disconnect(sk, flags); } static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct l2tp_ip_sock *lsk = l2tp_ip_sk(sk); struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; memset(lsa, 0, sizeof(*lsa)); lsa->l2tp_family = AF_INET; if (peer) { if (!inet->inet_dport) return -ENOTCONN; lsa->l2tp_conn_id = lsk->peer_conn_id; lsa->l2tp_addr.s_addr = inet->inet_daddr; } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; lsa->l2tp_conn_id = lsk->conn_id; lsa->l2tp_addr.s_addr = addr; } return sizeof(*lsa); } static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; /* Charge it to the socket, dropping if the queue is full. */ rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) goto drop; return 0; drop: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); return 0; } /* Userspace will call sendmsg() on the tunnel socket to send L2TP * control frames. */ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct sk_buff *skb; int rc; struct inet_sock *inet = inet_sk(sk); struct rtable *rt = NULL; int connected = 0; __be32 daddr; lock_sock(sk); rc = -ENOTCONN; if (sock_flag(sk, SOCK_DEAD)) goto out; /* Get and verify the address. */ if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_l2tpip *, lip, msg->msg_name); rc = -EINVAL; if (msg->msg_namelen < sizeof(*lip)) goto out; if (lip->l2tp_family != AF_INET) { rc = -EAFNOSUPPORT; if (lip->l2tp_family != AF_UNSPEC) goto out; } daddr = lip->l2tp_addr.s_addr; } else { rc = -EDESTADDRREQ; if (sk->sk_state != TCP_ESTABLISHED) goto out; connected = 1; } /* Allocate a socket buffer */ rc = -ENOMEM; skb = sock_wmalloc(sk, 2 + NET_SKB_PAD + sizeof(struct iphdr) + 4 + len, 0, GFP_KERNEL); if (!skb) goto error; /* Reserve space for headers, putting IP header on 4-byte boundary. */ skb_reserve(skb, 2 + NET_SKB_PAD); skb_reset_network_header(skb); skb_reserve(skb, sizeof(struct iphdr)); skb_reset_transport_header(skb); /* Insert 0 session_id */ *((__be32 *)skb_put(skb, 4)) = 0; /* Copy user data into skb */ rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc < 0) { kfree_skb(skb); goto error; } if (connected) rt = dst_rtable(__sk_dst_check(sk, 0)); rcu_read_lock(); if (!rt) { struct flowi4 *fl4 = &inet->cork.fl.u.ip4; inet_sk_init_flowi4(inet, fl4); /* Overwrite ->daddr if msg->msg_name was provided */ if (!connected) fl4->daddr = daddr; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ rt = ip_route_output_flow(sock_net(sk), fl4, sk); if (IS_ERR(rt)) goto no_route; if (connected) { sk_setup_caps(sk, &rt->dst); } else { skb_dst_set(skb, &rt->dst); goto xmit; } } /* We don't need to clone dst here, it is guaranteed to not disappear. * __dev_xmit_skb() might force a refcount if needed. */ skb_dst_set_noref(skb, &rt->dst); xmit: /* Queue the packet to IP for output */ rc = ip_queue_xmit(sk, skb, &inet->cork.fl); rcu_read_unlock(); error: if (rc >= 0) rc = len; out: release_sock(sk); return rc; no_route: rcu_read_unlock(); IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); rc = -EHOSTUNREACH; goto out; } static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) goto out; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: return err ? err : copied; } int l2tp_ioctl(struct sock *sk, int cmd, int *karg) { struct sk_buff *skb; switch (cmd) { case SIOCOUTQ: *karg = sk_wmem_alloc_get(sk); break; case SIOCINQ: spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); *karg = skb ? skb->len : 0; spin_unlock_bh(&sk->sk_receive_queue.lock); break; default: return -ENOIOCTLCMD; } return 0; } EXPORT_SYMBOL_GPL(l2tp_ioctl); static struct proto l2tp_ip_prot = { .name = "L2TP/IP", .owner = THIS_MODULE, .init = l2tp_ip_open, .close = l2tp_ip_close, .bind = l2tp_ip_bind, .connect = l2tp_ip_connect, .disconnect = l2tp_ip_disconnect, .ioctl = l2tp_ioctl, .destroy = l2tp_ip_destroy_sock, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = l2tp_ip_sendmsg, .recvmsg = l2tp_ip_recvmsg, .backlog_rcv = l2tp_ip_backlog_recv, .hash = l2tp_ip_hash, .unhash = l2tp_ip_unhash, .obj_size = sizeof(struct l2tp_ip_sock), }; static const struct proto_ops l2tp_ip_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip_getname, .poll = datagram_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; static struct inet_protosw l2tp_ip_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_L2TP, .prot = &l2tp_ip_prot, .ops = &l2tp_ip_ops, }; static struct net_protocol l2tp_ip_protocol __read_mostly = { .handler = l2tp_ip_recv, }; static __net_init int l2tp_ip_init_net(struct net *net) { struct l2tp_ip_net *pn = net_generic(net, l2tp_ip_net_id); rwlock_init(&pn->l2tp_ip_lock); INIT_HLIST_HEAD(&pn->l2tp_ip_table); INIT_HLIST_HEAD(&pn->l2tp_ip_bind_table); return 0; } static __net_exit void l2tp_ip_exit_net(struct net *net) { struct l2tp_ip_net *pn = l2tp_ip_pernet(net); write_lock_bh(&pn->l2tp_ip_lock); WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip_table) != 0); WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip_bind_table) != 0); write_unlock_bh(&pn->l2tp_ip_lock); } static struct pernet_operations l2tp_ip_net_ops = { .init = l2tp_ip_init_net, .exit = l2tp_ip_exit_net, .id = &l2tp_ip_net_id, .size = sizeof(struct l2tp_ip_net), }; static int __init l2tp_ip_init(void) { int err; pr_info("L2TP IP encapsulation support (L2TPv3)\n"); err = register_pernet_device(&l2tp_ip_net_ops); if (err) goto out; err = proto_register(&l2tp_ip_prot, 1); if (err != 0) goto out1; err = inet_add_protocol(&l2tp_ip_protocol, IPPROTO_L2TP); if (err) goto out2; inet_register_protosw(&l2tp_ip_protosw); return 0; out2: proto_unregister(&l2tp_ip_prot); out1: unregister_pernet_device(&l2tp_ip_net_ops); out: return err; } static void __exit l2tp_ip_exit(void) { inet_unregister_protosw(&l2tp_ip_protosw); inet_del_protocol(&l2tp_ip_protocol, IPPROTO_L2TP); proto_unregister(&l2tp_ip_prot); unregister_pernet_device(&l2tp_ip_net_ops); } module_init(l2tp_ip_init); module_exit(l2tp_ip_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("L2TP over IP"); MODULE_VERSION("1.0"); /* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol, * because __stringify doesn't like enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 115, 2); MODULE_ALIAS_NET_PF_PROTO(PF_INET, 115); |
38 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM smc #if !defined(_TRACE_SMC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SMC_H #include <linux/ipv6.h> #include <linux/tcp.h> #include <linux/tracepoint.h> #include <net/ipv6.h> #include "smc.h" #include "smc_core.h" TRACE_EVENT(smc_switch_to_fallback, TP_PROTO(const struct smc_sock *smc, int fallback_rsn), TP_ARGS(smc, fallback_rsn), TP_STRUCT__entry( __field(const void *, sk) __field(const void *, clcsk) __field(u64, net_cookie) __field(int, fallback_rsn) ), TP_fast_assign( const struct sock *sk = &smc->sk; const struct sock *clcsk = smc->clcsock->sk; __entry->sk = sk; __entry->clcsk = clcsk; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->fallback_rsn = fallback_rsn; ), TP_printk("sk=%p clcsk=%p net=%llu fallback_rsn=%d", __entry->sk, __entry->clcsk, __entry->net_cookie, __entry->fallback_rsn) ); DECLARE_EVENT_CLASS(smc_msg_event, TP_PROTO(const struct smc_sock *smc, size_t len), TP_ARGS(smc, len), TP_STRUCT__entry( __field(const void *, smc) __field(u64, net_cookie) __field(size_t, len) __string(name, smc->conn.lnk->ibname) ), TP_fast_assign( const struct sock *sk = &smc->sk; __entry->smc = smc; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->len = len; __assign_str(name); ), TP_printk("smc=%p net=%llu len=%zu dev=%s", __entry->smc, __entry->net_cookie, __entry->len, __get_str(name)) ); DEFINE_EVENT(smc_msg_event, smc_tx_sendmsg, TP_PROTO(const struct smc_sock *smc, size_t len), TP_ARGS(smc, len) ); DEFINE_EVENT(smc_msg_event, smc_rx_recvmsg, TP_PROTO(const struct smc_sock *smc, size_t len), TP_ARGS(smc, len) ); TRACE_EVENT(smcr_link_down, TP_PROTO(const struct smc_link *lnk, void *location), TP_ARGS(lnk, location), TP_STRUCT__entry( __field(const void *, lnk) __field(const void *, lgr) __field(u64, net_cookie) __field(int, state) __string(name, lnk->ibname) __field(void *, location) ), TP_fast_assign( const struct smc_link_group *lgr = lnk->lgr; __entry->lnk = lnk; __entry->lgr = lgr; __entry->net_cookie = lgr->net->net_cookie; __entry->state = lnk->state; __assign_str(name); __entry->location = location; ), TP_printk("lnk=%p lgr=%p net=%llu state=%d dev=%s location=%pS", __entry->lnk, __entry->lgr, __entry->net_cookie, __entry->state, __get_str(name), __entry->location) ); #endif /* _TRACE_SMC_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE smc_tracepoint #include <trace/define_trace.h> |
62 16 62 28 28 23 23 7 28 28 11 21 355 698 355 696 355 352 355 62 355 329 33 190 697 568 190 190 61 189 190 698 698 696 698 697 694 697 697 697 696 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 | #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> #include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/swap.h> #include <linux/rmap.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #ifndef CONFIG_MMU_GATHER_NO_GATHER static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; /* Limit batching if we have delayed rmaps pending */ if (tlb->delayed_rmap && tlb->active != &tlb->local) return false; batch = tlb->active; if (batch->next) { tlb->active = batch->next; return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) return false; batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (!batch) return false; tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; tlb->active->next = batch; tlb->active = batch; return true; } #ifdef CONFIG_SMP static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma) { struct encoded_page **pages = batch->encoded_pages; for (int i = 0; i < batch->nr; i++) { struct encoded_page *enc = pages[i]; if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { struct page *page = encoded_page_ptr(enc); unsigned int nr_pages = 1; if (unlikely(encoded_page_flags(enc) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr_pages = encoded_nr_pages(pages[++i]); folio_remove_rmap_ptes(page_folio(page), page, nr_pages, vma); } } } /** * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB * @tlb: the current mmu_gather * @vma: The memory area from which the pages are being removed. * * Note that because of how tlb_next_batch() above works, we will * never start multiple new batches with pending delayed rmaps, so * we only need to walk through the current active batch and the * original local one. */ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { if (!tlb->delayed_rmap) return; tlb_flush_rmap_batch(&tlb->local, vma); if (tlb->active != &tlb->local) tlb_flush_rmap_batch(tlb->active, vma); tlb->delayed_rmap = 0; } #endif /* * We might end up freeing a lot of pages. Reschedule on a regular * basis to avoid soft lockups in configurations without full * preemption enabled. The magic number of 512 folios seems to work. */ #define MAX_NR_FOLIOS_PER_FREE 512 static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch) { struct encoded_page **pages = batch->encoded_pages; unsigned int nr, nr_pages; while (batch->nr) { if (!page_poisoning_enabled_static() && !want_init_on_free()) { nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr); /* * Make sure we cover page + nr_pages, and don't leave * nr_pages behind when capping the number of entries. */ if (unlikely(encoded_page_flags(pages[nr - 1]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr++; } else { /* * With page poisoning and init_on_free, the time it * takes to free memory grows proportionally with the * actual memory size. Therefore, limit based on the * actual memory size and not the number of involved * folios. */ for (nr = 0, nr_pages = 0; nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE; nr++) { if (unlikely(encoded_page_flags(pages[nr]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr_pages += encoded_nr_pages(pages[++nr]); else nr_pages++; } } free_pages_and_swap_cache(pages, nr); pages += nr; batch->nr -= nr; cond_resched(); } } static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) __tlb_batch_free_encoded_pages(batch); tlb->active = &tlb->local; } static void tlb_batch_list_free(struct mmu_gather *tlb) { struct mmu_gather_batch *batch, *next; for (batch = tlb->local.next; batch; batch = next) { next = batch->next; free_pages((unsigned long)batch, 0); } tlb->local.next = NULL; } static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap, int page_size) { int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE); VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1)); #endif batch = tlb->active; /* * Add the page and check if we are full. If so * force a flush. */ if (likely(nr_pages == 1)) { batch->encoded_pages[batch->nr++] = encode_page(page, flags); } else { flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT; batch->encoded_pages[batch->nr++] = encode_page(page, flags); batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages); } /* * Make sure that we can always add another "page" + "nr_pages", * requiring two entries instead of only a single one. */ if (batch->nr >= batch->max - 1) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page); return false; } bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap) { return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap, PAGE_SIZE); } bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size) { return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size); } #endif /* MMU_GATHER_NO_GATHER */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE static void __tlb_remove_table_free(struct mmu_table_batch *batch) { int i; for (i = 0; i < batch->nr; i++) __tlb_remove_table(batch->tables[i]); free_page((unsigned long)batch); } #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * Semi RCU freeing of the page directories. * * This is needed by some architectures to implement software pagetable walkers. * * gup_fast() and other software pagetable walkers do a lockless page-table * walk and therefore needs some synchronization with the freeing of the page * directories. The chosen means to accomplish that is by disabling IRQs over * the walk. * * Architectures that use IPIs to flush TLBs will then automagically DTRT, * since we unlink the page, flush TLBs, free the page. Since the disabling of * IRQs delays the completion of the TLB flush we can never observe an already * freed page. * * Architectures that do not have this (PPC) need to delay the freeing by some * other means, this is that means. * * What we do is batch the freed directory pages (tables) and RCU free them. * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling * holds off grace periods. * * However, in order to batch these pages we need to allocate storage, this * allocation is deep inside the MM code and can thus easily fail on memory * pressure. To guarantee progress we fall back to single table freeing, see * the implementation of tlb_remove_table_one(). * */ static void tlb_remove_table_smp_sync(void *arg) { /* Simply deliver the interrupt */ } void tlb_remove_table_sync_one(void) { /* * This isn't an RCU grace period and hence the page-tables cannot be * assumed to be actually RCU-freed. * * It is however sufficient for software page-table walkers that rely on * IRQ disabling. */ smp_call_function(tlb_remove_table_smp_sync, NULL, 1); } static void tlb_remove_table_rcu(struct rcu_head *head) { __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu)); } static void tlb_remove_table_free(struct mmu_table_batch *batch) { call_rcu(&batch->rcu, tlb_remove_table_rcu); } #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ static void tlb_remove_table_free(struct mmu_table_batch *batch) { __tlb_remove_table_free(batch); } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ /* * If we want tlb_remove_table() to imply TLB invalidates. */ static inline void tlb_table_invalidate(struct mmu_gather *tlb) { if (tlb_needs_table_invalidate()) { /* * Invalidate page-table caches used by hardware walkers. Then * we still need to RCU-sched wait while freeing the pages * because software walkers can still be in-flight. */ tlb_flush_mmu_tlbonly(tlb); } } #ifdef CONFIG_PT_RECLAIM static inline void __tlb_remove_table_one_rcu(struct rcu_head *head) { struct ptdesc *ptdesc; ptdesc = container_of(head, struct ptdesc, pt_rcu_head); __tlb_remove_table(ptdesc); } static inline void __tlb_remove_table_one(void *table) { struct ptdesc *ptdesc; ptdesc = table; call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu); } #else static inline void __tlb_remove_table_one(void *table) { tlb_remove_table_sync_one(); __tlb_remove_table(table); } #endif /* CONFIG_PT_RECLAIM */ static void tlb_remove_table_one(void *table) { __tlb_remove_table_one(table); } static void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; if (*batch) { tlb_table_invalidate(tlb); tlb_remove_table_free(*batch); *batch = NULL; } } void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (*batch == NULL) { tlb_table_invalidate(tlb); tlb_remove_table_one(table); return; } (*batch)->nr = 0; } (*batch)->tables[(*batch)->nr++] = table; if ((*batch)->nr == MAX_TABLE_BATCH) tlb_table_flush(tlb); } static inline void tlb_table_init(struct mmu_gather *tlb) { tlb->batch = NULL; } #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_table_flush(struct mmu_gather *tlb) { } static inline void tlb_table_init(struct mmu_gather *tlb) { } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ static void tlb_flush_mmu_free(struct mmu_gather *tlb) { tlb_table_flush(tlb); #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_pages_flush(tlb); #endif } void tlb_flush_mmu(struct mmu_gather *tlb) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_mmu_free(tlb); } static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { tlb->mm = mm; tlb->fullmm = fullmm; #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb->need_flush_all = 0; tlb->local.next = NULL; tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; tlb->batch_count = 0; #endif tlb->delayed_rmap = 0; tlb_table_init(tlb); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE tlb->page_size = 0; #endif __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); } /** * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, false); } /** * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * * In this case, @mm is without users and we're going to destroy the * full address space (exit/execve). * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. */ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, true); } /** * tlb_finish_mmu - finish an mmu_gather structure * @tlb: the mmu_gather structure to finish * * Called at the end of the shootdown operation to free up any resources that * were required. */ void tlb_finish_mmu(struct mmu_gather *tlb) { /* * If there are parallel threads are doing PTE changes on same range * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB * flush by batching, one thread may end up seeing inconsistent PTEs * and result in having stale TLB entries. So flush TLB forcefully * if we detect parallel PTE batching threads. * * However, some syscalls, e.g. munmap(), may free page tables, this * needs force flush everything in the given range. Otherwise this * may result in having stale TLB entries for some architectures, * e.g. aarch64, that could specify flush what level TLB. */ if (mm_tlb_flush_nested(tlb->mm)) { /* * The aarch64 yields better performance with fullmm by * avoiding multiple CPUs spamming TLBI messages at the * same time. * * On x86 non-fullmm doesn't yield significant difference * against fullmm. */ tlb->fullmm = 1; __tlb_reset_range(tlb); tlb->freed_tables = 1; } tlb_flush_mmu(tlb); #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_list_free(tlb); #endif dec_tlb_flush_pending(tlb->mm); } |
18 16 10 2 8 8 6 6 1 2 3 1 2 2 4 4 4 12 1 2 8 1 8 2 4 6 1 10 8 2 2 10 16 16 11 16 5 11 5 11 1 1 11 6 16 6 8 15 15 5 5 4 9 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/skmsg.h> #include <linux/filter.h> #include <linux/bpf.h> #include <linux/init.h> #include <linux/wait.h> #include <linux/util_macros.h> #include <net/inet_common.h> #include <net/tls.h> void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tcp; int copied; if (!skb || !skb->len || !sk_is_tcp(sk)) return; if (skb_bpf_strparser(skb)) return; tcp = tcp_sk(sk); copied = tcp->copied_seq + skb->len; WRITE_ONCE(tcp->copied_seq, copied); tcp_rcv_space_adjust(sk); __tcp_cleanup_rbuf(sk, skb->len); } static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes) { bool apply = apply_bytes; struct scatterlist *sge; u32 size, copied = 0; struct sk_msg *tmp; int i, ret = 0; tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL); if (unlikely(!tmp)) return -ENOMEM; lock_sock(sk); tmp->sg.start = msg->sg.start; i = msg->sg.start; do { sge = sk_msg_elem(msg, i); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; if (!__sk_rmem_schedule(sk, size, false)) { if (!copied) ret = -ENOMEM; break; } sk_mem_charge(sk, size); atomic_add(size, &sk->sk_rmem_alloc); sk_msg_xfer(tmp, msg, i, size); copied += size; if (sge->length) get_page(sk_msg_page(tmp, i)); sk_msg_iter_var_next(i); tmp->sg.end = i; if (apply) { apply_bytes -= size; if (!apply_bytes) { if (sge->length) sk_msg_iter_var_prev(i); break; } } } while (i != msg->sg.end); if (!ret) { msg->sg.start = i; if (!sk_psock_queue_msg(psock, tmp)) atomic_sub(copied, &sk->sk_rmem_alloc); sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); kfree(tmp); } release_sock(sk); return ret; } static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, int flags, bool uncharge) { struct msghdr msghdr = {}; bool apply = apply_bytes; struct scatterlist *sge; struct page *page; int size, ret = 0; u32 off; while (1) { struct bio_vec bvec; bool has_tx_ulp; sge = sk_msg_elem(msg, msg->sg.start); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; off = sge->offset; page = sg_page(sge); tcp_rate_check_app_limited(sk); retry: msghdr.msg_flags = flags | MSG_SPLICE_PAGES; has_tx_ulp = tls_sw_has_ctx_tx(sk); if (has_tx_ulp) msghdr.msg_flags |= MSG_SENDPAGE_NOPOLICY; if (size < sge->length && msg->sg.start != msg->sg.end) msghdr.msg_flags |= MSG_MORE; bvec_set_page(&bvec, page, size, off); iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, size); ret = tcp_sendmsg_locked(sk, &msghdr, size); if (ret <= 0) return ret; if (apply) apply_bytes -= ret; msg->sg.size -= ret; sge->offset += ret; sge->length -= ret; if (uncharge) sk_mem_uncharge(sk, ret); if (ret != size) { size -= ret; off += ret; goto retry; } if (!sge->length) { put_page(page); sk_msg_iter_next(msg, start); sg_init_table(sge, 1); if (msg->sg.start == msg->sg.end) break; } if (apply && !apply_bytes) break; } return 0; } static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, int flags, bool uncharge) { int ret; lock_sock(sk); ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge); release_sock(sk); return ret; } int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, struct sk_msg *msg, u32 bytes, int flags) { struct sk_psock *psock = sk_psock_get(sk); int ret; if (unlikely(!psock)) return -EPIPE; ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes) : tcp_bpf_push_locked(sk, msg, bytes, flags, false); sk_psock_put(sk, psock); return ret; } EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); #ifdef CONFIG_BPF_SYSCALL static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) return 1; if (!timeo) return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = sk_wait_event(sk, &timeo, !list_empty(&psock->ingress_msg) || !skb_queue_empty_lockless(&sk->sk_receive_queue), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static bool is_next_msg_fin(struct sk_psock *psock) { struct scatterlist *sge; struct sk_msg *msg_rx; int i; msg_rx = sk_psock_peek_msg(psock); i = msg_rx->sg.start; sge = sk_msg_elem(msg_rx, i); if (!sge->length) { struct sk_buff *skb = msg_rx->skb; if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) return true; } return false; } static int tcp_bpf_recvmsg_parser(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { int peek = flags & MSG_PEEK; struct sk_psock *psock; struct tcp_sock *tcp; int copied = 0; u32 seq; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, flags, addr_len); lock_sock(sk); tcp = tcp_sk(sk); seq = tcp->copied_seq; /* We may have received data on the sk_receive_queue pre-accept and * then we can not use read_skb in this context because we haven't * assigned a sk_socket yet so have no link to the ops. The work-around * is to check the sk_receive_queue and in these cases read skbs off * queue again. The read_skb hook is not running at this point because * of lock_sock so we avoid having multiple runners in read_skb. */ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { tcp_data_ready(sk); /* This handles the ENOMEM errors if we both receive data * pre accept and are already under memory pressure. At least * let user know to retry. */ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { copied = -EAGAIN; goto out; } } msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); /* The typical case for EFAULT is the socket was gracefully * shutdown with a FIN pkt. So check here the other case is * some error on copy_page_to_iter which would be unexpected. * On fin return correct return code to zero. */ if (copied == -EFAULT) { bool is_fin = is_next_msg_fin(psock); if (is_fin) { copied = 0; seq++; goto out; } } seq += copied; if (!copied) { long timeo; int data; if (sock_flag(sk, SOCK_DONE)) goto out; if (sk->sk_err) { copied = sock_error(sk); goto out; } if (sk->sk_shutdown & RCV_SHUTDOWN) goto out; if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; goto out; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); if (!timeo) { copied = -EAGAIN; goto out; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); goto out; } data = tcp_msg_wait_data(sk, psock, timeo); if (data < 0) { copied = data; goto unlock; } if (data && !sk_psock_queue_empty(psock)) goto msg_bytes_ready; copied = -EAGAIN; } out: if (!peek) WRITE_ONCE(tcp->copied_seq, seq); tcp_rcv_space_adjust(sk); if (copied > 0) __tcp_cleanup_rbuf(sk, copied); unlock: release_sock(sk); sk_psock_put(sk, psock); return copied; } static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_psock *psock; int copied, ret; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, flags, addr_len); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, flags, addr_len); } lock_sock(sk); msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { long timeo; int data; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); data = tcp_msg_wait_data(sk, psock, timeo); if (data < 0) { ret = data; goto unlock; } if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; release_sock(sk); sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, flags, addr_len); } copied = -EAGAIN; } ret = copied; unlock: release_sock(sk); sk_psock_put(sk, psock); return ret; } static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { bool cork = false, enospc = sk_msg_full(msg), redir_ingress; struct sock *sk_redir; u32 tosend, origsize, sent, delta = 0; u32 eval; int ret; more_data: if (psock->eval == __SK_NONE) { /* Track delta in msg size to add/subtract it on SK_DROP from * returned to user copied size. This ensures user doesn't * get a positive return code with msg_cut_data and SK_DROP * verdict. */ delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); delta -= msg->sg.size; } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc) { psock->cork_bytes = msg->cork_bytes - msg->sg.size; if (!psock->cork) { psock->cork = kzalloc(sizeof(*psock->cork), GFP_ATOMIC | __GFP_NOWARN); if (!psock->cork) return -ENOMEM; } memcpy(psock->cork, msg, sizeof(*msg)); return 0; } tosend = msg->sg.size; if (psock->apply_bytes && psock->apply_bytes < tosend) tosend = psock->apply_bytes; eval = __SK_NONE; switch (psock->eval) { case __SK_PASS: ret = tcp_bpf_push(sk, msg, tosend, flags, true); if (unlikely(ret)) { *copied -= sk_msg_free(sk, msg); break; } sk_msg_apply_bytes(psock, tosend); break; case __SK_REDIRECT: redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); if (!psock->apply_bytes) { /* Clean up before releasing the sock lock. */ eval = psock->eval; psock->eval = __SK_NONE; psock->sk_redir = NULL; } if (psock->cork) { cork = true; psock->cork = NULL; } release_sock(sk); origsize = msg->sg.size; ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, msg, tosend, flags); sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) sock_put(sk_redir); lock_sock(sk); sk_mem_uncharge(sk, sent); if (unlikely(ret < 0)) { int free = sk_msg_free(sk, msg); if (!cork) *copied -= free; } if (cork) { sk_msg_free(sk, msg); kfree(msg); msg = NULL; ret = 0; } break; case __SK_DROP: default: sk_msg_free(sk, msg); sk_msg_apply_bytes(psock, tosend); *copied -= (tosend + delta); return -EACCES; } if (likely(!ret)) { if (!psock->apply_bytes) { psock->eval = __SK_NONE; if (psock->sk_redir) { sock_put(psock->sk_redir); psock->sk_redir = NULL; } } if (msg && msg->sg.data[msg->sg.start].page_link && msg->sg.data[msg->sg.start].length) goto more_data; } return ret; } static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; int copied = 0, err = 0, ret = 0; struct sk_psock *psock; long timeo; int flags; /* Don't let internal flags through */ flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); flags |= MSG_NO_SHARED_FRAGS; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_sendmsg(sk, msg, size); lock_sock(sk); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); while (msg_data_left(msg)) { bool enospc = false; u32 copy, osize; if (sk->sk_err) { err = -sk->sk_err; goto out_err; } copy = msg_data_left(msg); if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; if (psock->cork) { msg_tx = psock->cork; } else { msg_tx = &tmp; sk_msg_init(msg_tx); } osize = msg_tx->sg.size; err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1); if (err) { if (err != -ENOSPC) goto wait_for_memory; enospc = true; copy = msg_tx->sg.size - osize; } ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, copy); if (ret < 0) { sk_msg_trim(sk, msg_tx, osize); goto out_err; } copied += ret; if (psock->cork_bytes) { if (size > psock->cork_bytes) psock->cork_bytes = 0; else psock->cork_bytes -= size; if (psock->cork_bytes && !enospc) goto out_err; /* All cork bytes are accounted, rerun the prog. */ psock->eval = __SK_NONE; psock->cork_bytes = 0; } err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags); if (unlikely(err < 0)) goto out_err; continue; wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: err = sk_stream_wait_memory(sk, &timeo); if (err) { if (msg_tx && msg_tx != psock->cork) sk_msg_free(sk, msg_tx); goto out_err; } } out_err: if (err < 0) err = sk_stream_error(sk, msg->msg_flags, err); release_sock(sk); sk_psock_put(sk, psock); return copied > 0 ? copied : err; } enum { TCP_BPF_IPV4, TCP_BPF_IPV6, TCP_BPF_NUM_PROTS, }; enum { TCP_BPF_BASE, TCP_BPF_TX, TCP_BPF_RX, TCP_BPF_TXRX, TCP_BPF_NUM_CFGS, }; static struct proto *tcpv6_prot_saved __read_mostly; static DEFINE_SPINLOCK(tcpv6_prot_lock); static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS]; static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; prot[TCP_BPF_BASE].destroy = sock_map_destroy; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX]; prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser; } static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { spin_lock_bh(&tcpv6_prot_lock); if (likely(ops != tcpv6_prot_saved)) { tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops); smp_store_release(&tcpv6_prot_saved, ops); } spin_unlock_bh(&tcpv6_prot_lock); } } static int __init tcp_bpf_v4_build_proto(void) { tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot); return 0; } late_initcall(tcp_bpf_v4_build_proto); static int tcp_bpf_assert_proto_ops(struct proto *ops) { /* In order to avoid retpoline, we make assumptions when we call * into ops if e.g. a psock is not present. Make sure they are * indeed valid assumptions. */ return ops->recvmsg == tcp_recvmsg && ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP; } #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, sk_read_actor_t recv_actor) { struct sock *sk = strp->sk; struct sk_psock *psock; struct tcp_sock *tp; int copied = 0; tp = tcp_sk(sk); rcu_read_lock(); psock = sk_psock(sk); if (WARN_ON_ONCE(!psock)) { desc->error = -EINVAL; goto out; } psock->ingress_bytes = 0; copied = tcp_read_sock_noack(sk, desc, recv_actor, true, &psock->copied_seq); if (copied < 0) goto out; /* recv_actor may redirect skb to another socket (SK_REDIRECT) or * just put skb into ingress queue of current socket (SK_PASS). * For SK_REDIRECT, we need to ack the frame immediately but for * SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser(). */ tp->copied_seq = psock->copied_seq - psock->ingress_bytes; tcp_rcv_space_adjust(sk); __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes); out: rcu_read_unlock(); return copied; } #endif /* CONFIG_BPF_STREAM_PARSER */ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; if (psock->progs.stream_verdict || psock->progs.skb_verdict) { config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX; } if (restore) { if (inet_csk_has_ulp(sk)) { /* TLS does not have an unhash proto in SW cases, * but we need to ensure we stop using the sock_map * unhash routine because the associated psock is being * removed. So use the original unhash handler. */ WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash); tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); } else { sk->sk_write_space = psock->saved_write_space; /* Pairs with lockless read in sk_clone_lock() */ sock_replace_proto(sk, psock->sk_proto); } return 0; } if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) return -EINVAL; tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } /* Pairs with lockless read in sk_clone_lock() */ sock_replace_proto(sk, &tcp_bpf_prots[family][config]); return 0; } EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to * the default ones because the child does not inherit the psock state * that tcp_bpf callbacks expect. */ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { struct proto *prot = newsk->sk_prot; if (is_insidevar(prot, tcp_bpf_prots)) newsk->sk_prot = sk->sk_prot_creator; } #endif /* CONFIG_BPF_SYSCALL */ |
132 132 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 2003 International Business Machines, Corp. * * This file is part of the SCTP kernel implementation * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Sridhar Samudrala <sri@us.ibm.com> */ #include <linux/types.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/export.h> #include <net/sctp/sctp.h> #include <net/ip.h> /* for snmp_fold_field */ static const struct snmp_mib sctp_snmp_list[] = { SNMP_MIB_ITEM("SctpCurrEstab", SCTP_MIB_CURRESTAB), SNMP_MIB_ITEM("SctpActiveEstabs", SCTP_MIB_ACTIVEESTABS), SNMP_MIB_ITEM("SctpPassiveEstabs", SCTP_MIB_PASSIVEESTABS), SNMP_MIB_ITEM("SctpAborteds", SCTP_MIB_ABORTEDS), SNMP_MIB_ITEM("SctpShutdowns", SCTP_MIB_SHUTDOWNS), SNMP_MIB_ITEM("SctpOutOfBlues", SCTP_MIB_OUTOFBLUES), SNMP_MIB_ITEM("SctpChecksumErrors", SCTP_MIB_CHECKSUMERRORS), SNMP_MIB_ITEM("SctpOutCtrlChunks", SCTP_MIB_OUTCTRLCHUNKS), SNMP_MIB_ITEM("SctpOutOrderChunks", SCTP_MIB_OUTORDERCHUNKS), SNMP_MIB_ITEM("SctpOutUnorderChunks", SCTP_MIB_OUTUNORDERCHUNKS), SNMP_MIB_ITEM("SctpInCtrlChunks", SCTP_MIB_INCTRLCHUNKS), SNMP_MIB_ITEM("SctpInOrderChunks", SCTP_MIB_INORDERCHUNKS), SNMP_MIB_ITEM("SctpInUnorderChunks", SCTP_MIB_INUNORDERCHUNKS), SNMP_MIB_ITEM("SctpFragUsrMsgs", SCTP_MIB_FRAGUSRMSGS), SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS), SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS), SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS), SNMP_MIB_ITEM("SctpT1InitExpireds", SCTP_MIB_T1_INIT_EXPIREDS), SNMP_MIB_ITEM("SctpT1CookieExpireds", SCTP_MIB_T1_COOKIE_EXPIREDS), SNMP_MIB_ITEM("SctpT2ShutdownExpireds", SCTP_MIB_T2_SHUTDOWN_EXPIREDS), SNMP_MIB_ITEM("SctpT3RtxExpireds", SCTP_MIB_T3_RTX_EXPIREDS), SNMP_MIB_ITEM("SctpT4RtoExpireds", SCTP_MIB_T4_RTO_EXPIREDS), SNMP_MIB_ITEM("SctpT5ShutdownGuardExpireds", SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS), SNMP_MIB_ITEM("SctpDelaySackExpireds", SCTP_MIB_DELAY_SACK_EXPIREDS), SNMP_MIB_ITEM("SctpAutocloseExpireds", SCTP_MIB_AUTOCLOSE_EXPIREDS), SNMP_MIB_ITEM("SctpT3Retransmits", SCTP_MIB_T3_RETRANSMITS), SNMP_MIB_ITEM("SctpPmtudRetransmits", SCTP_MIB_PMTUD_RETRANSMITS), SNMP_MIB_ITEM("SctpFastRetransmits", SCTP_MIB_FAST_RETRANSMITS), SNMP_MIB_ITEM("SctpInPktSoftirq", SCTP_MIB_IN_PKT_SOFTIRQ), SNMP_MIB_ITEM("SctpInPktBacklog", SCTP_MIB_IN_PKT_BACKLOG), SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS), SNMP_MIB_ITEM("SctpInDataChunkDiscards", SCTP_MIB_IN_DATA_CHUNK_DISCARDS), SNMP_MIB_SENTINEL }; /* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */ static int sctp_snmp_seq_show(struct seq_file *seq, void *v) { unsigned long buff[SCTP_MIB_MAX]; struct net *net = seq->private; int i; memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX); snmp_get_cpu_field_batch(buff, sctp_snmp_list, net->sctp.sctp_statistics); for (i = 0; sctp_snmp_list[i].name; i++) seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, buff[i]); return 0; } /* Dump local addresses of an association/endpoint. */ static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb) { struct sctp_association *asoc; struct sctp_sockaddr_entry *laddr; struct sctp_transport *peer; union sctp_addr *addr, *primary = NULL; struct sctp_af *af; if (epb->type == SCTP_EP_TYPE_ASSOCIATION) { asoc = sctp_assoc(epb); peer = asoc->peer.primary_path; if (unlikely(peer == NULL)) { WARN(1, "Association %p with NULL primary path!\n", asoc); return; } primary = &peer->saddr; } rcu_read_lock(); list_for_each_entry_rcu(laddr, &epb->bind_addr.address_list, list) { if (!laddr->valid) continue; addr = &laddr->a; af = sctp_get_af_specific(addr->sa.sa_family); if (primary && af->cmp_addr(addr, primary)) { seq_printf(seq, "*"); } af->seq_dump_addr(seq, addr); } rcu_read_unlock(); } /* Dump remote addresses of an association. */ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_association *assoc) { struct sctp_transport *transport; union sctp_addr *addr, *primary; struct sctp_af *af; primary = &assoc->peer.primary_addr; list_for_each_entry_rcu(transport, &assoc->peer.transport_addr_list, transports) { addr = &transport->ipaddr; af = sctp_get_af_specific(addr->sa.sa_family); if (af->cmp_addr(addr, primary)) { seq_printf(seq, "*"); } af->seq_dump_addr(seq, addr); } } static void *sctp_eps_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos >= sctp_ep_hashsize) return NULL; if (*pos < 0) *pos = 0; if (*pos == 0) seq_printf(seq, " ENDPT SOCK STY SST HBKT LPORT UID INODE LADDRS\n"); return (void *)pos; } static void sctp_eps_seq_stop(struct seq_file *seq, void *v) { } static void *sctp_eps_seq_next(struct seq_file *seq, void *v, loff_t *pos) { if (++*pos >= sctp_ep_hashsize) return NULL; return pos; } /* Display sctp endpoints (/proc/net/sctp/eps). */ static int sctp_eps_seq_show(struct seq_file *seq, void *v) { struct sctp_hashbucket *head; struct sctp_endpoint *ep; struct sock *sk; int hash = *(loff_t *)v; if (hash >= sctp_ep_hashsize) return -ENOMEM; head = &sctp_ep_hashtable[hash]; read_lock_bh(&head->lock); sctp_for_each_hentry(ep, &head->chain) { sk = ep->base.sk; if (!net_eq(sock_net(sk), seq_file_net(seq))) continue; seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5u %5lu ", ep, sk, sctp_sk(sk)->type, sk->sk_state, hash, ep->base.bind_addr.port, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk)); sctp_seq_dump_local_addrs(seq, &ep->base); seq_printf(seq, "\n"); } read_unlock_bh(&head->lock); return 0; } static const struct seq_operations sctp_eps_ops = { .start = sctp_eps_seq_start, .next = sctp_eps_seq_next, .stop = sctp_eps_seq_stop, .show = sctp_eps_seq_show, }; struct sctp_ht_iter { struct seq_net_private p; struct rhashtable_iter hti; }; static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; sctp_transport_walk_start(&iter->hti); return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos); } static void sctp_transport_seq_stop(struct seq_file *seq, void *v) { struct sctp_ht_iter *iter = seq->private; if (v && v != SEQ_START_TOKEN) { struct sctp_transport *transport = v; sctp_transport_put(transport); } sctp_transport_walk_stop(&iter->hti); } static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; if (v && v != SEQ_START_TOKEN) { struct sctp_transport *transport = v; sctp_transport_put(transport); } ++*pos; return sctp_transport_get_next(seq_file_net(seq), &iter->hti); } /* Display sctp associations (/proc/net/sctp/assocs). */ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) { struct sctp_transport *transport; struct sctp_association *assoc; struct sctp_ep_common *epb; struct sock *sk; if (v == SEQ_START_TOKEN) { seq_printf(seq, " ASSOC SOCK STY SST ST HBKT " "ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT " "RPORT LADDRS <-> RADDRS " "HBINT INS OUTS MAXRT T1X T2X RTXC " "wmema wmemq sndbuf rcvbuf\n"); return 0; } transport = (struct sctp_transport *)v; assoc = transport->asoc; epb = &assoc->base; sk = epb->sk; seq_printf(seq, "%8pK %8pK %-3d %-3d %-2d %-4d " "%4d %8d %8d %7u %5lu %-5d %5d ", assoc, sk, sctp_sk(sk)->type, sk->sk_state, assoc->state, 0, assoc->assoc_id, assoc->sndbuf_used, atomic_read(&assoc->rmem_alloc), from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk), epb->bind_addr.port, assoc->peer.port); seq_printf(seq, " "); sctp_seq_dump_local_addrs(seq, epb); seq_printf(seq, "<-> "); sctp_seq_dump_remote_addrs(seq, assoc); seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d " "%8d %8d %8d %8d", assoc->hbinterval, assoc->stream.incnt, assoc->stream.outcnt, assoc->max_retrans, assoc->init_retries, assoc->shutdown_retries, assoc->rtx_data_chunks, refcount_read(&sk->sk_wmem_alloc), READ_ONCE(sk->sk_wmem_queued), sk->sk_sndbuf, sk->sk_rcvbuf); seq_printf(seq, "\n"); return 0; } static const struct seq_operations sctp_assoc_ops = { .start = sctp_transport_seq_start, .next = sctp_transport_seq_next, .stop = sctp_transport_seq_stop, .show = sctp_assocs_seq_show, }; static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) { struct sctp_association *assoc; struct sctp_transport *transport, *tsp; if (v == SEQ_START_TOKEN) { seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX " "REM_ADDR_RTX START STATE\n"); return 0; } transport = (struct sctp_transport *)v; assoc = transport->asoc; list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list, transports) { /* * The remote address (ADDR) */ tsp->af_specific->seq_dump_addr(seq, &tsp->ipaddr); seq_printf(seq, " "); /* * The association ID (ASSOC_ID) */ seq_printf(seq, "%d ", tsp->asoc->assoc_id); /* * If the Heartbeat is active (HB_ACT) * Note: 1 = Active, 0 = Inactive */ seq_printf(seq, "%d ", timer_pending(&tsp->hb_timer)); /* * Retransmit time out (RTO) */ seq_printf(seq, "%lu ", tsp->rto); /* * Maximum path retransmit count (PATH_MAX_RTX) */ seq_printf(seq, "%d ", tsp->pathmaxrxt); /* * remote address retransmit count (REM_ADDR_RTX) * Note: We don't have a way to tally this at the moment * so lets just leave it as zero for the moment */ seq_puts(seq, "0 "); /* * remote address start time (START). This is also not * currently implemented, but we can record it with a * jiffies marker in a subsequent patch */ seq_puts(seq, "0 "); /* * The current state of this destination. I.e. * SCTP_ACTIVE, SCTP_INACTIVE, ... */ seq_printf(seq, "%d", tsp->state); seq_printf(seq, "\n"); } return 0; } static const struct seq_operations sctp_remaddr_ops = { .start = sctp_transport_seq_start, .next = sctp_transport_seq_next, .stop = sctp_transport_seq_stop, .show = sctp_remaddr_seq_show, }; /* Set up the proc fs entry for the SCTP protocol. */ int __net_init sctp_proc_init(struct net *net) { net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net); if (!net->sctp.proc_net_sctp) return -ENOMEM; if (!proc_create_net_single("snmp", 0444, net->sctp.proc_net_sctp, sctp_snmp_seq_show, NULL)) goto cleanup; if (!proc_create_net("eps", 0444, net->sctp.proc_net_sctp, &sctp_eps_ops, sizeof(struct seq_net_private))) goto cleanup; if (!proc_create_net("assocs", 0444, net->sctp.proc_net_sctp, &sctp_assoc_ops, sizeof(struct sctp_ht_iter))) goto cleanup; if (!proc_create_net("remaddr", 0444, net->sctp.proc_net_sctp, &sctp_remaddr_ops, sizeof(struct sctp_ht_iter))) goto cleanup; return 0; cleanup: remove_proc_subtree("sctp", net->proc_net); net->sctp.proc_net_sctp = NULL; return -ENOMEM; } |
7 60 11 60 9 11 10 11 10 8 11 134 101 60 60 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 | // SPDX-License-Identifier: GPL-2.0 /* * lib/minmax.c: windowed min/max tracker * * Kathleen Nichols' algorithm for tracking the minimum (or maximum) * value of a data stream over some fixed time interval. (E.g., * the minimum RTT over the past five minutes.) It uses constant * space and constant time per update yet almost always delivers * the same minimum as an implementation that has to keep all the * data in the window. * * The algorithm keeps track of the best, 2nd best & 3rd best min * values, maintaining an invariant that the measurement time of * the n'th best >= n-1'th best. It also makes sure that the three * values are widely separated in the time window since that bounds * the worse case error when that data is monotonically increasing * over the window. * * Upon getting a new min, we can forget everything earlier because * it has no value - the new min is <= everything else in the window * by definition and it's the most recent. So we restart fresh on * every new min and overwrites 2nd & 3rd choices. The same property * holds for 2nd & 3rd best. */ #include <linux/module.h> #include <linux/win_minmax.h> /* As time advances, update the 1st, 2nd, and 3rd choices. */ static u32 minmax_subwin_update(struct minmax *m, u32 win, const struct minmax_sample *val) { u32 dt = val->t - m->s[0].t; if (unlikely(dt > win)) { /* * Passed entire window without a new val so make 2nd * choice the new val & 3rd choice the new 2nd choice. * we may have to iterate this since our 2nd choice * may also be outside the window (we checked on entry * that the third choice was in the window). */ m->s[0] = m->s[1]; m->s[1] = m->s[2]; m->s[2] = *val; if (unlikely(val->t - m->s[0].t > win)) { m->s[0] = m->s[1]; m->s[1] = m->s[2]; m->s[2] = *val; } } else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) { /* * We've passed a quarter of the window without a new val * so take a 2nd choice from the 2nd quarter of the window. */ m->s[2] = m->s[1] = *val; } else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) { /* * We've passed half the window without finding a new val * so take a 3rd choice from the last half of the window */ m->s[2] = *val; } return m->s[0].v; } /* Check if new measurement updates the 1st, 2nd or 3rd choice max. */ u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas) { struct minmax_sample val = { .t = t, .v = meas }; if (unlikely(val.v >= m->s[0].v) || /* found new max? */ unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ return minmax_reset(m, t, meas); /* forget earlier samples */ if (unlikely(val.v >= m->s[1].v)) m->s[2] = m->s[1] = val; else if (unlikely(val.v >= m->s[2].v)) m->s[2] = val; return minmax_subwin_update(m, win, &val); } EXPORT_SYMBOL(minmax_running_max); /* Check if new measurement updates the 1st, 2nd or 3rd choice min. */ u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas) { struct minmax_sample val = { .t = t, .v = meas }; if (unlikely(val.v <= m->s[0].v) || /* found new min? */ unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ return minmax_reset(m, t, meas); /* forget earlier samples */ if (unlikely(val.v <= m->s[1].v)) m->s[2] = m->s[1] = val; else if (unlikely(val.v <= m->s[2].v)) m->s[2] = val; return minmax_subwin_update(m, win, &val); } EXPORT_SYMBOL(minmax_running_min); |
28 27 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 | #ifndef __LINUX_ERSPAN_H #define __LINUX_ERSPAN_H /* * GRE header for ERSPAN type I encapsulation (4 octets [34:37]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |0|0|0|0|0|00000|000000000|00000| Protocol Type for ERSPAN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * The Type I ERSPAN frame format is based on the barebones IP + GRE * encapsulation (as described above) on top of the raw mirrored frame. * There is no extra ERSPAN header. * * * GRE header for ERSPAN type II and II encapsulation (8 octets [34:41]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |0|0|0|1|0|00000|000000000|00000| Protocol Type for ERSPAN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sequence Number (increments per packet per session) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Note that in the above GRE header [RFC1701] out of the C, R, K, S, * s, Recur, Flags, Version fields only S (bit 03) is set to 1. The * other fields are set to zero, so only a sequence number follows. * * ERSPAN Version 1 (Type II) header (8 octets [42:49]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ver | VLAN | COS | En|T| Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Reserved | Index | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * * ERSPAN Version 2 (Type III) header (12 octets [42:49]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ver | VLAN | COS |BSO|T| Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Timestamp | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | SGT |P| FT | Hw ID |D|Gra|O| * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Platform Specific SubHeader (8 octets, optional) * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Platf ID | Platform Specific Info | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Platform Specific Info | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * GRE proto ERSPAN type I/II = 0x88BE, type III = 0x22EB */ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/skbuff.h> #include <uapi/linux/erspan.h> #define ERSPAN_VERSION 0x1 /* ERSPAN type II */ #define VER_MASK 0xf000 #define VLAN_MASK 0x0fff #define COS_MASK 0xe000 #define EN_MASK 0x1800 #define T_MASK 0x0400 #define ID_MASK 0x03ff #define INDEX_MASK 0xfffff #define ERSPAN_VERSION2 0x2 /* ERSPAN type III*/ #define BSO_MASK EN_MASK #define SGT_MASK 0xffff0000 #define P_MASK 0x8000 #define FT_MASK 0x7c00 #define HWID_MASK 0x03f0 #define DIR_MASK 0x0008 #define GRA_MASK 0x0006 #define O_MASK 0x0001 #define HWID_OFFSET 4 #define DIR_OFFSET 3 enum erspan_encap_type { ERSPAN_ENCAP_NOVLAN = 0x0, /* originally without VLAN tag */ ERSPAN_ENCAP_ISL = 0x1, /* originally ISL encapsulated */ ERSPAN_ENCAP_8021Q = 0x2, /* originally 802.1Q encapsulated */ ERSPAN_ENCAP_INFRAME = 0x3, /* VLAN tag preserved in frame */ }; #define ERSPAN_V1_MDSIZE 4 #define ERSPAN_V2_MDSIZE 8 struct erspan_base_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 vlan_upper:4, ver:4; __u8 vlan:8; __u8 session_id_upper:2, t:1, en:2, cos:3; __u8 session_id:8; #elif defined(__BIG_ENDIAN_BITFIELD) __u8 ver: 4, vlan_upper:4; __u8 vlan:8; __u8 cos:3, en:2, t:1, session_id_upper:2; __u8 session_id:8; #else #error "Please fix <asm/byteorder.h>" #endif }; static inline void set_session_id(struct erspan_base_hdr *ershdr, u16 id) { ershdr->session_id = id & 0xff; ershdr->session_id_upper = (id >> 8) & 0x3; } static inline u16 get_session_id(const struct erspan_base_hdr *ershdr) { return (ershdr->session_id_upper << 8) + ershdr->session_id; } static inline void set_vlan(struct erspan_base_hdr *ershdr, u16 vlan) { ershdr->vlan = vlan & 0xff; ershdr->vlan_upper = (vlan >> 8) & 0xf; } static inline u16 get_vlan(const struct erspan_base_hdr *ershdr) { return (ershdr->vlan_upper << 8) + ershdr->vlan; } static inline void set_hwid(struct erspan_md2 *md2, u8 hwid) { md2->hwid = hwid & 0xf; md2->hwid_upper = (hwid >> 4) & 0x3; } static inline u8 get_hwid(const struct erspan_md2 *md2) { return (md2->hwid_upper << 4) + md2->hwid; } static inline int erspan_hdr_len(int version) { if (version == 0) return 0; return sizeof(struct erspan_base_hdr) + (version == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); } static inline u8 tos_to_cos(u8 tos) { u8 dscp, cos; dscp = tos >> 2; cos = dscp >> 3; return cos; } static inline void erspan_build_header(struct sk_buff *skb, u32 id, u32 index, bool truncate, bool is_ipv4) { struct ethhdr *eth = (struct ethhdr *)skb->data; enum erspan_encap_type enc_type; struct erspan_base_hdr *ershdr; struct qtag_prefix { __be16 eth_type; __be16 tci; } *qp; u16 vlan_tci = 0; u8 tos; __be32 *idx; tos = is_ipv4 ? ip_hdr(skb)->tos : (ipv6_hdr(skb)->priority << 4) + (ipv6_hdr(skb)->flow_lbl[0] >> 4); enc_type = ERSPAN_ENCAP_NOVLAN; /* If mirrored packet has vlan tag, extract tci and * preserve vlan header in the mirrored frame. */ if (eth->h_proto == htons(ETH_P_8021Q)) { qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); vlan_tci = ntohs(qp->tci); enc_type = ERSPAN_ENCAP_INFRAME; } skb_push(skb, sizeof(*ershdr) + ERSPAN_V1_MDSIZE); ershdr = (struct erspan_base_hdr *)skb->data; memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V1_MDSIZE); /* Build base header */ ershdr->ver = ERSPAN_VERSION; ershdr->cos = tos_to_cos(tos); ershdr->en = enc_type; ershdr->t = truncate; set_vlan(ershdr, vlan_tci); set_session_id(ershdr, id); /* Build metadata */ idx = (__be32 *)(ershdr + 1); *idx = htonl(index & INDEX_MASK); } /* ERSPAN GRA: timestamp granularity * 00b --> granularity = 100 microseconds * 01b --> granularity = 100 nanoseconds * 10b --> granularity = IEEE 1588 * Here we only support 100 microseconds. */ static inline __be32 erspan_get_timestamp(void) { u64 h_usecs; ktime_t kt; kt = ktime_get_real(); h_usecs = ktime_divns(kt, 100 * NSEC_PER_USEC); /* ERSPAN base header only has 32-bit, * so it wraps around 4 days. */ return htonl((u32)h_usecs); } /* ERSPAN BSO (Bad/Short/Oversized), see RFC1757 * 00b --> Good frame with no error, or unknown integrity * 01b --> Payload is a Short Frame * 10b --> Payload is an Oversized Frame * 11b --> Payload is a Bad Frame with CRC or Alignment Error */ enum erspan_bso { BSO_NOERROR = 0x0, BSO_SHORT = 0x1, BSO_OVERSIZED = 0x2, BSO_BAD = 0x3, }; static inline u8 erspan_detect_bso(struct sk_buff *skb) { /* BSO_BAD is not handled because the frame CRC * or alignment error information is in FCS. */ if (skb->len < ETH_ZLEN) return BSO_SHORT; if (skb->len > ETH_FRAME_LEN) return BSO_OVERSIZED; return BSO_NOERROR; } static inline void erspan_build_header_v2(struct sk_buff *skb, u32 id, u8 direction, u16 hwid, bool truncate, bool is_ipv4) { struct ethhdr *eth = (struct ethhdr *)skb->data; struct erspan_base_hdr *ershdr; struct erspan_md2 *md2; struct qtag_prefix { __be16 eth_type; __be16 tci; } *qp; u16 vlan_tci = 0; u8 gra = 0; /* 100 usec */ u8 bso = 0; /* Bad/Short/Oversized */ u8 sgt = 0; u8 tos; tos = is_ipv4 ? ip_hdr(skb)->tos : (ipv6_hdr(skb)->priority << 4) + (ipv6_hdr(skb)->flow_lbl[0] >> 4); /* Unlike v1, v2 does not have En field, * so only extract vlan tci field. */ if (eth->h_proto == htons(ETH_P_8021Q)) { qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); vlan_tci = ntohs(qp->tci); } bso = erspan_detect_bso(skb); skb_push(skb, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); ershdr = (struct erspan_base_hdr *)skb->data; memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); /* Build base header */ ershdr->ver = ERSPAN_VERSION2; ershdr->cos = tos_to_cos(tos); ershdr->en = bso; ershdr->t = truncate; set_vlan(ershdr, vlan_tci); set_session_id(ershdr, id); /* Build metadata */ md2 = (struct erspan_md2 *)(ershdr + 1); md2->timestamp = erspan_get_timestamp(); md2->sgt = htons(sgt); md2->p = 1; md2->ft = 0; md2->dir = direction; md2->gra = gra; md2->o = 0; set_hwid(md2, hwid); } #endif |
23 5 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Framework and drivers for configuring and reading different PHYs * Based on code in sungem_phy.c and (long-removed) gianfar_phy.c * * Author: Andy Fleming * * Copyright (c) 2004 Freescale Semiconductor, Inc. */ #ifndef __PHY_H #define __PHY_H #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/leds.h> #include <linux/linkmode.h> #include <linux/netlink.h> #include <linux/mdio.h> #include <linux/mii.h> #include <linux/mii_timestamper.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/mod_devicetable.h> #include <linux/u64_stats_sync.h> #include <linux/irqreturn.h> #include <linux/iopoll.h> #include <linux/refcount.h> #include <linux/atomic.h> #include <net/eee.h> extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1s_p2mp_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_fibre_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap1_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap2_features) __ro_after_init; #define PHY_BASIC_FEATURES ((unsigned long *)&phy_basic_features) #define PHY_BASIC_T1_FEATURES ((unsigned long *)&phy_basic_t1_features) #define PHY_BASIC_T1S_P2MP_FEATURES ((unsigned long *)&phy_basic_t1s_p2mp_features) #define PHY_GBIT_FEATURES ((unsigned long *)&phy_gbit_features) #define PHY_GBIT_FIBRE_FEATURES ((unsigned long *)&phy_gbit_fibre_features) #define PHY_10GBIT_FEATURES ((unsigned long *)&phy_10gbit_features) #define PHY_EEE_CAP1_FEATURES ((unsigned long *)&phy_eee_cap1_features) #define PHY_EEE_CAP2_FEATURES ((unsigned long *)&phy_eee_cap2_features) extern const int phy_basic_ports_array[3]; /* * Set phydev->irq to PHY_POLL if interrupts are not supported, * or not desired for this PHY. Set to PHY_MAC_INTERRUPT if * the attached MAC driver handles the interrupt */ #define PHY_POLL -1 #define PHY_MAC_INTERRUPT -2 #define PHY_IS_INTERNAL 0x00000001 #define PHY_RST_AFTER_CLK_EN 0x00000002 #define PHY_POLL_CABLE_TEST 0x00000004 #define PHY_ALWAYS_CALL_SUSPEND 0x00000008 #define MDIO_DEVICE_IS_PHY 0x80000000 /** * enum phy_interface_t - Interface Mode definitions * * @PHY_INTERFACE_MODE_NA: Not Applicable - don't touch * @PHY_INTERFACE_MODE_INTERNAL: No interface, MAC and PHY combined * @PHY_INTERFACE_MODE_MII: Media-independent interface * @PHY_INTERFACE_MODE_GMII: Gigabit media-independent interface * @PHY_INTERFACE_MODE_SGMII: Serial gigabit media-independent interface * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface * @PHY_INTERFACE_MODE_RMII: Reduced Media Independent Interface * @PHY_INTERFACE_MODE_REVRMII: Reduced Media Independent Interface in PHY role * @PHY_INTERFACE_MODE_RGMII: Reduced gigabit media-independent interface * @PHY_INTERFACE_MODE_RGMII_ID: RGMII with Internal RX+TX delay * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RTBI: Reduced TBI * @PHY_INTERFACE_MODE_SMII: Serial MII * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface * @PHY_INTERFACE_MODE_XLGMII:40 gigabit media-independent interface * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax * @PHY_INTERFACE_MODE_PSGMII: Penta SGMII * @PHY_INTERFACE_MODE_QSGMII: Quad SGMII * @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII * @PHY_INTERFACE_MODE_100BASEX: 100 BaseX * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX * @PHY_INTERFACE_MODE_5GBASER: 5G BaseR * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR * @PHY_INTERFACE_MODE_25GBASER: 25G BaseR * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII * @PHY_INTERFACE_MODE_1000BASEKX: 1000Base-KX - with Clause 73 AN * @PHY_INTERFACE_MODE_10G_QXGMII: 10G-QXGMII - 4 ports over 10G USXGMII * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY. */ typedef enum { PHY_INTERFACE_MODE_NA, PHY_INTERFACE_MODE_INTERNAL, PHY_INTERFACE_MODE_MII, PHY_INTERFACE_MODE_GMII, PHY_INTERFACE_MODE_SGMII, PHY_INTERFACE_MODE_TBI, PHY_INTERFACE_MODE_REVMII, PHY_INTERFACE_MODE_RMII, PHY_INTERFACE_MODE_REVRMII, PHY_INTERFACE_MODE_RGMII, PHY_INTERFACE_MODE_RGMII_ID, PHY_INTERFACE_MODE_RGMII_RXID, PHY_INTERFACE_MODE_RGMII_TXID, PHY_INTERFACE_MODE_RTBI, PHY_INTERFACE_MODE_SMII, PHY_INTERFACE_MODE_XGMII, PHY_INTERFACE_MODE_XLGMII, PHY_INTERFACE_MODE_MOCA, PHY_INTERFACE_MODE_PSGMII, PHY_INTERFACE_MODE_QSGMII, PHY_INTERFACE_MODE_TRGMII, PHY_INTERFACE_MODE_100BASEX, PHY_INTERFACE_MODE_1000BASEX, PHY_INTERFACE_MODE_2500BASEX, PHY_INTERFACE_MODE_5GBASER, PHY_INTERFACE_MODE_RXAUI, PHY_INTERFACE_MODE_XAUI, /* 10GBASE-R, XFI, SFI - single lane 10G Serdes */ PHY_INTERFACE_MODE_10GBASER, PHY_INTERFACE_MODE_25GBASER, PHY_INTERFACE_MODE_USXGMII, /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, PHY_INTERFACE_MODE_QUSGMII, PHY_INTERFACE_MODE_1000BASEKX, PHY_INTERFACE_MODE_10G_QXGMII, PHY_INTERFACE_MODE_MAX, } phy_interface_t; /* PHY interface mode bitmap handling */ #define DECLARE_PHY_INTERFACE_MASK(name) \ DECLARE_BITMAP(name, PHY_INTERFACE_MODE_MAX) static inline void phy_interface_zero(unsigned long *intf) { bitmap_zero(intf, PHY_INTERFACE_MODE_MAX); } static inline bool phy_interface_empty(const unsigned long *intf) { return bitmap_empty(intf, PHY_INTERFACE_MODE_MAX); } static inline void phy_interface_and(unsigned long *dst, const unsigned long *a, const unsigned long *b) { bitmap_and(dst, a, b, PHY_INTERFACE_MODE_MAX); } static inline void phy_interface_or(unsigned long *dst, const unsigned long *a, const unsigned long *b) { bitmap_or(dst, a, b, PHY_INTERFACE_MODE_MAX); } static inline void phy_interface_set_rgmii(unsigned long *intf) { __set_bit(PHY_INTERFACE_MODE_RGMII, intf); __set_bit(PHY_INTERFACE_MODE_RGMII_ID, intf); __set_bit(PHY_INTERFACE_MODE_RGMII_RXID, intf); __set_bit(PHY_INTERFACE_MODE_RGMII_TXID, intf); } /** * phy_modes - map phy_interface_t enum to device tree binding of phy-mode * @interface: enum phy_interface_t value * * Description: maps enum &phy_interface_t defined in this file * into the device tree binding of 'phy-mode', so that Ethernet * device driver can get PHY interface from device tree. */ static inline const char *phy_modes(phy_interface_t interface) { switch (interface) { case PHY_INTERFACE_MODE_NA: return ""; case PHY_INTERFACE_MODE_INTERNAL: return "internal"; case PHY_INTERFACE_MODE_MII: return "mii"; case PHY_INTERFACE_MODE_GMII: return "gmii"; case PHY_INTERFACE_MODE_SGMII: return "sgmii"; case PHY_INTERFACE_MODE_TBI: return "tbi"; case PHY_INTERFACE_MODE_REVMII: return "rev-mii"; case PHY_INTERFACE_MODE_RMII: return "rmii"; case PHY_INTERFACE_MODE_REVRMII: return "rev-rmii"; case PHY_INTERFACE_MODE_RGMII: return "rgmii"; case PHY_INTERFACE_MODE_RGMII_ID: return "rgmii-id"; case PHY_INTERFACE_MODE_RGMII_RXID: return "rgmii-rxid"; case PHY_INTERFACE_MODE_RGMII_TXID: return "rgmii-txid"; case PHY_INTERFACE_MODE_RTBI: return "rtbi"; case PHY_INTERFACE_MODE_SMII: return "smii"; case PHY_INTERFACE_MODE_XGMII: return "xgmii"; case PHY_INTERFACE_MODE_XLGMII: return "xlgmii"; case PHY_INTERFACE_MODE_MOCA: return "moca"; case PHY_INTERFACE_MODE_PSGMII: return "psgmii"; case PHY_INTERFACE_MODE_QSGMII: return "qsgmii"; case PHY_INTERFACE_MODE_TRGMII: return "trgmii"; case PHY_INTERFACE_MODE_1000BASEX: return "1000base-x"; case PHY_INTERFACE_MODE_1000BASEKX: return "1000base-kx"; case PHY_INTERFACE_MODE_2500BASEX: return "2500base-x"; case PHY_INTERFACE_MODE_5GBASER: return "5gbase-r"; case PHY_INTERFACE_MODE_RXAUI: return "rxaui"; case PHY_INTERFACE_MODE_XAUI: return "xaui"; case PHY_INTERFACE_MODE_10GBASER: return "10gbase-r"; case PHY_INTERFACE_MODE_25GBASER: return "25gbase-r"; case PHY_INTERFACE_MODE_USXGMII: return "usxgmii"; case PHY_INTERFACE_MODE_10GKR: return "10gbase-kr"; case PHY_INTERFACE_MODE_100BASEX: return "100base-x"; case PHY_INTERFACE_MODE_QUSGMII: return "qusgmii"; case PHY_INTERFACE_MODE_10G_QXGMII: return "10g-qxgmii"; default: return "unknown"; } } /** * rgmii_clock - map link speed to the clock rate * @speed: link speed value * * Description: maps RGMII supported link speeds * into the clock rates. * * Returns: clock rate or negative errno */ static inline long rgmii_clock(int speed) { switch (speed) { case SPEED_10: return 2500000; case SPEED_100: return 25000000; case SPEED_1000: return 125000000; default: return -EINVAL; } } #define PHY_MAX_ADDR 32 /* Used when trying to connect to a specific phy (mii bus id:phy device id) */ #define PHY_ID_FMT "%s:%02x" #define MII_BUS_ID_SIZE 61 struct device; struct kernel_hwtstamp_config; struct phylink; struct sfp_bus; struct sfp_upstream_ops; struct sk_buff; /** * struct mdio_bus_stats - Statistics counters for MDIO busses * @transfers: Total number of transfers, i.e. @writes + @reads * @errors: Number of MDIO transfers that returned an error * @writes: Number of write transfers * @reads: Number of read transfers * @syncp: Synchronisation for incrementing statistics */ struct mdio_bus_stats { u64_stats_t transfers; u64_stats_t errors; u64_stats_t writes; u64_stats_t reads; /* Must be last, add new statistics above */ struct u64_stats_sync syncp; }; /** * struct phy_package_shared - Shared information in PHY packages * @base_addr: Base PHY address of PHY package used to combine PHYs * in one package and for offset calculation of phy_package_read/write * @np: Pointer to the Device Node if PHY package defined in DT * @refcnt: Number of PHYs connected to this shared data * @flags: Initialization of PHY package * @priv_size: Size of the shared private data @priv * @priv: Driver private data shared across a PHY package * * Represents a shared structure between different phydev's in the same * package, for example a quad PHY. See phy_package_join() and * phy_package_leave(). */ struct phy_package_shared { u8 base_addr; /* With PHY package defined in DT this points to the PHY package node */ struct device_node *np; refcount_t refcnt; unsigned long flags; size_t priv_size; /* private data pointer */ /* note that this pointer is shared between different phydevs and * the user has to take care of appropriate locking. It is allocated * and freed automatically by phy_package_join() and * phy_package_leave(). */ void *priv; }; /* used as bit number in atomic bitops */ #define PHY_SHARED_F_INIT_DONE 0 #define PHY_SHARED_F_PROBE_DONE 1 /** * struct mii_bus - Represents an MDIO bus * * @owner: Who owns this device * @name: User friendly name for this MDIO device, or driver name * @id: Unique identifier for this bus, typical from bus hierarchy * @priv: Driver private data * * The Bus class for PHYs. Devices which provide access to * PHYs should register using this structure */ struct mii_bus { struct module *owner; const char *name; char id[MII_BUS_ID_SIZE]; void *priv; /** @read: Perform a read transfer on the bus */ int (*read)(struct mii_bus *bus, int addr, int regnum); /** @write: Perform a write transfer on the bus */ int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val); /** @read_c45: Perform a C45 read transfer on the bus */ int (*read_c45)(struct mii_bus *bus, int addr, int devnum, int regnum); /** @write_c45: Perform a C45 write transfer on the bus */ int (*write_c45)(struct mii_bus *bus, int addr, int devnum, int regnum, u16 val); /** @reset: Perform a reset of the bus */ int (*reset)(struct mii_bus *bus); /** @stats: Statistic counters per device on the bus */ struct mdio_bus_stats stats[PHY_MAX_ADDR]; /** * @mdio_lock: A lock to ensure that only one thing can read/write * the MDIO bus at a time */ struct mutex mdio_lock; /** @parent: Parent device of this bus */ struct device *parent; /** @state: State of bus structure */ enum { MDIOBUS_ALLOCATED = 1, MDIOBUS_REGISTERED, MDIOBUS_UNREGISTERED, MDIOBUS_RELEASED, } state; /** @dev: Kernel device representation */ struct device dev; /** @mdio_map: list of all MDIO devices on bus */ struct mdio_device *mdio_map[PHY_MAX_ADDR]; /** @phy_mask: PHY addresses to be ignored when probing */ u32 phy_mask; /** @phy_ignore_ta_mask: PHY addresses to ignore the TA/read failure */ u32 phy_ignore_ta_mask; /** * @irq: An array of interrupts, each PHY's interrupt at the index * matching its address */ int irq[PHY_MAX_ADDR]; /** @reset_delay_us: GPIO reset pulse width in microseconds */ int reset_delay_us; /** @reset_post_delay_us: GPIO reset deassert delay in microseconds */ int reset_post_delay_us; /** @reset_gpiod: Reset GPIO descriptor pointer */ struct gpio_desc *reset_gpiod; /** @shared_lock: protect access to the shared element */ struct mutex shared_lock; /** @shared: shared state across different PHYs */ struct phy_package_shared *shared[PHY_MAX_ADDR]; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) struct mii_bus *mdiobus_alloc_size(size_t size); /** * mdiobus_alloc - Allocate an MDIO bus structure * * The internal state of the MDIO bus will be set of MDIOBUS_ALLOCATED ready * for the driver to register the bus. */ static inline struct mii_bus *mdiobus_alloc(void) { return mdiobus_alloc_size(0); } int __mdiobus_register(struct mii_bus *bus, struct module *owner); int __devm_mdiobus_register(struct device *dev, struct mii_bus *bus, struct module *owner); #define mdiobus_register(bus) __mdiobus_register(bus, THIS_MODULE) #define devm_mdiobus_register(dev, bus) \ __devm_mdiobus_register(dev, bus, THIS_MODULE) void mdiobus_unregister(struct mii_bus *bus); void mdiobus_free(struct mii_bus *bus); struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv); static inline struct mii_bus *devm_mdiobus_alloc(struct device *dev) { return devm_mdiobus_alloc_size(dev, 0); } struct mii_bus *mdio_find_bus(const char *mdio_name); struct phy_device *mdiobus_scan_c22(struct mii_bus *bus, int addr); #define PHY_INTERRUPT_DISABLED false #define PHY_INTERRUPT_ENABLED true /** * enum phy_state - PHY state machine states: * * @PHY_DOWN: PHY device and driver are not ready for anything. probe * should be called if and only if the PHY is in this state, * given that the PHY device exists. * - PHY driver probe function will set the state to @PHY_READY * * @PHY_READY: PHY is ready to send and receive packets, but the * controller is not. By default, PHYs which do not implement * probe will be set to this state by phy_probe(). * - start will set the state to UP * * @PHY_UP: The PHY and attached device are ready to do work. * Interrupts should be started here. * - timer moves to @PHY_NOLINK or @PHY_RUNNING * * @PHY_NOLINK: PHY is up, but not currently plugged in. * - irq or timer will set @PHY_RUNNING if link comes back * - phy_stop moves to @PHY_HALTED * * @PHY_RUNNING: PHY is currently up, running, and possibly sending * and/or receiving packets * - irq or timer will set @PHY_NOLINK if link goes down * - phy_stop moves to @PHY_HALTED * * @PHY_CABLETEST: PHY is performing a cable test. Packet reception/sending * is not expected to work, carrier will be indicated as down. PHY will be * poll once per second, or on interrupt for it current state. * Once complete, move to UP to restart the PHY. * - phy_stop aborts the running test and moves to @PHY_HALTED * * @PHY_HALTED: PHY is up, but no polling or interrupts are done. * - phy_start moves to @PHY_UP * * @PHY_ERROR: PHY is up, but is in an error state. * - phy_stop moves to @PHY_HALTED */ enum phy_state { PHY_DOWN = 0, PHY_READY, PHY_HALTED, PHY_ERROR, PHY_UP, PHY_RUNNING, PHY_NOLINK, PHY_CABLETEST, }; #define MDIO_MMD_NUM 32 /** * struct phy_c45_device_ids - 802.3-c45 Device Identifiers * @devices_in_package: IEEE 802.3 devices in package register value. * @mmds_present: bit vector of MMDs present. * @device_ids: The device identifer for each present device. */ struct phy_c45_device_ids { u32 devices_in_package; u32 mmds_present; u32 device_ids[MDIO_MMD_NUM]; }; struct macsec_context; struct macsec_ops; /** * struct phy_device - An instance of a PHY * * @mdio: MDIO bus this PHY is on * @drv: Pointer to the driver for this PHY instance * @devlink: Create a link between phy dev and mac dev, if the external phy * used by current mac interface is managed by another mac interface. * @phyindex: Unique id across the phy's parent tree of phys to address the PHY * from userspace, similar to ifindex. A zero index means the PHY * wasn't assigned an id yet. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. * @is_internal: Set to true if this PHY is internal to a MAC. * @is_pseudo_fixed_link: Set to true if this PHY is an Ethernet switch, etc. * @is_gigabit_capable: Set to true if PHY supports 1000Mbps * @has_fixups: Set to true if this PHY has fixups/quirks. * @suspended: Set to true if this PHY has been suspended successfully. * @suspended_by_mdio_bus: Set to true if this PHY was suspended by MDIO bus. * @sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. * @loopback_enabled: Set true if this PHY has been loopbacked successfully. * @downshifted_rate: Set true if link speed has been downshifted. * @is_on_sfp_module: Set true if PHY is located on an SFP module. * @mac_managed_pm: Set true if MAC driver takes of suspending/resuming PHY * @wol_enabled: Set to true if the PHY or the attached MAC have Wake-on-LAN * enabled. * @state: State of the PHY for management purposes * @dev_flags: Device-specific flags used by the PHY driver. * * - Bits [15:0] are free to use by the PHY driver to communicate * driver specific behavior. * - Bits [23:16] are currently reserved for future use. * - Bits [31:24] are reserved for defining generic * PHY driver behavior. * @irq: IRQ number of the PHY's interrupt (-1 if none) * @phylink: Pointer to phylink instance for this PHY * @sfp_bus_attached: Flag indicating whether the SFP bus has been attached * @sfp_bus: SFP bus attached to this PHY's fiber port * @attached_dev: The attached enet driver's device instance ptr * @adjust_link: Callback for the enet controller to respond to changes: in the * link state. * @phy_link_change: Callback for phylink for notification of link change * @macsec_ops: MACsec offloading ops. * * @speed: Current link speed * @duplex: Current duplex * @port: Current port * @pause: Current pause * @asym_pause: Current asymmetric pause * @supported: Combined MAC/PHY supported linkmodes * @advertising: Currently advertised linkmodes * @adv_old: Saved advertised while power saving for WoL * @supported_eee: supported PHY EEE linkmodes * @advertising_eee: Currently advertised EEE linkmodes * @enable_tx_lpi: When True, MAC should transmit LPI to PHY * @eee_active: phylib private state, indicating that EEE has been negotiated * @eee_cfg: User configuration of EEE * @lp_advertising: Current link partner advertised linkmodes * @host_interfaces: PHY interface modes supported by host * @eee_disabled_modes: Energy efficient ethernet modes not to be advertised * @autoneg: Flag autoneg being used * @rate_matching: Current rate matching mode * @link: Current link state * @autoneg_complete: Flag auto negotiation of the link has completed * @mdix: Current crossover * @mdix_ctrl: User setting of crossover * @pma_extable: Cached value of PMA/PMD Extended Abilities Register * @interrupts: Flag interrupts have been enabled * @irq_suspended: Flag indicating PHY is suspended and therefore interrupt * handling shall be postponed until PHY has resumed * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume * @default_timestamp: Flag indicating whether we are using the phy * timestamp as the default one * @interface: enum phy_interface_t value * @possible_interfaces: bitmap if interface modes that the attached PHY * will switch between depending on media speed. * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics * @ehdr: nNtlink header for cable diagnostics * @phy_led_triggers: Array of LED triggers * @phy_num_led_triggers: Number of triggers in @phy_led_triggers * @led_link_trigger: LED trigger for link up/down * @last_triggered: last LED trigger for link speed * @leds: list of PHY LED structures * @master_slave_set: User requested master/slave configuration * @master_slave_get: Current master/slave advertisement * @master_slave_state: Current master/slave configuration * @mii_ts: Pointer to time stamper callbacks * @psec: Pointer to Power Sourcing Equipment control struct * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine * @link_down_events: Number of times link was lost * @shared: Pointer to private data shared by phys in one package * @priv: Pointer to driver private data * * interrupts currently only supports enabled or disabled, * but could be changed in the future to support enabling * and disabling specific interrupts * * Contains some infrastructure for polling and interrupt * handling, as well as handling shifts in PHY hardware state */ struct phy_device { struct mdio_device mdio; /* Information about the PHY type */ /* And management functions */ const struct phy_driver *drv; struct device_link *devlink; u32 phyindex; u32 phy_id; struct phy_c45_device_ids c45_ids; unsigned is_c45:1; unsigned is_internal:1; unsigned is_pseudo_fixed_link:1; unsigned is_gigabit_capable:1; unsigned has_fixups:1; unsigned suspended:1; unsigned suspended_by_mdio_bus:1; unsigned sysfs_links:1; unsigned loopback_enabled:1; unsigned downshifted_rate:1; unsigned is_on_sfp_module:1; unsigned mac_managed_pm:1; unsigned wol_enabled:1; unsigned autoneg:1; /* The most recently read link state */ unsigned link:1; unsigned autoneg_complete:1; /* Interrupts are enabled */ unsigned interrupts:1; unsigned irq_suspended:1; unsigned irq_rerun:1; unsigned default_timestamp:1; int rate_matching; enum phy_state state; u32 dev_flags; phy_interface_t interface; DECLARE_PHY_INTERFACE_MASK(possible_interfaces); /* * forced speed & duplex (no autoneg) * partner speed & duplex & pause (autoneg) */ int speed; int duplex; int port; int pause; int asym_pause; u8 master_slave_get; u8 master_slave_set; u8 master_slave_state; /* Union of PHY and Attached devices' supported link modes */ /* See ethtool.h for more info */ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising); /* used with phy_speed_down */ __ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old); /* used for eee validation and configuration*/ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported_eee); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising_eee); /* Energy efficient ethernet modes which should be prohibited */ __ETHTOOL_DECLARE_LINK_MODE_MASK(eee_disabled_modes); bool enable_tx_lpi; bool eee_active; struct eee_config eee_cfg; /* Host supported PHY interface types. Should be ignored if empty. */ DECLARE_PHY_INTERFACE_MASK(host_interfaces); #ifdef CONFIG_LED_TRIGGER_PHY struct phy_led_trigger *phy_led_triggers; unsigned int phy_num_led_triggers; struct phy_led_trigger *last_triggered; struct phy_led_trigger *led_link_trigger; #endif struct list_head leds; /* * Interrupt number for this PHY * -1 means no interrupt */ int irq; /* private data pointer */ /* For use by PHYs to maintain extra state */ void *priv; /* shared data pointer */ /* For use by PHYs inside the same package that need a shared state. */ struct phy_package_shared *shared; /* Reporting cable test results */ struct sk_buff *skb; void *ehdr; struct nlattr *nest; /* Interrupt and Polling infrastructure */ struct delayed_work state_queue; struct mutex lock; /* This may be modified under the rtnl lock */ bool sfp_bus_attached; struct sfp_bus *sfp_bus; struct phylink *phylink; struct net_device *attached_dev; struct mii_timestamper *mii_ts; struct pse_control *psec; u8 mdix; u8 mdix_ctrl; int pma_extable; unsigned int link_down_events; void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); #if IS_ENABLED(CONFIG_MACSEC) /* MACsec management functions */ const struct macsec_ops *macsec_ops; #endif }; /* Generic phy_device::dev_flags */ #define PHY_F_NO_IRQ 0x80000000 #define PHY_F_RXC_ALWAYS_ON 0x40000000 static inline struct phy_device *to_phy_device(const struct device *dev) { return container_of(to_mdio_device(dev), struct phy_device, mdio); } /** * struct phy_tdr_config - Configuration of a TDR raw test * * @first: Distance for first data collection point * @last: Distance for last data collection point * @step: Step between data collection points * @pair: Bitmap of cable pairs to collect data for * * A structure containing possible configuration parameters * for a TDR cable test. The driver does not need to implement * all the parameters, but should report what is actually used. * All distances are in centimeters. */ struct phy_tdr_config { u32 first; u32 last; u32 step; s8 pair; }; #define PHY_PAIR_ALL -1 /** * enum link_inband_signalling - in-band signalling modes that are supported * * @LINK_INBAND_DISABLE: in-band signalling can be disabled * @LINK_INBAND_ENABLE: in-band signalling can be enabled without bypass * @LINK_INBAND_BYPASS: in-band signalling can be enabled with bypass * * The possible and required bits can only be used if the valid bit is set. * If possible is clear, that means inband signalling can not be used. * Required is only valid when possible is set, and means that inband * signalling must be used. */ enum link_inband_signalling { LINK_INBAND_DISABLE = BIT(0), LINK_INBAND_ENABLE = BIT(1), LINK_INBAND_BYPASS = BIT(2), }; /** * struct phy_plca_cfg - Configuration of the PLCA (Physical Layer Collision * Avoidance) Reconciliation Sublayer. * * @version: read-only PLCA register map version. -1 = not available. Ignored * when setting the configuration. Format is the same as reported by the PLCA * IDVER register (31.CA00). -1 = not available. * @enabled: PLCA configured mode (enabled/disabled). -1 = not available / don't * set. 0 = disabled, anything else = enabled. * @node_id: the PLCA local node identifier. -1 = not available / don't set. * Allowed values [0 .. 254]. 255 = node disabled. * @node_cnt: the PLCA node count (maximum number of nodes having a TO). Only * meaningful for the coordinator (node_id = 0). -1 = not available / don't * set. Allowed values [1 .. 255]. * @to_tmr: The value of the PLCA to_timer in bit-times, which determines the * PLCA transmit opportunity window opening. See IEEE802.3 Clause 148 for * more details. The to_timer shall be set equal over all nodes. * -1 = not available / don't set. Allowed values [0 .. 255]. * @burst_cnt: controls how many additional frames a node is allowed to send in * single transmit opportunity (TO). The default value of 0 means that the * node is allowed exactly one frame per TO. A value of 1 allows two frames * per TO, and so on. -1 = not available / don't set. * Allowed values [0 .. 255]. * @burst_tmr: controls how many bit times to wait for the MAC to send a new * frame before interrupting the burst. This value should be set to a value * greater than the MAC inter-packet gap (which is typically 96 bits). * -1 = not available / don't set. Allowed values [0 .. 255]. * * A structure containing configuration parameters for setting/getting the PLCA * RS configuration. The driver does not need to implement all the parameters, * but should report what is actually used. */ struct phy_plca_cfg { int version; int enabled; int node_id; int node_cnt; int to_tmr; int burst_cnt; int burst_tmr; }; /** * struct phy_plca_status - Status of the PLCA (Physical Layer Collision * Avoidance) Reconciliation Sublayer. * * @pst: The PLCA status as reported by the PST bit in the PLCA STATUS * register(31.CA03), indicating BEACON activity. * * A structure containing status information of the PLCA RS configuration. * The driver does not need to implement all the parameters, but should report * what is actually used. */ struct phy_plca_status { bool pst; }; /* Modes for PHY LED configuration */ enum phy_led_modes { PHY_LED_ACTIVE_HIGH = 0, PHY_LED_ACTIVE_LOW = 1, PHY_LED_INACTIVE_HIGH_IMPEDANCE = 2, /* keep it last */ __PHY_LED_MODES_NUM, }; /** * struct phy_led: An LED driven by the PHY * * @list: List of LEDs * @phydev: PHY this LED is attached to * @led_cdev: Standard LED class structure * @index: Number of the LED */ struct phy_led { struct list_head list; struct phy_device *phydev; struct led_classdev led_cdev; u8 index; }; #define to_phy_led(d) container_of(d, struct phy_led, led_cdev) /** * struct phy_driver - Driver structure for a particular PHY type * * @mdiodrv: Data common to all MDIO devices * @phy_id: The result of reading the UID registers of this PHY * type, and ANDing them with the phy_id_mask. This driver * only works for PHYs with IDs which match this field * @name: The friendly name of this PHY type * @phy_id_mask: Defines the important bits of the phy_id * @features: A mandatory list of features (speed, duplex, etc) * supported by this PHY * @flags: A bitfield defining certain other features this PHY * supports (like interrupts) * @driver_data: Static driver data * * All functions are optional. If config_aneg or read_status * are not implemented, the phy core uses the genphy versions. * Note that none of these functions should be called from * interrupt time. The goal is for the bus read/write functions * to be able to block when the bus transaction is happening, * and be freed up by an interrupt (The MPC85xx has this ability, * though it is not currently supported in the driver). */ struct phy_driver { struct mdio_driver_common mdiodrv; u32 phy_id; char *name; u32 phy_id_mask; const unsigned long * const features; u32 flags; const void *driver_data; /** * @soft_reset: Called to issue a PHY software reset */ int (*soft_reset)(struct phy_device *phydev); /** * @config_init: Called to initialize the PHY, * including after a reset */ int (*config_init)(struct phy_device *phydev); /** * @probe: Called during discovery. Used to set * up device-specific structures, if any */ int (*probe)(struct phy_device *phydev); /** * @get_features: Probe the hardware to determine what * abilities it has. Should only set phydev->supported. */ int (*get_features)(struct phy_device *phydev); /** * @inband_caps: query whether in-band is supported for the given PHY * interface mode. Returns a bitmask of bits defined by enum * link_inband_signalling. */ unsigned int (*inband_caps)(struct phy_device *phydev, phy_interface_t interface); /** * @config_inband: configure in-band mode for the PHY */ int (*config_inband)(struct phy_device *phydev, unsigned int modes); /** * @get_rate_matching: Get the supported type of rate matching for a * particular phy interface. This is used by phy consumers to determine * whether to advertise lower-speed modes for that interface. It is * assumed that if a rate matching mode is supported on an interface, * then that interface's rate can be adapted to all slower link speeds * supported by the phy. If the interface is not supported, this should * return %RATE_MATCH_NONE. */ int (*get_rate_matching)(struct phy_device *phydev, phy_interface_t iface); /* PHY Power Management */ /** @suspend: Suspend the hardware, saving state if needed */ int (*suspend)(struct phy_device *phydev); /** @resume: Resume the hardware, restoring state if needed */ int (*resume)(struct phy_device *phydev); /** * @config_aneg: Configures the advertisement and resets * autonegotiation if phydev->autoneg is on, * forces the speed to the current settings in phydev * if phydev->autoneg is off */ int (*config_aneg)(struct phy_device *phydev); /** @aneg_done: Determines the auto negotiation result */ int (*aneg_done)(struct phy_device *phydev); /** @read_status: Determines the negotiated speed and duplex */ int (*read_status)(struct phy_device *phydev); /** * @config_intr: Enables or disables interrupts. * It should also clear any pending interrupts prior to enabling the * IRQs and after disabling them. */ int (*config_intr)(struct phy_device *phydev); /** @handle_interrupt: Override default interrupt handling */ irqreturn_t (*handle_interrupt)(struct phy_device *phydev); /** @remove: Clears up any memory if needed */ void (*remove)(struct phy_device *phydev); /** * @match_phy_device: Returns true if this is a suitable * driver for the given phydev. If NULL, matching is based on * phy_id and phy_id_mask. */ int (*match_phy_device)(struct phy_device *phydev); /** * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY * register changes to enable Wake on LAN, so set_wol is * provided to be called in the ethernet driver's set_wol * function. */ int (*set_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); /** * @get_wol: See set_wol, but for checking whether Wake on LAN * is enabled. */ void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); /** * @link_change_notify: Called to inform a PHY device driver * when the core is about to change the link state. This * callback is supposed to be used as fixup hook for drivers * that need to take action when the link state * changes. Drivers are by no means allowed to mess with the * PHY device structure in their implementations. */ void (*link_change_notify)(struct phy_device *dev); /** * @read_mmd: PHY specific driver override for reading a MMD * register. This function is optional for PHY specific * drivers. When not provided, the default MMD read function * will be used by phy_read_mmd(), which will use either a * direct read for Clause 45 PHYs or an indirect read for * Clause 22 PHYs. devnum is the MMD device number within the * PHY device, regnum is the register within the selected MMD * device. */ int (*read_mmd)(struct phy_device *dev, int devnum, u16 regnum); /** * @write_mmd: PHY specific driver override for writing a MMD * register. This function is optional for PHY specific * drivers. When not provided, the default MMD write function * will be used by phy_write_mmd(), which will use either a * direct write for Clause 45 PHYs, or an indirect write for * Clause 22 PHYs. devnum is the MMD device number within the * PHY device, regnum is the register within the selected MMD * device. val is the value to be written. */ int (*write_mmd)(struct phy_device *dev, int devnum, u16 regnum, u16 val); /** @read_page: Return the current PHY register page number */ int (*read_page)(struct phy_device *dev); /** @write_page: Set the current PHY register page number */ int (*write_page)(struct phy_device *dev, int page); /** * @module_info: Get the size and type of the eeprom contained * within a plug-in module */ int (*module_info)(struct phy_device *dev, struct ethtool_modinfo *modinfo); /** * @module_eeprom: Get the eeprom information from the plug-in * module */ int (*module_eeprom)(struct phy_device *dev, struct ethtool_eeprom *ee, u8 *data); /** @cable_test_start: Start a cable test */ int (*cable_test_start)(struct phy_device *dev); /** @cable_test_tdr_start: Start a raw TDR cable test */ int (*cable_test_tdr_start)(struct phy_device *dev, const struct phy_tdr_config *config); /** * @cable_test_get_status: Once per second, or on interrupt, * request the status of the test. */ int (*cable_test_get_status)(struct phy_device *dev, bool *finished); /* Get statistics from the PHY using ethtool */ /** * @get_phy_stats: Retrieve PHY statistics. * @dev: The PHY device for which the statistics are retrieved. * @eth_stats: structure where Ethernet PHY stats will be stored. * @stats: structure where additional PHY-specific stats will be stored. * * Retrieves the supported PHY statistics and populates the provided * structures. The input structures are pre-initialized with * `ETHTOOL_STAT_NOT_SET`, and the driver must only modify members * corresponding to supported statistics. Unmodified members will remain * set to `ETHTOOL_STAT_NOT_SET` and will not be returned to userspace. */ void (*get_phy_stats)(struct phy_device *dev, struct ethtool_eth_phy_stats *eth_stats, struct ethtool_phy_stats *stats); /** * @get_link_stats: Retrieve link statistics. * @dev: The PHY device for which the statistics are retrieved. * @link_stats: structure where link-specific stats will be stored. * * Retrieves link-related statistics for the given PHY device. The input * structure is pre-initialized with `ETHTOOL_STAT_NOT_SET`, and the * driver must only modify members corresponding to supported * statistics. Unmodified members will remain set to * `ETHTOOL_STAT_NOT_SET` and will not be returned to userspace. */ void (*get_link_stats)(struct phy_device *dev, struct ethtool_link_ext_stats *link_stats); /** * @update_stats: Trigger periodic statistics updates. * @dev: The PHY device for which statistics updates are triggered. * * Periodically gathers statistics from the PHY device to update locally * maintained 64-bit counters. This is necessary for PHYs that implement * reduced-width counters (e.g., 16-bit or 32-bit) which can overflow * more frequently compared to 64-bit counters. By invoking this * callback, drivers can fetch the current counter values, handle * overflow detection, and accumulate the results into local 64-bit * counters for accurate reporting through the `get_phy_stats` and * `get_link_stats` interfaces. * * Return: 0 on success or a negative error code on failure. */ int (*update_stats)(struct phy_device *dev); /** @get_sset_count: Number of statistic counters */ int (*get_sset_count)(struct phy_device *dev); /** @get_strings: Names of the statistic counters */ void (*get_strings)(struct phy_device *dev, u8 *data); /** @get_stats: Return the statistic counter values */ void (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data); /* Get and Set PHY tunables */ /** @get_tunable: Return the value of a tunable */ int (*get_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, void *data); /** @set_tunable: Set the value of a tunable */ int (*set_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, const void *data); /** @set_loopback: Set the loopback mood of the PHY */ int (*set_loopback)(struct phy_device *dev, bool enable); /** @get_sqi: Get the signal quality indication */ int (*get_sqi)(struct phy_device *dev); /** @get_sqi_max: Get the maximum signal quality indication */ int (*get_sqi_max)(struct phy_device *dev); /* PLCA RS interface */ /** @get_plca_cfg: Return the current PLCA configuration */ int (*get_plca_cfg)(struct phy_device *dev, struct phy_plca_cfg *plca_cfg); /** @set_plca_cfg: Set the PLCA configuration */ int (*set_plca_cfg)(struct phy_device *dev, const struct phy_plca_cfg *plca_cfg); /** @get_plca_status: Return the current PLCA status info */ int (*get_plca_status)(struct phy_device *dev, struct phy_plca_status *plca_st); /** * @led_brightness_set: Set a PHY LED brightness. Index * indicates which of the PHYs led should be set. Value * follows the standard LED class meaning, e.g. LED_OFF, * LED_HALF, LED_FULL. */ int (*led_brightness_set)(struct phy_device *dev, u8 index, enum led_brightness value); /** * @led_blink_set: Set a PHY LED blinking. Index indicates * which of the PHYs led should be configured to blink. Delays * are in milliseconds and if both are zero then a sensible * default should be chosen. The call should adjust the * timings in that case and if it can't match the values * specified exactly. */ int (*led_blink_set)(struct phy_device *dev, u8 index, unsigned long *delay_on, unsigned long *delay_off); /** * @led_hw_is_supported: Can the HW support the given rules. * @dev: PHY device which has the LED * @index: Which LED of the PHY device * @rules The core is interested in these rules * * Return 0 if yes, -EOPNOTSUPP if not, or an error code. */ int (*led_hw_is_supported)(struct phy_device *dev, u8 index, unsigned long rules); /** * @led_hw_control_set: Set the HW to control the LED * @dev: PHY device which has the LED * @index: Which LED of the PHY device * @rules The rules used to control the LED * * Returns 0, or a an error code. */ int (*led_hw_control_set)(struct phy_device *dev, u8 index, unsigned long rules); /** * @led_hw_control_get: Get how the HW is controlling the LED * @dev: PHY device which has the LED * @index: Which LED of the PHY device * @rules Pointer to the rules used to control the LED * * Set *@rules to how the HW is currently blinking. Returns 0 * on success, or a error code if the current blinking cannot * be represented in rules, or some other error happens. */ int (*led_hw_control_get)(struct phy_device *dev, u8 index, unsigned long *rules); /** * @led_polarity_set: Set the LED polarity modes * @dev: PHY device which has the LED * @index: Which LED of the PHY device * @modes: bitmap of LED polarity modes * * Configure LED with all the required polarity modes in @modes * to make it correctly turn ON or OFF. * * Returns 0, or an error code. */ int (*led_polarity_set)(struct phy_device *dev, int index, unsigned long modes); /** * @get_next_update_time: Get the time until the next update event * @dev: PHY device * * Callback to determine the time (in jiffies) until the next * update event for the PHY state machine. Allows PHY drivers to * dynamically adjust polling intervals based on link state or other * conditions. * * Returns the time in jiffies until the next update event. */ unsigned int (*get_next_update_time)(struct phy_device *dev); }; #define to_phy_driver(d) container_of_const(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) #define PHY_ID_MATCH_EXACT(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 0) #define PHY_ID_MATCH_MODEL(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 4) #define PHY_ID_MATCH_VENDOR(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 10) /** * phy_id_compare - compare @id1 with @id2 taking account of @mask * @id1: first PHY ID * @id2: second PHY ID * @mask: the PHY ID mask, set bits are significant in matching * * Return true if the bits from @id1 and @id2 specified by @mask match. * This uses an equivalent test to (@id & @mask) == (@phy_id & @mask). */ static inline bool phy_id_compare(u32 id1, u32 id2, u32 mask) { return !((id1 ^ id2) & mask); } /** * phydev_id_compare - compare @id with the PHY's Clause 22 ID * @phydev: the PHY device * @id: the PHY ID to be matched * * Compare the @phydev clause 22 ID with the provided @id and return true or * false depending whether it matches, using the bound driver mask. The * @phydev must be bound to a driver. */ static inline bool phydev_id_compare(struct phy_device *phydev, u32 id) { return phy_id_compare(id, phydev->phy_id, phydev->drv->phy_id_mask); } const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); const char *phy_rate_matching_to_str(int rate_matching); int phy_interface_num_ports(phy_interface_t interface); /* A structure for mapping a particular speed and duplex * combination to a particular SUPPORTED and ADVERTISED value */ struct phy_setting { u32 speed; u8 duplex; u8 bit; }; const struct phy_setting * phy_lookup_setting(int speed, int duplex, const unsigned long *mask, bool exact); size_t phy_speeds(unsigned int *speeds, size_t size, unsigned long *mask); /** * phy_is_started - Convenience function to check whether PHY is started * @phydev: The phy_device struct */ static inline bool phy_is_started(struct phy_device *phydev) { return phydev->state >= PHY_UP; } /** * phy_disable_eee_mode - Don't advertise an EEE mode. * @phydev: The phy_device struct * @link_mode: The EEE mode to be disabled */ static inline void phy_disable_eee_mode(struct phy_device *phydev, u32 link_mode) { WARN_ON(phy_is_started(phydev)); linkmode_set_bit(link_mode, phydev->eee_disabled_modes); linkmode_clear_bit(link_mode, phydev->advertising_eee); } void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); /** * phy_read - Convenience function for reading a given PHY register * @phydev: the phy_device struct * @regnum: register number to read * * NOTE: MUST NOT be called from interrupt context, * because the bus read/write functions may wait for an interrupt * to conclude the operation. */ static inline int phy_read(struct phy_device *phydev, u32 regnum) { return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, regnum); } #define phy_read_poll_timeout(phydev, regnum, val, cond, sleep_us, \ timeout_us, sleep_before_read) \ ({ \ int __ret, __val; \ __ret = read_poll_timeout(__val = phy_read, val, \ __val < 0 || (cond), \ sleep_us, timeout_us, sleep_before_read, phydev, regnum); \ if (__val < 0) \ __ret = __val; \ if (__ret) \ phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \ __ret; \ }) /** * __phy_read - convenience function for reading a given PHY register * @phydev: the phy_device struct * @regnum: register number to read * * The caller must have taken the MDIO bus lock. */ static inline int __phy_read(struct phy_device *phydev, u32 regnum) { return __mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, regnum); } /** * phy_write - Convenience function for writing a given PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: value to write to @regnum * * NOTE: MUST NOT be called from interrupt context, * because the bus read/write functions may wait for an interrupt * to conclude the operation. */ static inline int phy_write(struct phy_device *phydev, u32 regnum, u16 val) { return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); } /** * __phy_write - Convenience function for writing a given PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: value to write to @regnum * * The caller must have taken the MDIO bus lock. */ static inline int __phy_write(struct phy_device *phydev, u32 regnum, u16 val) { return __mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); } /** * __phy_modify_changed() - Convenience function for modifying a PHY register * @phydev: a pointer to a &struct phy_device * @regnum: register number * @mask: bit mask of bits to clear * @set: bit mask of bits to set * * Unlocked helper function which allows a PHY register to be modified as * new register value = (old register value & ~mask) | set * * Returns negative errno, 0 if there was no change, and 1 in case of change */ static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set) { return __mdiobus_modify_changed(phydev->mdio.bus, phydev->mdio.addr, regnum, mask, set); } /* * phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); /** * phy_read_mmd_poll_timeout - Periodically poll a PHY register until a * condition is met or a timeout occurs * * @phydev: The phy_device struct * @devaddr: The MMD to read from * @regnum: The register on the MMD to read * @val: Variable to read the register into * @cond: Break condition (usually involving @val) * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please * read usleep_range() function description for details and * limitations. * @timeout_us: Timeout in us, 0 means never timeout * @sleep_before_read: if it is true, sleep @sleep_us before read. * * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @args is stored in @val. Must not * be called from atomic context if sleep_us or timeout_us are used. */ #define phy_read_mmd_poll_timeout(phydev, devaddr, regnum, val, cond, \ sleep_us, timeout_us, sleep_before_read) \ ({ \ int __ret, __val; \ __ret = read_poll_timeout(__val = phy_read_mmd, val, \ __val < 0 || (cond), \ sleep_us, timeout_us, sleep_before_read, \ phydev, devaddr, regnum); \ if (__val < 0) \ __ret = __val; \ if (__ret) \ phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \ __ret; \ }) /* * __phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. */ int __phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); /* * phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. */ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); /* * __phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. */ int __phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); int __phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int __phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int __phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); int phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); int __phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); int phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); /** * __phy_set_bits - Convenience function for setting bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to set * * The caller must have taken the MDIO bus lock. */ static inline int __phy_set_bits(struct phy_device *phydev, u32 regnum, u16 val) { return __phy_modify(phydev, regnum, 0, val); } /** * __phy_clear_bits - Convenience function for clearing bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to clear * * The caller must have taken the MDIO bus lock. */ static inline int __phy_clear_bits(struct phy_device *phydev, u32 regnum, u16 val) { return __phy_modify(phydev, regnum, val, 0); } /** * phy_set_bits - Convenience function for setting bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to set */ static inline int phy_set_bits(struct phy_device *phydev, u32 regnum, u16 val) { return phy_modify(phydev, regnum, 0, val); } /** * phy_clear_bits - Convenience function for clearing bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to clear */ static inline int phy_clear_bits(struct phy_device *phydev, u32 regnum, u16 val) { return phy_modify(phydev, regnum, val, 0); } /** * __phy_set_bits_mmd - Convenience function for setting bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to set * * The caller must have taken the MDIO bus lock. */ static inline int __phy_set_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return __phy_modify_mmd(phydev, devad, regnum, 0, val); } /** * __phy_clear_bits_mmd - Convenience function for clearing bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to clear * * The caller must have taken the MDIO bus lock. */ static inline int __phy_clear_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return __phy_modify_mmd(phydev, devad, regnum, val, 0); } /** * phy_set_bits_mmd - Convenience function for setting bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to set */ static inline int phy_set_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return phy_modify_mmd(phydev, devad, regnum, 0, val); } /** * phy_clear_bits_mmd - Convenience function for clearing bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to clear */ static inline int phy_clear_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return phy_modify_mmd(phydev, devad, regnum, val, 0); } /** * phy_interrupt_is_valid - Convenience function for testing a given PHY irq * @phydev: the phy_device struct * * NOTE: must be kept in sync with addition/removal of PHY_POLL and * PHY_MAC_INTERRUPT */ static inline bool phy_interrupt_is_valid(struct phy_device *phydev) { return phydev->irq != PHY_POLL && phydev->irq != PHY_MAC_INTERRUPT; } /** * phy_polling_mode - Convenience function for testing whether polling is * used to detect PHY status changes * @phydev: the phy_device struct */ static inline bool phy_polling_mode(struct phy_device *phydev) { if (phydev->state == PHY_CABLETEST) if (phydev->drv->flags & PHY_POLL_CABLE_TEST) return true; if (phydev->drv->update_stats) return true; return phydev->irq == PHY_POLL; } /** * phy_has_hwtstamp - Tests whether a PHY time stamp configuration. * @phydev: the phy_device struct */ static inline bool phy_has_hwtstamp(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->hwtstamp; } /** * phy_has_rxtstamp - Tests whether a PHY supports receive time stamping. * @phydev: the phy_device struct */ static inline bool phy_has_rxtstamp(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->rxtstamp; } /** * phy_has_tsinfo - Tests whether a PHY reports time stamping and/or * PTP hardware clock capabilities. * @phydev: the phy_device struct */ static inline bool phy_has_tsinfo(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->ts_info; } /** * phy_has_txtstamp - Tests whether a PHY supports transmit time stamping. * @phydev: the phy_device struct */ static inline bool phy_has_txtstamp(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->txtstamp; } static inline int phy_hwtstamp(struct phy_device *phydev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { return phydev->mii_ts->hwtstamp(phydev->mii_ts, cfg, extack); } static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb, int type) { return phydev->mii_ts->rxtstamp(phydev->mii_ts, skb, type); } static inline int phy_ts_info(struct phy_device *phydev, struct kernel_ethtool_ts_info *tsinfo) { return phydev->mii_ts->ts_info(phydev->mii_ts, tsinfo); } static inline void phy_txtstamp(struct phy_device *phydev, struct sk_buff *skb, int type) { phydev->mii_ts->txtstamp(phydev->mii_ts, skb, type); } /** * phy_is_default_hwtstamp - Is the PHY hwtstamp the default timestamp * @phydev: Pointer to phy_device * * This is used to get default timestamping device taking into account * the new API choice, which is selecting the timestamping from MAC by * default if the phydev does not have default_timestamp flag enabled. * * Return: True if phy is the default hw timestamp, false otherwise. */ static inline bool phy_is_default_hwtstamp(struct phy_device *phydev) { return phy_has_hwtstamp(phydev) && phydev->default_timestamp; } /** * phy_on_sfp - Convenience function for testing if a PHY is on an SFP module * @phydev: the phy_device struct */ static inline bool phy_on_sfp(struct phy_device *phydev) { return phydev->is_on_sfp_module; } /** * phy_interface_mode_is_rgmii - Convenience function for testing if a * PHY interface mode is RGMII (all variants) * @mode: the &phy_interface_t enum */ static inline bool phy_interface_mode_is_rgmii(phy_interface_t mode) { return mode >= PHY_INTERFACE_MODE_RGMII && mode <= PHY_INTERFACE_MODE_RGMII_TXID; }; /** * phy_interface_mode_is_8023z() - does the PHY interface mode use 802.3z * negotiation * @mode: one of &enum phy_interface_t * * Returns true if the PHY interface mode uses the 16-bit negotiation * word as defined in 802.3z. (See 802.3-2015 37.2.1 Config_Reg encoding) */ static inline bool phy_interface_mode_is_8023z(phy_interface_t mode) { return mode == PHY_INTERFACE_MODE_1000BASEX || mode == PHY_INTERFACE_MODE_2500BASEX; } /** * phy_interface_is_rgmii - Convenience function for testing if a PHY interface * is RGMII (all variants) * @phydev: the phy_device struct */ static inline bool phy_interface_is_rgmii(struct phy_device *phydev) { return phy_interface_mode_is_rgmii(phydev->interface); }; /** * phy_is_pseudo_fixed_link - Convenience function for testing if this * PHY is the CPU port facing side of an Ethernet switch, or similar. * @phydev: the phy_device struct */ static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev) { return phydev->is_pseudo_fixed_link; } int phy_save_page(struct phy_device *phydev); int phy_select_page(struct phy_device *phydev, int page); int phy_restore_page(struct phy_device *phydev, int oldpage, int ret); int phy_read_paged(struct phy_device *phydev, int page, u32 regnum); int phy_write_paged(struct phy_device *phydev, int page, u32 regnum, u16 val); int phy_modify_paged_changed(struct phy_device *phydev, int page, u32 regnum, u16 mask, u16 set); int phy_modify_paged(struct phy_device *phydev, int page, u32 regnum, u16 mask, u16 set); struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); #if IS_ENABLED(CONFIG_PHYLIB) int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id); struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode); struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode); struct phy_device *device_phy_find_device(struct device *dev); struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); void phy_device_free(struct phy_device *phydev); #else static inline int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id) { return 0; } static inline struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode) { return 0; } static inline struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) { return NULL; } static inline struct phy_device *device_phy_find_device(struct device *dev) { return NULL; } static inline struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode) { return NULL; } static inline struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45) { return NULL; } static inline int phy_device_register(struct phy_device *phy) { return 0; } static inline void phy_device_free(struct phy_device *phydev) { } #endif /* CONFIG_PHYLIB */ void phy_device_remove(struct phy_device *phydev); int phy_get_c45_ids(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); int phy_sfp_connect_phy(void *upstream, struct phy_device *phy); void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy); void phy_sfp_attach(void *upstream, struct sfp_bus *bus); void phy_sfp_detach(void *upstream, struct sfp_bus *bus); int phy_sfp_probe(struct phy_device *phydev, const struct sfp_upstream_ops *ops); struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, phy_interface_t interface); struct phy_device *phy_find_first(struct mii_bus *bus); int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface); int phy_connect_direct(struct net_device *dev, struct phy_device *phydev, void (*handler)(struct net_device *), phy_interface_t interface); struct phy_device *phy_connect(struct net_device *dev, const char *bus_id, void (*handler)(struct net_device *), phy_interface_t interface); void phy_disconnect(struct phy_device *phydev); void phy_detach(struct phy_device *phydev); void phy_start(struct phy_device *phydev); void phy_stop(struct phy_device *phydev); int phy_config_aneg(struct phy_device *phydev); int _phy_start_aneg(struct phy_device *phydev); int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); unsigned int phy_inband_caps(struct phy_device *phydev, phy_interface_t interface); int phy_config_inband(struct phy_device *phydev, unsigned int modes); int phy_speed_down(struct phy_device *phydev, bool sync); int phy_speed_up(struct phy_device *phydev); bool phy_check_valid(int speed, int duplex, unsigned long *features); int phy_restart_aneg(struct phy_device *phydev); int phy_reset_after_clk_enable(struct phy_device *phydev); #if IS_ENABLED(CONFIG_PHYLIB) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); int phy_start_cable_test_tdr(struct phy_device *phydev, struct netlink_ext_ack *extack, const struct phy_tdr_config *config); #else static inline int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; } static inline int phy_start_cable_test_tdr(struct phy_device *phydev, struct netlink_ext_ack *extack, const struct phy_tdr_config *config) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; } #endif static inline void phy_device_reset(struct phy_device *phydev, int value) { mdio_device_reset(&phydev->mdio, value); } #define phydev_err(_phydev, format, args...) \ dev_err(&_phydev->mdio.dev, format, ##args) #define phydev_err_probe(_phydev, err, format, args...) \ dev_err_probe(&_phydev->mdio.dev, err, format, ##args) #define phydev_info(_phydev, format, args...) \ dev_info(&_phydev->mdio.dev, format, ##args) #define phydev_warn(_phydev, format, args...) \ dev_warn(&_phydev->mdio.dev, format, ##args) #define phydev_dbg(_phydev, format, args...) \ dev_dbg(&_phydev->mdio.dev, format, ##args) static inline const char *phydev_name(const struct phy_device *phydev) { return dev_name(&phydev->mdio.dev); } static inline void phy_lock_mdio_bus(struct phy_device *phydev) { mutex_lock(&phydev->mdio.bus->mdio_lock); } static inline void phy_unlock_mdio_bus(struct phy_device *phydev) { mutex_unlock(&phydev->mdio.bus->mdio_lock); } void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) __printf(2, 3); char *phy_attached_info_irq(struct phy_device *phydev) __malloc; void phy_attached_info(struct phy_device *phydev); /* Clause 22 PHY */ int genphy_read_abilities(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); int genphy_check_and_restart_aneg(struct phy_device *phydev, bool restart); int __genphy_config_aneg(struct phy_device *phydev, bool changed); int genphy_aneg_done(struct phy_device *phydev); int genphy_update_link(struct phy_device *phydev); int genphy_read_lpa(struct phy_device *phydev); int genphy_read_status_fixed(struct phy_device *phydev); int genphy_read_status(struct phy_device *phydev); int genphy_read_master_slave(struct phy_device *phydev); int genphy_suspend(struct phy_device *phydev); int genphy_resume(struct phy_device *phydev); int genphy_loopback(struct phy_device *phydev, bool enable); int genphy_soft_reset(struct phy_device *phydev); irqreturn_t genphy_handle_interrupt_no_ack(struct phy_device *phydev); static inline int genphy_config_aneg(struct phy_device *phydev) { return __genphy_config_aneg(phydev, false); } static inline int genphy_no_config_intr(struct phy_device *phydev) { return 0; } int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, u16 regnum); int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, u16 regnum, u16 val); /* Clause 37 */ int genphy_c37_config_aneg(struct phy_device *phydev); int genphy_c37_read_status(struct phy_device *phydev, bool *changed); /* Clause 45 PHY */ int genphy_c45_restart_aneg(struct phy_device *phydev); int genphy_c45_check_and_restart_aneg(struct phy_device *phydev, bool restart); int genphy_c45_aneg_done(struct phy_device *phydev); int genphy_c45_read_link(struct phy_device *phydev); int genphy_c45_read_lpa(struct phy_device *phydev); int genphy_c45_read_pma(struct phy_device *phydev); int genphy_c45_pma_setup_forced(struct phy_device *phydev); int genphy_c45_pma_baset1_setup_master_slave(struct phy_device *phydev); int genphy_c45_an_config_aneg(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); int genphy_c45_pma_read_ext_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_abilities(struct phy_device *phydev); int genphy_c45_read_eee_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); int genphy_c45_baset1_read_status(struct phy_device *phydev); int genphy_c45_config_aneg(struct phy_device *phydev); int genphy_c45_loopback(struct phy_device *phydev, bool enable); int genphy_c45_pma_resume(struct phy_device *phydev); int genphy_c45_pma_suspend(struct phy_device *phydev); int genphy_c45_fast_retrain(struct phy_device *phydev, bool enable); int genphy_c45_plca_get_cfg(struct phy_device *phydev, struct phy_plca_cfg *plca_cfg); int genphy_c45_plca_set_cfg(struct phy_device *phydev, const struct phy_plca_cfg *plca_cfg); int genphy_c45_plca_get_status(struct phy_device *phydev, struct phy_plca_status *plca_st); int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *lp); int genphy_c45_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_an_config_eee_aneg(struct phy_device *phydev); /* Generic C45 PHY driver */ extern struct phy_driver genphy_c45_driver; /* The gen10g_* functions are the old Clause 45 stub */ int gen10g_config_aneg(struct phy_device *phydev); static inline int phy_read_status(struct phy_device *phydev) { if (!phydev->drv) return -EIO; if (phydev->drv->read_status) return phydev->drv->read_status(phydev); else return genphy_read_status(phydev); } void phy_driver_unregister(struct phy_driver *drv); void phy_drivers_unregister(struct phy_driver *drv, int n); int phy_driver_register(struct phy_driver *new_driver, struct module *owner); int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner); void phy_error(struct phy_device *phydev); void phy_state_machine(struct work_struct *work); void phy_trigger_machine(struct phy_device *phydev); void phy_mac_interrupt(struct phy_device *phydev); void phy_start_machine(struct phy_device *phydev); void phy_stop_machine(struct phy_device *phydev); void phy_ethtool_ksettings_get(struct phy_device *phydev, struct ethtool_link_ksettings *cmd); int phy_ethtool_ksettings_set(struct phy_device *phydev, const struct ethtool_link_ksettings *cmd); int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd); int phy_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); int phy_do_ioctl_running(struct net_device *dev, struct ifreq *ifr, int cmd); int phy_disable_interrupts(struct phy_device *phydev); void phy_request_interrupt(struct phy_device *phydev); void phy_free_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); int phy_get_rate_matching(struct phy_device *phydev, phy_interface_t iface); void phy_set_max_speed(struct phy_device *phydev, u32 max_speed); void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode); void phy_advertise_supported(struct phy_device *phydev); void phy_advertise_eee_all(struct phy_device *phydev); void phy_support_sym_pause(struct phy_device *phydev); void phy_support_asym_pause(struct phy_device *phydev); void phy_support_eee(struct phy_device *phydev); void phy_disable_eee(struct phy_device *phydev); void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx, bool autoneg); void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx); bool phy_validate_pause(struct phy_device *phydev, struct ethtool_pauseparam *pp); void phy_get_pause(struct phy_device *phydev, bool *tx_pause, bool *rx_pause); s32 phy_get_internal_delay(struct phy_device *phydev, struct device *dev, const int *delay_values, int size, bool is_rx); int phy_get_tx_amplitude_gain(struct phy_device *phydev, struct device *dev, enum ethtool_link_mode_bit_indices linkmode, u32 *val); void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv, bool *tx_pause, bool *rx_pause); int phy_register_fixup_for_id(const char *bus_id, int (*run)(struct phy_device *)); int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask, int (*run)(struct phy_device *)); int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask); int phy_unregister_fixup_for_id(const char *bus_id); int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); int phy_eee_tx_clock_stop_capable(struct phy_device *phydev); int phy_eee_rx_clock_stop(struct phy_device *phydev, bool clk_stop_enable); int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data); int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol); void phy_ethtool_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol); int phy_ethtool_get_link_ksettings(struct net_device *ndev, struct ethtool_link_ksettings *cmd); int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size); int of_phy_package_join(struct phy_device *phydev, size_t priv_size); void phy_package_leave(struct phy_device *phydev); int devm_phy_package_join(struct device *dev, struct phy_device *phydev, int base_addr, size_t priv_size); int devm_of_phy_package_join(struct device *dev, struct phy_device *phydev, size_t priv_size); int __init mdio_bus_init(void); void mdio_bus_exit(void); int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); int phy_ethtool_get_sset_count(struct phy_device *phydev); int phy_ethtool_get_stats(struct phy_device *phydev, struct ethtool_stats *stats, u64 *data); void __phy_ethtool_get_phy_stats(struct phy_device *phydev, struct ethtool_eth_phy_stats *phy_stats, struct ethtool_phy_stats *phydev_stats); void __phy_ethtool_get_link_ext_stats(struct phy_device *phydev, struct ethtool_link_ext_stats *link_stats); int phy_ethtool_get_plca_cfg(struct phy_device *phydev, struct phy_plca_cfg *plca_cfg); int phy_ethtool_set_plca_cfg(struct phy_device *phydev, const struct phy_plca_cfg *plca_cfg, struct netlink_ext_ack *extack); int phy_ethtool_get_plca_status(struct phy_device *phydev, struct phy_plca_status *plca_st); int __phy_hwtstamp_get(struct phy_device *phydev, struct kernel_hwtstamp_config *config); int __phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); static inline int phy_package_address(struct phy_device *phydev, unsigned int addr_offset) { struct phy_package_shared *shared = phydev->shared; u8 base_addr = shared->base_addr; if (addr_offset >= PHY_MAX_ADDR - base_addr) return -EIO; /* we know that addr will be in the range 0..31 and thus the * implicit cast to a signed int is not a problem. */ return base_addr + addr_offset; } static inline int phy_package_read(struct phy_device *phydev, unsigned int addr_offset, u32 regnum) { int addr = phy_package_address(phydev, addr_offset); if (addr < 0) return addr; return mdiobus_read(phydev->mdio.bus, addr, regnum); } static inline int __phy_package_read(struct phy_device *phydev, unsigned int addr_offset, u32 regnum) { int addr = phy_package_address(phydev, addr_offset); if (addr < 0) return addr; return __mdiobus_read(phydev->mdio.bus, addr, regnum); } static inline int phy_package_write(struct phy_device *phydev, unsigned int addr_offset, u32 regnum, u16 val) { int addr = phy_package_address(phydev, addr_offset); if (addr < 0) return addr; return mdiobus_write(phydev->mdio.bus, addr, regnum, val); } static inline int __phy_package_write(struct phy_device *phydev, unsigned int addr_offset, u32 regnum, u16 val) { int addr = phy_package_address(phydev, addr_offset); if (addr < 0) return addr; return __mdiobus_write(phydev->mdio.bus, addr, regnum, val); } int __phy_package_read_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum); int phy_package_read_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum); int __phy_package_write_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum, u16 val); int phy_package_write_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum, u16 val); static inline bool __phy_package_set_once(struct phy_device *phydev, unsigned int b) { struct phy_package_shared *shared = phydev->shared; if (!shared) return false; return !test_and_set_bit(b, &shared->flags); } static inline bool phy_package_init_once(struct phy_device *phydev) { return __phy_package_set_once(phydev, PHY_SHARED_F_INIT_DONE); } static inline bool phy_package_probe_once(struct phy_device *phydev) { return __phy_package_set_once(phydev, PHY_SHARED_F_PROBE_DONE); } extern const struct bus_type mdio_bus_type; struct mdio_board_info { const char *bus_id; char modalias[MDIO_NAME_SIZE]; int mdio_addr; const void *platform_data; }; #if IS_ENABLED(CONFIG_MDIO_DEVICE) int mdiobus_register_board_info(const struct mdio_board_info *info, unsigned int n); #else static inline int mdiobus_register_board_info(const struct mdio_board_info *i, unsigned int n) { return 0; } #endif /** * phy_module_driver() - Helper macro for registering PHY drivers * @__phy_drivers: array of PHY drivers to register * @__count: Numbers of members in array * * Helper macro for PHY drivers which do not do anything special in module * init/exit. Each module may only use this macro once, and calling it * replaces module_init() and module_exit(). */ #define phy_module_driver(__phy_drivers, __count) \ static int __init phy_module_init(void) \ { \ return phy_drivers_register(__phy_drivers, __count, THIS_MODULE); \ } \ module_init(phy_module_init); \ static void __exit phy_module_exit(void) \ { \ phy_drivers_unregister(__phy_drivers, __count); \ } \ module_exit(phy_module_exit) #define module_phy_driver(__phy_drivers) \ phy_module_driver(__phy_drivers, ARRAY_SIZE(__phy_drivers)) bool phy_driver_is_genphy(struct phy_device *phydev); bool phy_driver_is_genphy_10g(struct phy_device *phydev); #endif /* __PHY_H */ |
14 2 8 1 3 7 2 2 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | // SPDX-License-Identifier: GPL-2.0-only #include <net/ip.h> #include <net/tcp.h> #include <net/netfilter/nf_tables.h> #include <linux/netfilter/nfnetlink_osf.h> struct nft_osf { u8 dreg; u8 ttl; u32 flags; }; static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = { [NFTA_OSF_DREG] = { .type = NLA_U32 }, [NFTA_OSF_TTL] = { .type = NLA_U8 }, [NFTA_OSF_FLAGS] = { .type = NLA_U32 }, }; static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_osf *priv = nft_expr_priv(expr); u32 *dest = ®s->data[priv->dreg]; struct sk_buff *skb = pkt->skb; char os_match[NFT_OSF_MAXGENRELEN]; const struct tcphdr *tcp; struct nf_osf_data data; struct tcphdr _tcph; if (pkt->tprot != IPPROTO_TCP) { regs->verdict.code = NFT_BREAK; return; } tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); if (!tcp) { regs->verdict.code = NFT_BREAK; return; } if (!tcp->syn) { regs->verdict.code = NFT_BREAK; return; } if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) { strscpy_pad((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); } else { if (priv->flags & NFT_OSF_F_VERSION) snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s", data.genre, data.version); else strscpy(os_match, data.genre, NFT_OSF_MAXGENRELEN); strscpy_pad((char *)dest, os_match, NFT_OSF_MAXGENRELEN); } } static int nft_osf_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_osf *priv = nft_expr_priv(expr); u32 flags; u8 ttl; if (!tb[NFTA_OSF_DREG]) return -EINVAL; if (tb[NFTA_OSF_TTL]) { ttl = nla_get_u8(tb[NFTA_OSF_TTL]); if (ttl > 2) return -EINVAL; priv->ttl = ttl; } if (tb[NFTA_OSF_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_OSF_FLAGS])); if (flags != NFT_OSF_F_VERSION) return -EINVAL; priv->flags = flags; } return nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); } static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_osf *priv = nft_expr_priv(expr); if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl)) goto nla_put_failure; if (nla_put_u32(skb, NFTA_OSF_FLAGS, ntohl((__force __be32)priv->flags))) goto nla_put_failure; if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_osf_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { unsigned int hooks; switch (ctx->family) { case NFPROTO_IPV4: case NFPROTO_IPV6: case NFPROTO_INET: hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_FORWARD); break; default: return -EOPNOTSUPP; } return nft_chain_validate_hooks(ctx->chain, hooks); } static bool nft_osf_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { struct nft_osf *priv = nft_expr_priv(expr); struct nft_osf *osf; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN); return false; } osf = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->flags != osf->flags || priv->ttl != osf->ttl) { nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN); return false; } if (!track->regs[priv->dreg].bitwise) return true; return false; } static struct nft_expr_type nft_osf_type; static const struct nft_expr_ops nft_osf_op = { .eval = nft_osf_eval, .size = NFT_EXPR_SIZE(sizeof(struct nft_osf)), .init = nft_osf_init, .dump = nft_osf_dump, .type = &nft_osf_type, .validate = nft_osf_validate, .reduce = nft_osf_reduce, }; static struct nft_expr_type nft_osf_type __read_mostly = { .ops = &nft_osf_op, .name = "osf", .owner = THIS_MODULE, .policy = nft_osf_policy, .maxattr = NFTA_OSF_MAX, }; static int __init nft_osf_module_init(void) { return nft_register_expr(&nft_osf_type); } static void __exit nft_osf_module_exit(void) { return nft_unregister_expr(&nft_osf_type); } module_init(nft_osf_module_init); module_exit(nft_osf_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>"); MODULE_ALIAS_NFT_EXPR("osf"); MODULE_DESCRIPTION("nftables passive OS fingerprint support"); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_ESP_H #define _NET_ESP_H #include <linux/skbuff.h> struct ip_esp_hdr; struct xfrm_state; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) { return (struct ip_esp_hdr *)skb_transport_header(skb); } static inline void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) { /* Fill padding... */ if (tfclen) { memset(tail, 0, tfclen); tail += tfclen; } do { int i; for (i = 0; i < plen - 2; i++) tail[i] = i + 1; } while (0); tail[plen - 2] = plen - 2; tail[plen - 1] = proto; } struct esp_info { struct ip_esp_hdr *esph; __be64 seqno; int tfclen; int tailen; int plen; int clen; int len; int nfrags; __u8 proto; bool inplace; }; int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); int esp_input_done2(struct sk_buff *skb, int err); int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); int esp6_input_done2(struct sk_buff *skb, int err); #endif |
120 113 107 17 21 69 18 3 83 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * sha1_base.h - core logic for SHA-1 implementations * * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> */ #ifndef _CRYPTO_SHA1_BASE_H #define _CRYPTO_SHA1_BASE_H #include <crypto/internal/hash.h> #include <crypto/sha1.h> #include <linux/crypto.h> #include <linux/module.h> #include <linux/string.h> #include <linux/unaligned.h> typedef void (sha1_block_fn)(struct sha1_state *sst, u8 const *src, int blocks); static inline int sha1_base_init(struct shash_desc *desc) { struct sha1_state *sctx = shash_desc_ctx(desc); sctx->state[0] = SHA1_H0; sctx->state[1] = SHA1_H1; sctx->state[2] = SHA1_H2; sctx->state[3] = SHA1_H3; sctx->state[4] = SHA1_H4; sctx->count = 0; return 0; } static inline int sha1_base_do_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha1_block_fn *block_fn) { struct sha1_state *sctx = shash_desc_ctx(desc); unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; sctx->count += len; if (unlikely((partial + len) >= SHA1_BLOCK_SIZE)) { int blocks; if (partial) { int p = SHA1_BLOCK_SIZE - partial; memcpy(sctx->buffer + partial, data, p); data += p; len -= p; block_fn(sctx, sctx->buffer, 1); } blocks = len / SHA1_BLOCK_SIZE; len %= SHA1_BLOCK_SIZE; if (blocks) { block_fn(sctx, data, blocks); data += blocks * SHA1_BLOCK_SIZE; } partial = 0; } if (len) memcpy(sctx->buffer + partial, data, len); return 0; } static inline int sha1_base_do_finalize(struct shash_desc *desc, sha1_block_fn *block_fn) { const int bit_offset = SHA1_BLOCK_SIZE - sizeof(__be64); struct sha1_state *sctx = shash_desc_ctx(desc); __be64 *bits = (__be64 *)(sctx->buffer + bit_offset); unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; sctx->buffer[partial++] = 0x80; if (partial > bit_offset) { memset(sctx->buffer + partial, 0x0, SHA1_BLOCK_SIZE - partial); partial = 0; block_fn(sctx, sctx->buffer, 1); } memset(sctx->buffer + partial, 0x0, bit_offset - partial); *bits = cpu_to_be64(sctx->count << 3); block_fn(sctx, sctx->buffer, 1); return 0; } static inline int sha1_base_finish(struct shash_desc *desc, u8 *out) { struct sha1_state *sctx = shash_desc_ctx(desc); __be32 *digest = (__be32 *)out; int i; for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) put_unaligned_be32(sctx->state[i], digest++); memzero_explicit(sctx, sizeof(*sctx)); return 0; } #endif /* _CRYPTO_SHA1_BASE_H */ |
121 1146 104 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RANDOM_H #define _LINUX_RANDOM_H #include <linux/bug.h> #include <linux/kernel.h> #include <linux/list.h> #include <uapi/linux/random.h> struct notifier_block; void add_device_randomness(const void *buf, size_t len); void __init add_bootloader_randomness(const void *buf, size_t len); void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) __latent_entropy; void add_interrupt_randomness(int irq) __latent_entropy; void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after); static inline void add_latent_entropy(void) { #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); #else add_device_randomness(NULL, 0); #endif } #if IS_ENABLED(CONFIG_VMGENID) void add_vmfork_randomness(const void *unique_vm_id, size_t len); int register_random_vmfork_notifier(struct notifier_block *nb); int unregister_random_vmfork_notifier(struct notifier_block *nb); #else static inline int register_random_vmfork_notifier(struct notifier_block *nb) { return 0; } static inline int unregister_random_vmfork_notifier(struct notifier_block *nb) { return 0; } #endif void get_random_bytes(void *buf, size_t len); u8 get_random_u8(void); u16 get_random_u16(void); u32 get_random_u32(void); u64 get_random_u64(void); static inline unsigned long get_random_long(void) { #if BITS_PER_LONG == 64 return get_random_u64(); #else return get_random_u32(); #endif } u32 __get_random_u32_below(u32 ceil); /* * Returns a random integer in the interval [0, ceil), with uniform * distribution, suitable for all uses. Fastest when ceil is a constant, but * still fast for variable ceil as well. */ static inline u32 get_random_u32_below(u32 ceil) { if (!__builtin_constant_p(ceil)) return __get_random_u32_below(ceil); /* * For the fast path, below, all operations on ceil are precomputed by * the compiler, so this incurs no overhead for checking pow2, doing * divisions, or branching based on integer size. The resultant * algorithm does traditional reciprocal multiplication (typically * optimized by the compiler into shifts and adds), rejecting samples * whose lower half would indicate a range indivisible by ceil. */ BUILD_BUG_ON_MSG(!ceil, "get_random_u32_below() must take ceil > 0"); if (ceil <= 1) return 0; for (;;) { if (ceil <= 1U << 8) { u32 mult = ceil * get_random_u8(); if (likely(is_power_of_2(ceil) || (u8)mult >= (1U << 8) % ceil)) return mult >> 8; } else if (ceil <= 1U << 16) { u32 mult = ceil * get_random_u16(); if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil)) return mult >> 16; } else { u64 mult = (u64)ceil * get_random_u32(); if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil)) return mult >> 32; } } } /* * Returns a random integer in the interval (floor, U32_MAX], with uniform * distribution, suitable for all uses. Fastest when floor is a constant, but * still fast for variable floor as well. */ static inline u32 get_random_u32_above(u32 floor) { BUILD_BUG_ON_MSG(__builtin_constant_p(floor) && floor == U32_MAX, "get_random_u32_above() must take floor < U32_MAX"); return floor + 1 + get_random_u32_below(U32_MAX - floor); } /* * Returns a random integer in the interval [floor, ceil], with uniform * distribution, suitable for all uses. Fastest when floor and ceil are * constant, but still fast for variable floor and ceil as well. */ static inline u32 get_random_u32_inclusive(u32 floor, u32 ceil) { BUILD_BUG_ON_MSG(__builtin_constant_p(floor) && __builtin_constant_p(ceil) && (floor > ceil || ceil - floor == U32_MAX), "get_random_u32_inclusive() must take floor <= ceil"); return floor + get_random_u32_below(ceil - floor + 1); } void __init random_init_early(const char *command_line); void __init random_init(void); bool rng_is_initialized(void); int wait_for_random_bytes(void); int execute_with_initialized_rng(struct notifier_block *nb); /* Calls wait_for_random_bytes() and then calls get_random_bytes(buf, nbytes). * Returns the result of the call to wait_for_random_bytes. */ static inline int get_random_bytes_wait(void *buf, size_t nbytes) { int ret = wait_for_random_bytes(); get_random_bytes(buf, nbytes); return ret; } #define declare_get_random_var_wait(name, ret_type) \ static inline int get_random_ ## name ## _wait(ret_type *out) { \ int ret = wait_for_random_bytes(); \ if (unlikely(ret)) \ return ret; \ *out = get_random_ ## name(); \ return 0; \ } declare_get_random_var_wait(u8, u8) declare_get_random_var_wait(u16, u16) declare_get_random_var_wait(u32, u32) declare_get_random_var_wait(u64, u32) declare_get_random_var_wait(long, unsigned long) #undef declare_get_random_var #ifdef CONFIG_SMP int random_prepare_cpu(unsigned int cpu); int random_online_cpu(unsigned int cpu); #endif #ifndef MODULE extern const struct file_operations random_fops, urandom_fops; #endif #endif /* _LINUX_RANDOM_H */ |
132 132 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * smc_sysctl.c: sysctl interface to SMC subsystem. * * Copyright (c) 2022, Alibaba Inc. * * Author: Tony Lu <tonylu@linux.alibaba.com> * */ #include <linux/init.h> #include <linux/sysctl.h> #include <net/net_namespace.h> #include "smc.h" #include "smc_core.h" #include "smc_llc.h" #include "smc_sysctl.h" static int min_sndbuf = SMC_BUF_MIN_SIZE; static int min_rcvbuf = SMC_BUF_MIN_SIZE; static int max_sndbuf = INT_MAX / 2; static int max_rcvbuf = INT_MAX / 2; static const int net_smc_wmem_init = (64 * 1024); static const int net_smc_rmem_init = (64 * 1024); static int links_per_lgr_min = SMC_LINKS_ADD_LNK_MIN; static int links_per_lgr_max = SMC_LINKS_ADD_LNK_MAX; static int conns_per_lgr_min = SMC_CONN_PER_LGR_MIN; static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX; static struct ctl_table smc_table[] = { { .procname = "autocorking_size", .data = &init_net.smc.sysctl_autocorking_size, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec, }, { .procname = "smcr_buf_type", .data = &init_net.smc.sysctl_smcr_buf_type, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "smcr_testlink_time", .data = &init_net.smc.sysctl_smcr_testlink_time, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "wmem", .data = &init_net.smc.sysctl_wmem, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, .extra2 = &max_sndbuf, }, { .procname = "rmem", .data = &init_net.smc.sysctl_rmem, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, .extra2 = &max_rcvbuf, }, { .procname = "smcr_max_links_per_lgr", .data = &init_net.smc.sysctl_max_links_per_lgr, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &links_per_lgr_min, .extra2 = &links_per_lgr_max, }, { .procname = "smcr_max_conns_per_lgr", .data = &init_net.smc.sysctl_max_conns_per_lgr, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &conns_per_lgr_min, .extra2 = &conns_per_lgr_max, }, { .procname = "limit_smc_hs", .data = &init_net.smc.limit_smc_hs, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; int __net_init smc_sysctl_net_init(struct net *net) { size_t table_size = ARRAY_SIZE(smc_table); struct ctl_table *table; table = smc_table; if (!net_eq(net, &init_net)) { int i; table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); if (!table) goto err_alloc; for (i = 0; i < table_size; i++) table[i].data += (void *)net - (void *)&init_net; } net->smc.smc_hdr = register_net_sysctl_sz(net, "net/smc", table, table_size); if (!net->smc.smc_hdr) goto err_reg; net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init); WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init); net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; /* disable handshake limitation by default */ net->smc.limit_smc_hs = 0; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } void __net_exit smc_sysctl_net_exit(struct net *net) { const struct ctl_table *table; table = net->smc.smc_hdr->ctl_table_arg; unregister_net_sysctl_table(net->smc.smc_hdr); if (!net_eq(net, &init_net)) kfree(table); } |
622 621 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 | // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/madvise.c * * Copyright (C) 1999 Linus Torvalds * Copyright (C) 2002 Christoph Hellwig */ #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/mempolicy.h> #include <linux/page-isolation.h> #include <linux/page_idle.h> #include <linux/userfaultfd_k.h> #include <linux/hugetlb.h> #include <linux/falloc.h> #include <linux/fadvise.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mm_inline.h> #include <linux/string.h> #include <linux/uio.h> #include <linux/ksm.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/pagewalk.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> #include <asm/tlb.h> #include "internal.h" #include "swap.h" /* * Maximum number of attempts we make to install guard pages before we give up * and return -ERESTARTNOINTR to have userspace try again. */ #define MAX_MADVISE_GUARD_RETRIES 3 struct madvise_walk_private { struct mmu_gather *tlb; bool pageout; }; /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. Others, which simply traverse vmas, need * to only take it for reading. */ static int madvise_need_mmap_write(int behavior) { switch (behavior) { case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_COLD: case MADV_PAGEOUT: case MADV_FREE: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: case MADV_COLLAPSE: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ return 1; } } #ifdef CONFIG_ANON_VMA_NAME struct anon_vma_name *anon_vma_name_alloc(const char *name) { struct anon_vma_name *anon_name; size_t count; /* Add 1 for NUL terminator at the end of the anon_name->name */ count = strlen(name) + 1; anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); if (anon_name) { kref_init(&anon_name->kref); memcpy(anon_name->name, name, count); } return anon_name; } void anon_vma_name_free(struct kref *kref) { struct anon_vma_name *anon_name = container_of(kref, struct anon_vma_name, kref); kfree(anon_name); } struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); return vma->anon_name; } /* mmap_lock should be write-locked */ static int replace_anon_vma_name(struct vm_area_struct *vma, struct anon_vma_name *anon_name) { struct anon_vma_name *orig_name = anon_vma_name(vma); if (!anon_name) { vma->anon_name = NULL; anon_vma_name_put(orig_name); return 0; } if (anon_vma_name_eq(orig_name, anon_name)) return 0; vma->anon_name = anon_vma_name_reuse(anon_name); anon_vma_name_put(orig_name); return 0; } #else /* CONFIG_ANON_VMA_NAME */ static int replace_anon_vma_name(struct vm_area_struct *vma, struct anon_vma_name *anon_name) { if (anon_name) return -EINVAL; return 0; } #endif /* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_lock held for writing; * Caller should ensure anon_name stability by raising its refcount even when * anon_name belongs to a valid vma because this function might free that vma. */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long new_flags, struct anon_vma_name *anon_name) { struct mm_struct *mm = vma->vm_mm; int error; VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; return 0; } vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags, anon_name); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); vm_flags_reset(vma, new_flags); if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) return error; } return 0; } #ifdef CONFIG_SWAP static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->private; struct swap_iocb *splug = NULL; pte_t *ptep = NULL; spinlock_t *ptl; unsigned long addr; for (addr = start; addr < end; addr += PAGE_SIZE) { pte_t pte; swp_entry_t entry; struct folio *folio; if (!ptep++) { ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!ptep) break; } pte = ptep_get(ptep); if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) continue; pte_unmap_unlock(ptep, ptl); ptep = NULL; folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma, addr, &splug); if (folio) folio_put(folio); } if (ptep) pte_unmap_unlock(ptep, ptl); swap_read_unplug(splug); cond_resched(); return 0; } static const struct mm_walk_ops swapin_walk_ops = { .pmd_entry = swapin_walk_pmd_entry, .walk_lock = PGWALK_RDLOCK, }; static void shmem_swapin_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); pgoff_t end_index = linear_page_index(vma, end) - 1; struct folio *folio; struct swap_iocb *splug = NULL; rcu_read_lock(); xas_for_each(&xas, folio, end_index) { unsigned long addr; swp_entry_t entry; if (!xa_is_value(folio)) continue; entry = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(entry)) continue; addr = vma->vm_start + ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT); xas_pause(&xas); rcu_read_unlock(); folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping), vma, addr, &splug); if (folio) folio_put(folio); rcu_read_lock(); } rcu_read_unlock(); swap_read_unplug(splug); } #endif /* CONFIG_SWAP */ /* * Schedule all required I/O operations. Do not wait for completion. */ static long madvise_willneed(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; loff_t offset; *prev = vma; #ifdef CONFIG_SWAP if (!file) { walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } if (shmem_mapping(file->f_mapping)) { shmem_swapin_range(vma, start, end, file->f_mapping); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } #else if (!file) return -EBADF; #endif if (IS_DAX(file_inode(file))) { /* no bad return value, but ignore advice */ return 0; } /* * Filesystem's fadvise may need to take various locks. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ *prev = NULL; /* tell sys_madvise we drop mmap_lock */ get_file(file); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); mmap_read_unlock(mm); vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); fput(file); mmap_read_lock(mm); return 0; } static inline bool can_do_file_pageout(struct vm_area_struct *vma) { if (!vma->vm_file) return false; /* * paging out pagecache only for non-anonymous mappings that correspond * to the files the calling process could (if tried) open for writing; * otherwise we'd be including shared non-exclusive mappings, which * opens a side channel. */ return inode_owner_or_capable(&nop_mnt_idmap, file_inode(vma->vm_file)) || file_permission(vma->vm_file, MAY_WRITE) == 0; } static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, struct folio *folio, pte_t *ptep, pte_t pte, bool *any_young, bool *any_dirty) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; int max_nr = (end - addr) / PAGE_SIZE; return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, any_young, any_dirty); } static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct madvise_walk_private *private = walk->private; struct mmu_gather *tlb = private->tlb; bool pageout = private->pageout; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; pte_t *start_pte, *pte, ptent; spinlock_t *ptl; struct folio *folio = NULL; LIST_HEAD(folio_list); bool pageout_anon_only_filter; unsigned int batch_count = 0; int nr; if (fatal_signal_pending(current)) return -EINTR; pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && !can_do_file_pageout(vma); #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { pmd_t orig_pmd; unsigned long next = pmd_addr_end(addr, end); tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; orig_pmd = *pmd; if (is_huge_zero_pmd(orig_pmd)) goto huge_unlock; if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(orig_pmd)); goto huge_unlock; } folio = pmd_folio(orig_pmd); /* Do not interfere with other mappings of this folio */ if (folio_likely_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) goto huge_unlock; if (next - addr != HPAGE_PMD_SIZE) { int err; folio_get(folio); spin_unlock(ptl); folio_lock(folio); err = split_folio(folio); folio_unlock(folio); folio_put(folio); if (!err) goto regular_folio; return 0; } if (!pageout && pmd_young(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } folio_clear_referenced(folio); folio_test_clear_young(folio); if (folio_test_active(folio)) folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else list_add(&folio->lru, &folio_list); } } else folio_deactivate(folio); huge_unlock: spin_unlock(ptl); if (pageout) reclaim_pages(&folio_list); return 0; } regular_folio: #endif tlb_change_page_size(tlb, PAGE_SIZE); restart: start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { nr = 1; ptent = ptep_get(pte); if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; } } if (pte_none(ptent)) continue; if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be swapped out whole. If we * fail to split a folio, leave it in place and advance to the * next pte in the range. */ if (folio_test_large(folio)) { bool any_young; nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent, &any_young, NULL); if (any_young) ptent = pte_mkyoung(ptent); if (nr < folio_nr_pages(folio)) { int err; if (folio_likely_mapped_shared(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!folio_trylock(folio)) continue; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break; arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } } /* * Do not interfere with other mappings of this folio and * non-LRU folio. If we have a large folio at this point, we * know it is fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive. */ if (!folio_test_lru(folio) || folio_mapcount(folio) != folio_nr_pages(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!pageout && pte_young(ptent)) { clear_young_dirty_ptes(vma, addr, pte, nr, CYDP_CLEAR_YOUNG); tlb_remove_tlb_entries(tlb, pte, nr, addr); } /* * We are deactivating a folio for accelerating reclaiming. * VM couldn't reclaim the folio unless we clear PG_young. * As a side effect, it makes confuse idle-page tracking * because they will miss recent referenced history. */ folio_clear_referenced(folio); folio_test_clear_young(folio); if (folio_test_active(folio)) folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else list_add(&folio->lru, &folio_list); } } else folio_deactivate(folio); } if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); } if (pageout) reclaim_pages(&folio_list); cond_resched(); return 0; } static const struct mm_walk_ops cold_walk_ops = { .pmd_entry = madvise_cold_or_pageout_pte_range, .walk_lock = PGWALK_RDLOCK, }; static void madvise_cold_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct madvise_walk_private walk_private = { .pageout = false, .tlb = tlb, }; tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); } static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; lru_add_drain(); tlb_gather_mmu(&tlb, mm); madvise_cold_page_range(&tlb, vma, start_addr, end_addr); tlb_finish_mmu(&tlb); return 0; } static void madvise_pageout_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct madvise_walk_private walk_private = { .pageout = true, .tlb = tlb, }; tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } static long madvise_pageout(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; /* * If the VMA belongs to a private file mapping, there can be private * dirty pages which can be paged out if even this process is neither * owner nor write capable of the file. We allow private file mappings * further to pageout dirty anon pages. */ if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && (vma->vm_flags & VM_MAYSHARE))) return 0; lru_add_drain(); tlb_gather_mmu(&tlb, mm); madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); tlb_finish_mmu(&tlb); return 0; } static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY; struct mmu_gather *tlb = walk->private; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte, ptent; struct folio *folio; int nr_swap = 0; unsigned long next; int nr, max_nr; next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) return 0; tlb_change_page_size(tlb, PAGE_SIZE); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) continue; /* * If the pte has swp_entry, just clear page table to * prevent swap-in which is more expensive rather than * (page allocation + zeroing). */ if (!pte_present(ptent)) { swp_entry_t entry; entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; free_swap_and_cache_nr(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be marked as lazyfree. If we * fail to split a folio, leave it in place and advance to the * next pte in the range. */ if (folio_test_large(folio)) { bool any_young, any_dirty; nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent, &any_young, &any_dirty); if (nr < folio_nr_pages(folio)) { int err; if (folio_likely_mapped_shared(folio)) continue; if (!folio_trylock(folio)) continue; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); start_pte = pte; if (!start_pte) break; arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } if (any_young) ptent = pte_mkyoung(ptent); if (any_dirty) ptent = pte_mkdirty(ptent); } if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { if (!folio_trylock(folio)) continue; /* * If we have a large folio at this point, we know it is * fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive. */ if (folio_mapcount(folio) != folio_nr_pages(folio)) { folio_unlock(folio); continue; } if (folio_test_swapcache(folio) && !folio_free_swap(folio)) { folio_unlock(folio); continue; } folio_clear_dirty(folio); folio_unlock(folio); } if (pte_young(ptent) || pte_dirty(ptent)) { clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); tlb_remove_tlb_entries(tlb, pte, nr, addr); } folio_mark_lazyfree(folio); } if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); } cond_resched(); return 0; } static const struct mm_walk_ops madvise_free_walk_ops = { .pmd_entry = madvise_free_pte_range, .walk_lock = PGWALK_RDLOCK, }; static int madvise_free_single_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; struct mmu_gather tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) return -EINVAL; range.start = max(vma->vm_start, start_addr); if (range.start >= vma->vm_end) return -EINVAL; range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, range.start, range.end); lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); tlb_start_vma(&tlb, vma); walk_page_range(vma->vm_mm, range.start, range.end, &madvise_free_walk_ops, &tlb); tlb_end_vma(&tlb, vma); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); return 0; } /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The * zap_page_range_single call sets things up for shrink_active_list to actually * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for * applications like large transactional databases which want to discard * pages in anonymous maps after committing to backing store the data * that was kept in them. There is no reason to write this data out to * the swap area if the application is discarding it. * * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ static long madvise_dontneed_single_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct zap_details details = { .reclaim_pt = true, .even_cows = true, }; zap_page_range_single(vma, start, end - start, &details); return 0; } static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, unsigned long start, unsigned long *end, int behavior) { if (!is_vm_hugetlb_page(vma)) { unsigned int forbidden = VM_PFNMAP; if (behavior != MADV_DONTNEED_LOCKED) forbidden |= VM_LOCKED; return !(vma->vm_flags & forbidden); } if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) return false; if (start & ~huge_page_mask(hstate_vma(vma))) return false; /* * Madvise callers expect the length to be rounded up to PAGE_SIZE * boundaries, and may be unaware that this VMA uses huge pages. * Avoid unexpected data loss by rounding down the number of * huge pages freed. */ *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); return true; } static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { struct mm_struct *mm = vma->vm_mm; *prev = vma; if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; if (start == end) return 0; if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ mmap_read_lock(mm); vma = vma_lookup(mm, start); if (!vma) return -ENOMEM; /* * Potential end adjustment for hugetlb vma is OK as * the check below keeps end within vma. */ if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; if (end > vma->vm_end) { /* * Don't fail if end > vma->vm_end. If the old * vma was split while the mmap_lock was * released the effect of the concurrent * operation may not cause madvise() to * have an undefined result. There may be an * adjacent next vma that we'll walk * next. userfaultfd_remove() will generate an * UFFD_EVENT_REMOVE repetition on the * end-vma->vm_end range, but the manager can * handle a repetition fine. */ end = vma->vm_end; } /* * If the memory region between start and end was * originally backed by 4kB pages and then remapped to * be backed by hugepages while mmap_lock was dropped, * the adjustment for hugetlb vma above may have rounded * end down to the start address. */ if (start == end) return 0; VM_WARN_ON(start > end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) return madvise_dontneed_single_vma(vma, start, end); else if (behavior == MADV_FREE) return madvise_free_single_vma(vma, start, end); else return -EINVAL; } static long madvise_populate(struct mm_struct *mm, unsigned long start, unsigned long end, int behavior) { const bool write = behavior == MADV_POPULATE_WRITE; int locked = 1; long pages; while (start < end) { /* Populate (prefault) page tables readable/writable. */ pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; } if (pages < 0) { switch (pages) { case -EINTR: return -EINTR; case -EINVAL: /* Incompatible mappings / permissions. */ return -EINVAL; case -EHWPOISON: return -EHWPOISON; case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ return -EFAULT; default: pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); fallthrough; case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM; } } start += pages * PAGE_SIZE; } return 0; } /* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. */ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { loff_t offset; int error; struct file *f; struct mm_struct *mm = vma->vm_mm; *prev = NULL; /* tell sys_madvise we drop mmap_lock */ if (vma->vm_flags & VM_LOCKED) return -EINVAL; f = vma->vm_file; if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL; } if (!vma_is_shared_maywrite(vma)) return -EACCES; offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* * Filesystem's fallocate may need to take i_rwsem. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ get_file(f); if (userfaultfd_remove(vma, start, end)) { /* mmap_lock was not released by userfaultfd_remove() */ mmap_read_unlock(mm); } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); mmap_read_lock(mm); return error; } static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) { vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; /* * A user could lock after setting a guard range but that's fine, as * they'd not be able to fault in. The issue arises when we try to zap * existing locked VMAs. We don't want to do that. */ if (!allow_locked) disallowed |= VM_LOCKED; if (!vma_is_anonymous(vma)) return false; if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE) return false; return true; } static bool is_guard_pte_marker(pte_t ptent) { return is_pte_marker(ptent) && is_guard_swp_entry(pte_to_swp_entry(ptent)); } static int guard_install_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge return >0 so we abort the operation + zap. */ return pud_trans_huge(pudval) || pud_devmap(pudval); } static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge return >0 so we abort the operation + zap. */ return pmd_trans_huge(pmdval) || pmd_devmap(pmdval); } static int guard_install_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t pteval = ptep_get(pte); unsigned long *nr_pages = (unsigned long *)walk->private; /* If there is already a guard page marker, we have nothing to do. */ if (is_guard_pte_marker(pteval)) { (*nr_pages)++; return 0; } /* If populated return >0 so we abort the operation + zap. */ return 1; } static int guard_install_set_pte(unsigned long addr, unsigned long next, pte_t *ptep, struct mm_walk *walk) { unsigned long *nr_pages = (unsigned long *)walk->private; /* Simply install a PTE marker, this causes segfault on access. */ *ptep = make_pte_marker(PTE_MARKER_GUARD); (*nr_pages)++; return 0; } static const struct mm_walk_ops guard_install_walk_ops = { .pud_entry = guard_install_pud_entry, .pmd_entry = guard_install_pmd_entry, .pte_entry = guard_install_pte_entry, .install_pte = guard_install_set_pte, .walk_lock = PGWALK_RDLOCK, }; static long madvise_guard_install(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { long err; int i; *prev = vma; if (!is_valid_guard_vma(vma, /* allow_locked = */false)) return -EINVAL; /* * If we install guard markers, then the range is no longer * empty from a page table perspective and therefore it's * appropriate to have an anon_vma. * * This ensures that on fork, we copy page tables correctly. */ err = anon_vma_prepare(vma); if (err) return err; /* * Optimistically try to install the guard marker pages first. If any * non-guard pages are encountered, give up and zap the range before * trying again. * * We try a few times before giving up and releasing back to userland to * loop around, releasing locks in the process to avoid contention. This * would only happen if there was a great many racing page faults. * * In most cases we should simply install the guard markers immediately * with no zap or looping. */ for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { unsigned long nr_pages = 0; /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ err = walk_page_range_mm(vma->vm_mm, start, end, &guard_install_walk_ops, &nr_pages); if (err < 0) return err; if (err == 0) { unsigned long nr_expected_pages = PHYS_PFN(end - start); VM_WARN_ON(nr_pages != nr_expected_pages); return 0; } /* * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ zap_page_range_single(vma, start, end - start, NULL); } /* * We were unable to install the guard pages due to being raced by page * faults. This should not happen ordinarily. We return to userspace and * immediately retry, relieving lock contention. */ return restart_syscall(); } static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge, cannot have guard pages present, so no-op - skip. */ if (pud_trans_huge(pudval) || pud_devmap(pudval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge, cannot have guard pages present, so no-op - skip. */ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t ptent = ptep_get(pte); if (is_guard_pte_marker(ptent)) { /* Simply clear the PTE marker. */ pte_clear_not_present_full(walk->mm, addr, pte, false); update_mmu_cache(walk->vma, addr, pte); } return 0; } static const struct mm_walk_ops guard_remove_walk_ops = { .pud_entry = guard_remove_pud_entry, .pmd_entry = guard_remove_pmd_entry, .pte_entry = guard_remove_pte_entry, .walk_lock = PGWALK_RDLOCK, }; static long madvise_guard_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { *prev = vma; /* * We're ok with removing guards in mlock()'d ranges, as this is a * non-destructive action. */ if (!is_valid_guard_vma(vma, /* allow_locked = */true)) return -EINVAL; return walk_page_range(vma->vm_mm, start, end, &guard_remove_walk_ops, NULL); } /* * Apply an madvise behavior to a region of a vma. madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own * behavior. */ static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long behavior) { int error; struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; if (unlikely(!can_modify_vma_madv(vma, behavior))) return -EPERM; switch (behavior) { case MADV_REMOVE: return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_COLD: return madvise_cold(vma, prev, start, end); case MADV_PAGEOUT: return madvise_pageout(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(vma, prev, start, end, behavior); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; case MADV_SEQUENTIAL: new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; break; case MADV_RANDOM: new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; break; case MADV_DONTFORK: new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: if (vma->vm_flags & VM_IO) return -EINVAL; new_flags &= ~VM_DONTCOPY; break; case MADV_WIPEONFORK: /* MADV_WIPEONFORK is only supported on anonymous memory. */ if (vma->vm_file || vma->vm_flags & VM_SHARED) return -EINVAL; new_flags |= VM_WIPEONFORK; break; case MADV_KEEPONFORK: if (vma->vm_flags & VM_DROPPABLE) return -EINVAL; new_flags &= ~VM_WIPEONFORK; break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) || (vma->vm_flags & VM_DROPPABLE)) return -EINVAL; new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); if (error) goto out; break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); if (error) goto out; break; case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); case MADV_GUARD_INSTALL: return madvise_guard_install(vma, prev, start, end); case MADV_GUARD_REMOVE: return madvise_guard_remove(vma, prev, start, end); } anon_name = anon_vma_name(vma); anon_vma_name_get(anon_name); error = madvise_update_vma(vma, prev, start, end, new_flags, anon_name); anon_vma_name_put(anon_name); out: /* * madvise() returns EAGAIN if kernel resources, such as * slab, are temporarily unavailable. */ if (error == -ENOMEM) error = -EAGAIN; return error; } #ifdef CONFIG_MEMORY_FAILURE /* * Error injection support for memory error handling. */ static int madvise_inject_error(int behavior, unsigned long start, unsigned long end) { unsigned long size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; for (; start < end; start += size) { unsigned long pfn; struct page *page; int ret; ret = get_user_pages_fast(start, 1, 0, &page); if (ret != 1) return ret; pfn = page_to_pfn(page); /* * When soft offlining hugepages, after migrating the page * we dissolve it, therefore in the second loop "page" will * no longer be a compound page. */ size = page_size(compound_head(page)); if (behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); ret = soft_offline_page(pfn, MF_COUNT_INCREASED); } else { pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", pfn, start); ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED); if (ret == -EOPNOTSUPP) ret = 0; } if (ret) return ret; } return 0; } #endif static bool madvise_behavior_valid(int behavior) { switch (behavior) { case MADV_DOFORK: case MADV_DONTFORK: case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: case MADV_COLD: case MADV_PAGEOUT: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: case MADV_COLLAPSE: #endif case MADV_DONTDUMP: case MADV_DODUMP: case MADV_WIPEONFORK: case MADV_KEEPONFORK: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: #endif return true; default: return false; } } /* Can we invoke process_madvise() on a remote mm for the specified behavior? */ static bool process_madvise_remote_valid(int behavior) { switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: case MADV_COLLAPSE: return true; default: return false; } } /* * Walk the vmas in range [start,end), and call the visit function on each one. * The visit function will get start and end parameters that cover the overlap * between the current vma and the original range. Any unmapped regions in the * original range will result in this function returning -ENOMEM while still * calling the visit function on all of the existing vmas in the range. * Must be called with the mmap_lock held for reading or writing. */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long arg)) { struct vm_area_struct *vma; struct vm_area_struct *prev; unsigned long tmp; int unmapped_error = 0; /* * If the interval [start,end) covers some unmapped address * ranges, just ignore them, but return -ENOMEM at the end. * - different from the way of handling in mlock etc. */ vma = find_vma_prev(mm, start, &prev); if (vma && start > vma->vm_start) prev = vma; for (;;) { int error; /* Still start < end. */ if (!vma) return -ENOMEM; /* Here start < (end|vma->vm_end). */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; start = vma->vm_start; if (start >= end) break; } /* Here vma->vm_start <= start < (end|vma->vm_end) */ tmp = vma->vm_end; if (end < tmp) tmp = end; /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ error = visit(vma, &prev, start, tmp, arg); if (error) return error; start = tmp; if (prev && start < prev->vm_end) start = prev->vm_end; if (start >= end) break; if (prev) vma = find_vma(mm, prev->vm_end); else /* madvise_remove dropped mmap_lock */ vma = find_vma(mm, start); } return unmapped_error; } #ifdef CONFIG_ANON_VMA_NAME static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long anon_name) { int error; /* Only anonymous mappings can be named */ if (vma->vm_file && !vma_is_anon_shmem(vma)) return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, (struct anon_vma_name *)anon_name); /* * madvise() returns EAGAIN if kernel resources, such as * slab, are temporarily unavailable. */ if (error == -ENOMEM) error = -EAGAIN; return error; } int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name) { unsigned long end; unsigned long len; if (start & ~PAGE_MASK) return -EINVAL; len = (len_in + ~PAGE_MASK) & PAGE_MASK; /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ /* * The madvise(2) system call. * * Applications can use madvise() to advise the kernel how it should * handle paging I/O in this VM area. The idea is to help the kernel * use appropriate read-ahead and caching techniques. The information * provided is advisory only, and can be safely disregarded by the * kernel without affecting the correct operation of the application. * * behavior values: * MADV_NORMAL - the default behavior is to read clusters. This * results in some read-ahead and read-behind. * MADV_RANDOM - the system should read the minimum amount of data * on any access, since it is unlikely that the appli- * cation will need more than what it asks for. * MADV_SEQUENTIAL - pages in the given range will probably be accessed * once, so they can be aggressively read ahead, and * can be freed soon after they are accessed. * MADV_WILLNEED - the application is notifying the system to read * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. * MADV_FREE - the application marks pages in the given range as lazy free, * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. * MADV_WIPEONFORK - present the child process with zero-filled memory in this * range after a fork. * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK * MADV_HWPOISON - trigger memory error handler as if the given memory range * were corrupted by unrecoverable hardware memory failure. * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * MADV_HUGEPAGE - the application wants to back the given range by transparent * huge pages in the future. Existing pages might be coalesced and * new pages might be allocated as THP. * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * MADV_COLD - the application is not expected to use this memory soon, * deactivate pages in this range so that they can be reclaimed * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. * MADV_POPULATE_READ - populate (prefault) page tables readable by * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by * triggering write faults if required * * return values: * zero - success * -EINVAL - start + len < 0, start is not page-aligned, * "behavior" is not a valid value, or application * is attempting to release locked or shared pages, * or the specified address range includes file, Huge TLB, * MAP_SHARED or VMPFNMAP range. * -ENOMEM - addresses in the specified range are not currently * mapped, or are outside the AS of the process. * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. * -EPERM - memory is sealed. */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { unsigned long end; int error; int write; size_t len; struct blk_plug plug; if (!madvise_behavior_valid(behavior)) return -EINVAL; if (!PAGE_ALIGNED(start)) return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) return madvise_inject_error(behavior, start, start + len_in); #endif write = madvise_need_mmap_write(behavior); if (write) { if (mmap_write_lock_killable(mm)) return -EINTR; } else { mmap_read_lock(mm); } start = untagged_addr_remote(mm, start); end = start + len; blk_start_plug(&plug); switch (behavior) { case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: error = madvise_populate(mm, start, end, behavior); break; default: error = madvise_walk_vmas(mm, start, end, behavior, madvise_vma_behavior); break; } blk_finish_plug(&plug); if (write) mmap_write_unlock(mm); else mmap_read_unlock(mm); return error; } SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { return do_madvise(current->mm, start, len_in, behavior); } /* Perform an madvise operation over a vector of addresses and lengths. */ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, int behavior) { ssize_t ret = 0; size_t total_len; total_len = iov_iter_count(iter); while (iov_iter_count(iter)) { ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), iter_iov_len(iter), behavior); /* * An madvise operation is attempting to restart the syscall, * but we cannot proceed as it would not be correct to repeat * the operation in aggregate, and would be surprising to the * user. * * As we have already dropped locks, it is safe to just loop and * try again. We check for fatal signals in case we need exit * early anyway. */ if (ret == -ERESTARTNOINTR) { if (fatal_signal_pending(current)) { ret = -EINTR; break; } continue; } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); } ret = (total_len - iov_iter_count(iter)) ? : ret; return ret; } SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { ssize_t ret; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; struct task_struct *task; struct mm_struct *mm; unsigned int f_flags; if (flags != 0) { ret = -EINVAL; goto out; } ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out; task = pidfd_get_task(pidfd, &f_flags); if (IS_ERR(task)) { ret = PTR_ERR(task); goto free_iov; } /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) { ret = PTR_ERR(mm); goto release_task; } /* * We need only perform this check if we are attempting to manipulate a * remote process's address space. */ if (mm != current->mm && !process_madvise_remote_valid(behavior)) { ret = -EINVAL; goto release_mm; } /* * Require CAP_SYS_NICE for influencing process performance. Note that * only non-destructive hints are currently supported for remote * processes. */ if (mm != current->mm && !capable(CAP_SYS_NICE)) { ret = -EPERM; goto release_mm; } ret = vector_madvise(mm, &iter, behavior); release_mm: mmput(mm); release_task: put_task_struct(task); free_iov: kfree(iov); out: return ret; } |
105 101 101 1 100 101 100 100 1 1 1 1 1 95 94 95 1 1 1 1 1 1 93 95 2 2 1 1 1 2 2 12 1 4 4 3 2 4 2 2 1 1 76 76 117 94 93 2 3 7 7 74 108 8 106 9 53 7 47 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 | /* * net/tipc/bcast.c: TIPC broadcast code * * Copyright (c) 2004-2006, 2014-2017, Ericsson AB * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <linux/tipc_config.h> #include "socket.h" #include "msg.h" #include "bcast.h" #include "link.h" #include "name_table.h" #define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */ #define BCLINK_WIN_MIN 32 /* bcast minimum link window size */ const char tipc_bclink_name[] = "broadcast-link"; unsigned long sysctl_tipc_bc_retruni __read_mostly; /** * struct tipc_bc_base - base structure for keeping broadcast send state * @link: broadcast send link structure * @inputq: data input queue; will only carry SOCK_WAKEUP messages * @dests: array keeping number of reachable destinations per bearer * @primary_bearer: a bearer having links to all broadcast destinations, if any * @bcast_support: indicates if primary bearer, if any, supports broadcast * @force_bcast: forces broadcast for multicast traffic * @rcast_support: indicates if all peer nodes support replicast * @force_rcast: forces replicast for multicast traffic * @rc_ratio: dest count as percentage of cluster size where send method changes * @bc_threshold: calculated from rc_ratio; if dests > threshold use broadcast */ struct tipc_bc_base { struct tipc_link *link; struct sk_buff_head inputq; int dests[MAX_BEARERS]; int primary_bearer; bool bcast_support; bool force_bcast; bool rcast_support; bool force_rcast; int rc_ratio; int bc_threshold; }; static struct tipc_bc_base *tipc_bc_base(struct net *net) { return tipc_net(net)->bcbase; } /* tipc_bcast_get_mtu(): -get the MTU currently used by broadcast link * Note: the MTU is decremented to give room for a tunnel header, in * case the message needs to be sent as replicast */ int tipc_bcast_get_mtu(struct net *net) { return tipc_link_mss(tipc_bc_sndlink(net)); } void tipc_bcast_toggle_rcast(struct net *net, bool supp) { tipc_bc_base(net)->rcast_support = supp; } static void tipc_bcbase_calc_bc_threshold(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); int cluster_size = tipc_link_bc_peers(tipc_bc_sndlink(net)); bb->bc_threshold = 1 + (cluster_size * bb->rc_ratio / 100); } /* tipc_bcbase_select_primary(): find a bearer with links to all destinations, * if any, and make it primary bearer */ static void tipc_bcbase_select_primary(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); int all_dests = tipc_link_bc_peers(bb->link); int max_win = tipc_link_max_win(bb->link); int min_win = tipc_link_min_win(bb->link); int i, mtu, prim; bb->primary_bearer = INVALID_BEARER_ID; bb->bcast_support = true; if (!all_dests) return; for (i = 0; i < MAX_BEARERS; i++) { if (!bb->dests[i]) continue; mtu = tipc_bearer_mtu(net, i); if (mtu < tipc_link_mtu(bb->link)) { tipc_link_set_mtu(bb->link, mtu); tipc_link_set_queue_limits(bb->link, min_win, max_win); } bb->bcast_support &= tipc_bearer_bcast_support(net, i); if (bb->dests[i] < all_dests) continue; bb->primary_bearer = i; /* Reduce risk that all nodes select same primary */ if ((i ^ tipc_own_addr(net)) & 1) break; } prim = bb->primary_bearer; if (prim != INVALID_BEARER_ID) bb->bcast_support = tipc_bearer_bcast_support(net, prim); } void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id) { struct tipc_bc_base *bb = tipc_bc_base(net); tipc_bcast_lock(net); bb->dests[bearer_id]++; tipc_bcbase_select_primary(net); tipc_bcast_unlock(net); } void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id) { struct tipc_bc_base *bb = tipc_bc_base(net); tipc_bcast_lock(net); bb->dests[bearer_id]--; tipc_bcbase_select_primary(net); tipc_bcast_unlock(net); } /* tipc_bcbase_xmit - broadcast a packet queue across one or more bearers * * Note that number of reachable destinations, as indicated in the dests[] * array, may transitionally differ from the number of destinations indicated * in each sent buffer. We can sustain this. Excess destination nodes will * drop and never acknowledge the unexpected packets, and missing destinations * will either require retransmission (if they are just about to be added to * the bearer), or be removed from the buffer's 'ackers' counter (if they * just went down) */ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq) { int bearer_id; struct tipc_bc_base *bb = tipc_bc_base(net); struct sk_buff *skb, *_skb; struct sk_buff_head _xmitq; if (skb_queue_empty(xmitq)) return; /* The typical case: at least one bearer has links to all nodes */ bearer_id = bb->primary_bearer; if (bearer_id >= 0) { tipc_bearer_bc_xmit(net, bearer_id, xmitq); return; } /* We have to transmit across all bearers */ __skb_queue_head_init(&_xmitq); for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { if (!bb->dests[bearer_id]) continue; skb_queue_walk(xmitq, skb) { _skb = pskb_copy_for_clone(skb, GFP_ATOMIC); if (!_skb) break; __skb_queue_tail(&_xmitq, _skb); } tipc_bearer_bc_xmit(net, bearer_id, &_xmitq); } __skb_queue_purge(xmitq); __skb_queue_purge(&_xmitq); } static void tipc_bcast_select_xmit_method(struct net *net, int dests, struct tipc_mc_method *method) { struct tipc_bc_base *bb = tipc_bc_base(net); unsigned long exp = method->expires; /* Broadcast supported by used bearer/bearers? */ if (!bb->bcast_support) { method->rcast = true; return; } /* Any destinations which don't support replicast ? */ if (!bb->rcast_support) { method->rcast = false; return; } /* Can current method be changed ? */ method->expires = jiffies + TIPC_METHOD_EXPIRE; if (method->mandatory) return; if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL) && time_before(jiffies, exp)) return; /* Configuration as force 'broadcast' method */ if (bb->force_bcast) { method->rcast = false; return; } /* Configuration as force 'replicast' method */ if (bb->force_rcast) { method->rcast = true; return; } /* Configuration as 'autoselect' or default method */ /* Determine method to use now */ method->rcast = dests <= bb->bc_threshold; } /* tipc_bcast_xmit - broadcast the buffer chain to all external nodes * @net: the applicable net namespace * @pkts: chain of buffers containing message * @cong_link_cnt: set to 1 if broadcast link is congested, otherwise 0 * Consumes the buffer chain. * Returns 0 if success, otherwise errno: -EHOSTUNREACH,-EMSGSIZE */ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, u16 *cong_link_cnt) { struct tipc_link *l = tipc_bc_sndlink(net); struct sk_buff_head xmitq; int rc = 0; __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); if (tipc_link_bc_peers(l)) rc = tipc_link_xmit(l, pkts, &xmitq); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); __skb_queue_purge(pkts); if (rc == -ELINKCONG) { *cong_link_cnt = 1; rc = 0; } return rc; } /* tipc_rcast_xmit - replicate and send a message to given destination nodes * @net: the applicable net namespace * @pkts: chain of buffers containing message * @dests: list of destination nodes * @cong_link_cnt: returns number of congested links * @cong_links: returns identities of congested links * Returns 0 if success, otherwise errno */ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_nlist *dests, u16 *cong_link_cnt) { struct tipc_dest *dst, *tmp; struct sk_buff_head _pkts; u32 dnode, selector; selector = msg_link_selector(buf_msg(skb_peek(pkts))); __skb_queue_head_init(&_pkts); list_for_each_entry_safe(dst, tmp, &dests->list, list) { dnode = dst->node; if (!tipc_msg_pskb_copy(dnode, pkts, &_pkts)) return -ENOMEM; /* Any other return value than -ELINKCONG is ignored */ if (tipc_node_xmit(net, &_pkts, dnode, selector) == -ELINKCONG) (*cong_link_cnt)++; } return 0; } /* tipc_mcast_send_sync - deliver a dummy message with SYN bit * @net: the applicable net namespace * @skb: socket buffer to copy * @method: send method to be used * @dests: destination nodes for message. * Returns 0 if success, otherwise errno */ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, struct tipc_mc_method *method, struct tipc_nlist *dests) { struct tipc_msg *hdr, *_hdr; struct sk_buff_head tmpq; u16 cong_link_cnt = 0; struct sk_buff *_skb; int rc = 0; /* Is a cluster supporting with new capabilities ? */ if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL)) return 0; hdr = buf_msg(skb); if (msg_user(hdr) == MSG_FRAGMENTER) hdr = msg_inner_hdr(hdr); if (msg_type(hdr) != TIPC_MCAST_MSG) return 0; /* Allocate dummy message */ _skb = tipc_buf_acquire(MCAST_H_SIZE, GFP_KERNEL); if (!_skb) return -ENOMEM; /* Preparing for 'synching' header */ msg_set_syn(hdr, 1); /* Copy skb's header into a dummy header */ skb_copy_to_linear_data(_skb, hdr, MCAST_H_SIZE); skb_orphan(_skb); /* Reverse method for dummy message */ _hdr = buf_msg(_skb); msg_set_size(_hdr, MCAST_H_SIZE); msg_set_is_rcast(_hdr, !msg_is_rcast(hdr)); msg_set_errcode(_hdr, TIPC_ERR_NO_PORT); __skb_queue_head_init(&tmpq); __skb_queue_tail(&tmpq, _skb); if (method->rcast) rc = tipc_bcast_xmit(net, &tmpq, &cong_link_cnt); else rc = tipc_rcast_xmit(net, &tmpq, dests, &cong_link_cnt); /* This queue should normally be empty by now */ __skb_queue_purge(&tmpq); return rc; } /* tipc_mcast_xmit - deliver message to indicated destination nodes * and to identified node local sockets * @net: the applicable net namespace * @pkts: chain of buffers containing message * @method: send method to be used * @dests: destination nodes for message. * @cong_link_cnt: returns number of encountered congested destination links * Consumes buffer chain. * Returns 0 if success, otherwise errno */ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_mc_method *method, struct tipc_nlist *dests, u16 *cong_link_cnt) { struct sk_buff_head inputq, localq; bool rcast = method->rcast; struct tipc_msg *hdr; struct sk_buff *skb; int rc = 0; skb_queue_head_init(&inputq); __skb_queue_head_init(&localq); /* Clone packets before they are consumed by next call */ if (dests->local && !tipc_msg_reassemble(pkts, &localq)) { rc = -ENOMEM; goto exit; } /* Send according to determined transmit method */ if (dests->remote) { tipc_bcast_select_xmit_method(net, dests->remote, method); skb = skb_peek(pkts); hdr = buf_msg(skb); if (msg_user(hdr) == MSG_FRAGMENTER) hdr = msg_inner_hdr(hdr); msg_set_is_rcast(hdr, method->rcast); /* Switch method ? */ if (rcast != method->rcast) { rc = tipc_mcast_send_sync(net, skb, method, dests); if (unlikely(rc)) { pr_err("Unable to send SYN: method %d, rc %d\n", rcast, rc); goto exit; } } if (method->rcast) rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt); else rc = tipc_bcast_xmit(net, pkts, cong_link_cnt); } if (dests->local) { tipc_loopback_trace(net, &localq); tipc_sk_mcast_rcv(net, &localq, &inputq); } exit: /* This queue should normally be empty by now */ __skb_queue_purge(pkts); return rc; } /* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link * * RCU is locked, no other locks set */ int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb) { struct tipc_msg *hdr = buf_msg(skb); struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; struct sk_buff_head xmitq; int rc; __skb_queue_head_init(&xmitq); if (msg_mc_netid(hdr) != tipc_netid(net) || !tipc_link_is_up(l)) { kfree_skb(skb); return 0; } tipc_bcast_lock(net); if (msg_user(hdr) == BCAST_PROTOCOL) rc = tipc_link_bc_nack_rcv(l, skb, &xmitq); else rc = tipc_link_rcv(l, skb, NULL); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); /* Any socket wakeup messages ? */ if (!skb_queue_empty(inputq)) tipc_sk_rcv(net, inputq); return rc; } /* tipc_bcast_ack_rcv - receive and handle a broadcast acknowledge * * RCU is locked, no other locks set */ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr) { struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; u16 acked = msg_bcast_ack(hdr); struct sk_buff_head xmitq; /* Ignore bc acks sent by peer before bcast synch point was received */ if (msg_bc_ack_invalid(hdr)) return; __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); tipc_link_bc_ack_rcv(l, acked, 0, NULL, &xmitq, NULL); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); /* Any socket wakeup messages ? */ if (!skb_queue_empty(inputq)) tipc_sk_rcv(net, inputq); } /* tipc_bcast_synch_rcv - check and update rcv link with peer's send state * * RCU is locked, no other locks set */ int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *retrq) { struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; struct tipc_gap_ack_blks *ga; struct sk_buff_head xmitq; int rc = 0; __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); if (msg_type(hdr) != STATE_MSG) { tipc_link_bc_init_rcv(l, hdr); } else if (!msg_bc_ack_invalid(hdr)) { tipc_get_gap_ack_blks(&ga, l, hdr, false); if (!sysctl_tipc_bc_retruni) retrq = &xmitq; rc = tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), msg_bc_gap(hdr), ga, &xmitq, retrq); rc |= tipc_link_bc_sync_rcv(l, hdr, &xmitq); } tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); /* Any socket wakeup messages ? */ if (!skb_queue_empty(inputq)) tipc_sk_rcv(net, inputq); return rc; } /* tipc_bcast_add_peer - add a peer node to broadcast link and bearer * * RCU is locked, node lock is set */ void tipc_bcast_add_peer(struct net *net, struct tipc_link *uc_l, struct sk_buff_head *xmitq) { struct tipc_link *snd_l = tipc_bc_sndlink(net); tipc_bcast_lock(net); tipc_link_add_bc_peer(snd_l, uc_l, xmitq); tipc_bcbase_select_primary(net); tipc_bcbase_calc_bc_threshold(net); tipc_bcast_unlock(net); } /* tipc_bcast_remove_peer - remove a peer node from broadcast link and bearer * * RCU is locked, node lock is set */ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l) { struct tipc_link *snd_l = tipc_bc_sndlink(net); struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq); tipc_bcbase_select_primary(net); tipc_bcbase_calc_bc_threshold(net); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); /* Any socket wakeup messages ? */ if (!skb_queue_empty(inputq)) tipc_sk_rcv(net, inputq); } int tipc_bclink_reset_stats(struct net *net, struct tipc_link *l) { if (!l) return -ENOPROTOOPT; tipc_bcast_lock(net); tipc_link_reset_stats(l); tipc_bcast_unlock(net); return 0; } static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win) { struct tipc_link *l = tipc_bc_sndlink(net); if (!l) return -ENOPROTOOPT; if (max_win < BCLINK_WIN_MIN) max_win = BCLINK_WIN_MIN; if (max_win > TIPC_MAX_LINK_WIN) return -EINVAL; tipc_bcast_lock(net); tipc_link_set_queue_limits(l, tipc_link_min_win(l), max_win); tipc_bcast_unlock(net); return 0; } static int tipc_bc_link_set_broadcast_mode(struct net *net, u32 bc_mode) { struct tipc_bc_base *bb = tipc_bc_base(net); switch (bc_mode) { case BCLINK_MODE_BCAST: if (!bb->bcast_support) return -ENOPROTOOPT; bb->force_bcast = true; bb->force_rcast = false; break; case BCLINK_MODE_RCAST: if (!bb->rcast_support) return -ENOPROTOOPT; bb->force_bcast = false; bb->force_rcast = true; break; case BCLINK_MODE_SEL: if (!bb->bcast_support || !bb->rcast_support) return -ENOPROTOOPT; bb->force_bcast = false; bb->force_rcast = false; break; default: return -EINVAL; } return 0; } static int tipc_bc_link_set_broadcast_ratio(struct net *net, u32 bc_ratio) { struct tipc_bc_base *bb = tipc_bc_base(net); if (!bb->bcast_support || !bb->rcast_support) return -ENOPROTOOPT; if (bc_ratio > 100 || bc_ratio <= 0) return -EINVAL; bb->rc_ratio = bc_ratio; tipc_bcast_lock(net); tipc_bcbase_calc_bc_threshold(net); tipc_bcast_unlock(net); return 0; } int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]) { int err; u32 win; u32 bc_mode; u32 bc_ratio; struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; if (!attrs[TIPC_NLA_LINK_PROP]) return -EINVAL; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); if (err) return err; if (!props[TIPC_NLA_PROP_WIN] && !props[TIPC_NLA_PROP_BROADCAST] && !props[TIPC_NLA_PROP_BROADCAST_RATIO]) { return -EOPNOTSUPP; } if (props[TIPC_NLA_PROP_BROADCAST]) { bc_mode = nla_get_u32(props[TIPC_NLA_PROP_BROADCAST]); err = tipc_bc_link_set_broadcast_mode(net, bc_mode); } if (!err && props[TIPC_NLA_PROP_BROADCAST_RATIO]) { bc_ratio = nla_get_u32(props[TIPC_NLA_PROP_BROADCAST_RATIO]); err = tipc_bc_link_set_broadcast_ratio(net, bc_ratio); } if (!err && props[TIPC_NLA_PROP_WIN]) { win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); err = tipc_bc_link_set_queue_limits(net, win); } return err; } int tipc_bcast_init(struct net *net) { struct tipc_net *tn = tipc_net(net); struct tipc_bc_base *bb = NULL; struct tipc_link *l = NULL; bb = kzalloc(sizeof(*bb), GFP_KERNEL); if (!bb) goto enomem; tn->bcbase = bb; spin_lock_init(&tipc_net(net)->bclock); if (!tipc_link_bc_create(net, 0, 0, NULL, one_page_mtu, BCLINK_WIN_DEFAULT, BCLINK_WIN_DEFAULT, 0, &bb->inputq, NULL, NULL, &l)) goto enomem; bb->link = l; tn->bcl = l; bb->rc_ratio = 10; bb->rcast_support = true; return 0; enomem: kfree(bb); kfree(l); return -ENOMEM; } void tipc_bcast_stop(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); synchronize_net(); kfree(tn->bcbase); kfree(tn->bcl); } void tipc_nlist_init(struct tipc_nlist *nl, u32 self) { memset(nl, 0, sizeof(*nl)); INIT_LIST_HEAD(&nl->list); nl->self = self; } void tipc_nlist_add(struct tipc_nlist *nl, u32 node) { if (node == nl->self) nl->local = true; else if (tipc_dest_push(&nl->list, node, 0)) nl->remote++; } void tipc_nlist_del(struct tipc_nlist *nl, u32 node) { if (node == nl->self) nl->local = false; else if (tipc_dest_del(&nl->list, node, 0)) nl->remote--; } void tipc_nlist_purge(struct tipc_nlist *nl) { tipc_dest_list_purge(&nl->list); nl->remote = 0; nl->local = false; } u32 tipc_bcast_get_mode(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); if (bb->force_bcast) return BCLINK_MODE_BCAST; if (bb->force_rcast) return BCLINK_MODE_RCAST; if (bb->bcast_support && bb->rcast_support) return BCLINK_MODE_SEL; return 0; } u32 tipc_bcast_get_broadcast_ratio(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); return bb->rc_ratio; } void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, struct sk_buff_head *inputq) { struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr, *_hdr; bool match = false; u32 node, port; skb = skb_peek(inputq); if (!skb) return; hdr = buf_msg(skb); if (likely(!msg_is_syn(hdr) && skb_queue_empty(defq))) return; node = msg_orignode(hdr); if (node == tipc_own_addr(net)) return; port = msg_origport(hdr); /* Has the twin SYN message already arrived ? */ skb_queue_walk(defq, _skb) { _hdr = buf_msg(_skb); if (msg_orignode(_hdr) != node) continue; if (msg_origport(_hdr) != port) continue; match = true; break; } if (!match) { if (!msg_is_syn(hdr)) return; __skb_dequeue(inputq); __skb_queue_tail(defq, skb); return; } /* Deliver non-SYN message from other link, otherwise queue it */ if (!msg_is_syn(hdr)) { if (msg_is_rcast(hdr) != msg_is_rcast(_hdr)) return; __skb_dequeue(inputq); __skb_queue_tail(defq, skb); return; } /* Queue non-SYN/SYN message from same link */ if (msg_is_rcast(hdr) == msg_is_rcast(_hdr)) { __skb_dequeue(inputq); __skb_queue_tail(defq, skb); return; } /* Matching SYN messages => return the one with data, if any */ __skb_unlink(_skb, defq); if (msg_data_sz(hdr)) { kfree_skb(_skb); } else { __skb_dequeue(inputq); kfree_skb(skb); __skb_queue_tail(inputq, _skb); } /* Deliver subsequent non-SYN messages from same peer */ skb_queue_walk_safe(defq, _skb, tmp) { _hdr = buf_msg(_skb); if (msg_orignode(_hdr) != node) continue; if (msg_origport(_hdr) != port) continue; if (msg_is_syn(_hdr)) break; __skb_unlink(_skb, defq); __skb_queue_tail(inputq, _skb); } } |
72 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_ERR_H #define _LINUX_ERR_H #include <linux/compiler.h> #include <linux/types.h> #include <asm/errno.h> /* * Kernel pointers have redundant information, so we can use a * scheme where we can return either an error code or a normal * pointer with the same return value. * * This should be a per-architecture thing, to allow different * error and pointer decisions. */ #define MAX_ERRNO 4095 #ifndef __ASSEMBLY__ /** * IS_ERR_VALUE - Detect an error pointer. * @x: The pointer to check. * * Like IS_ERR(), but does not generate a compiler warning if result is unused. */ #define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO) /** * ERR_PTR - Create an error pointer. * @error: A negative error code. * * Encodes @error into a pointer value. Users should consider the result * opaque and not assume anything about how the error is encoded. * * Return: A pointer with @error encoded within its value. */ static inline void * __must_check ERR_PTR(long error) { return (void *) error; } /* Return the pointer in the percpu address space. */ #define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) /** * PTR_ERR - Extract the error code from an error pointer. * @ptr: An error pointer. * Return: The error code within @ptr. */ static inline long __must_check PTR_ERR(__force const void *ptr) { return (long) ptr; } /* Read an error pointer from the percpu address space. */ #define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr))) /** * IS_ERR - Detect an error pointer. * @ptr: The pointer to check. * Return: true if @ptr is an error pointer, false otherwise. */ static inline bool __must_check IS_ERR(__force const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } /* Read an error pointer from the percpu address space. */ #define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr))) /** * IS_ERR_OR_NULL - Detect an error pointer or a null pointer. * @ptr: The pointer to check. * * Like IS_ERR(), but also returns true for a null pointer. */ static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) { return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr); } /** * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type * @ptr: The pointer to cast. * * Explicitly cast an error-valued pointer to another pointer type in such a * way as to make it clear that's what's going on. */ static inline void * __must_check ERR_CAST(__force const void *ptr) { /* cast away the const */ return (void *) ptr; } /** * PTR_ERR_OR_ZERO - Extract the error code from a pointer if it has one. * @ptr: A potential error pointer. * * Convenience function that can be used inside a function that returns * an error code to propagate errors received as error pointers. * For example, ``return PTR_ERR_OR_ZERO(ptr);`` replaces: * * .. code-block:: c * * if (IS_ERR(ptr)) * return PTR_ERR(ptr); * else * return 0; * * Return: The error code within @ptr if it is an error pointer; 0 otherwise. */ static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) { if (IS_ERR(ptr)) return PTR_ERR(ptr); else return 0; } #endif #endif /* _LINUX_ERR_H */ |
37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/bad_inode.c * * Copyright (C) 1997, Stephen Tweedie * * Provide stub functions for unreadable inodes * * Fabian Frederick : August 2003 - All file operations assigned to EIO */ #include <linux/fs.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/fiemap.h> static int bad_file_open(struct inode *inode, struct file *filp) { return -EIO; } static const struct file_operations bad_file_ops = { .open = bad_file_open, }; static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return -EIO; } static struct dentry *bad_inode_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { return -EIO; } static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return -EIO; } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } static int bad_inode_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EIO; } static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, int buflen) { return -EIO; } static int bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EIO; } static int bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } static int bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { return -EIO; } static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { return -EIO; } static const char *bad_inode_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return ERR_PTR(-EIO); } static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu) { return ERR_PTR(-EIO); } static int bad_inode_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { return -EIO; } static int bad_inode_update_time(struct inode *inode, int flags) { return -EIO; } static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, struct file *file, unsigned int open_flag, umode_t create_mode) { return -EIO; } static int bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return -EIO; } static int bad_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; } static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, .lookup = bad_inode_lookup, .link = bad_inode_link, .unlink = bad_inode_unlink, .symlink = bad_inode_symlink, .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, .rename = bad_inode_rename2, .readlink = bad_inode_readlink, .permission = bad_inode_permission, .getattr = bad_inode_getattr, .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, .tmpfile = bad_inode_tmpfile, .set_acl = bad_inode_set_acl, }; /* * When a filesystem is unable to read an inode due to an I/O error in * its read_inode() function, it can call make_bad_inode() to return a * set of stubs which will return EIO errors as required. * * We only need to do limited initialisation: all other fields are * preinitialised to zero automatically. */ /** * make_bad_inode - mark an inode bad due to an I/O error * @inode: Inode to mark bad * * When an inode cannot be read due to a media or remote network * failure this function makes the inode "bad" and causes I/O operations * on it to fail from this point on. */ void make_bad_inode(struct inode *inode) { remove_inode_hash(inode); inode->i_mode = S_IFREG; simple_inode_init_ts(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; } EXPORT_SYMBOL(make_bad_inode); /* * This tests whether an inode has been flagged as bad. The test uses * &bad_inode_ops to cover the case of invalidated inodes as well as * those created by make_bad_inode() above. */ /** * is_bad_inode - is an inode errored * @inode: inode to test * * Returns true if the inode in question has been marked as bad. */ bool is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } EXPORT_SYMBOL(is_bad_inode); /** * iget_failed - Mark an under-construction inode as dead and release it * @inode: The inode to discard * * Mark an under-construction inode as dead and release it. */ void iget_failed(struct inode *inode) { make_bad_inode(inode); unlock_new_inode(inode); iput(inode); } EXPORT_SYMBOL(iget_failed); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Queue of folios definitions * * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See: * * Documentation/core-api/folio_queue.rst * * for a description of the API. */ #ifndef _LINUX_FOLIO_QUEUE_H #define _LINUX_FOLIO_QUEUE_H #include <linux/pagevec.h> /* * Segment in a queue of running buffers. Each segment can hold a number of * folios and a portion of the queue can be referenced with the ITER_FOLIOQ * iterator. The possibility exists of inserting non-folio elements into the * queue (such as gaps). * * Explicit prev and next pointers are used instead of a list_head to make it * easier to add segments to tail and remove them from the head without the * need for a lock. */ struct folio_queue { struct folio_batch vec; /* Folios in the queue segment */ u8 orders[PAGEVEC_SIZE]; /* Order of each folio */ struct folio_queue *next; /* Next queue segment or NULL */ struct folio_queue *prev; /* Previous queue segment of NULL */ unsigned long marks; /* 1-bit mark per folio */ unsigned long marks2; /* Second 1-bit mark per folio */ unsigned long marks3; /* Third 1-bit mark per folio */ #if PAGEVEC_SIZE > BITS_PER_LONG #error marks is not big enough #endif unsigned int rreq_id; unsigned int debug_id; }; /** * folioq_init - Initialise a folio queue segment * @folioq: The segment to initialise * @rreq_id: The request identifier to use in tracelines. * * Initialise a folio queue segment and set an identifier to be used in traces. * * Note that the folio pointers are left uninitialised. */ static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id) { folio_batch_init(&folioq->vec); folioq->next = NULL; folioq->prev = NULL; folioq->marks = 0; folioq->marks2 = 0; folioq->marks3 = 0; folioq->rreq_id = rreq_id; folioq->debug_id = 0; } /** * folioq_nr_slots: Query the capacity of a folio queue segment * @folioq: The segment to query * * Query the number of folios that a particular folio queue segment might hold. * [!] NOTE: This must not be assumed to be the same for every segment! */ static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) { return PAGEVEC_SIZE; } /** * folioq_count: Query the occupancy of a folio queue segment * @folioq: The segment to query * * Query the number of folios that have been added to a folio queue segment. * Note that this is not decreased as folios are removed from a segment. */ static inline unsigned int folioq_count(struct folio_queue *folioq) { return folio_batch_count(&folioq->vec); } /** * folioq_full: Query if a folio queue segment is full * @folioq: The segment to query * * Query if a folio queue segment is fully occupied. Note that this does not * change if folios are removed from a segment. */ static inline bool folioq_full(struct folio_queue *folioq) { //return !folio_batch_space(&folioq->vec); return folioq_count(folioq) >= folioq_nr_slots(folioq); } /** * folioq_is_marked: Check first folio mark in a folio queue segment * @folioq: The segment to query * @slot: The slot number of the folio to query * * Determine if the first mark is set for the folio in the specified slot in a * folio queue segment. */ static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot) { return test_bit(slot, &folioq->marks); } /** * folioq_mark: Set the first mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Set the first mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot) { set_bit(slot, &folioq->marks); } /** * folioq_unmark: Clear the first mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Clear the first mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot) { clear_bit(slot, &folioq->marks); } /** * folioq_is_marked2: Check second folio mark in a folio queue segment * @folioq: The segment to query * @slot: The slot number of the folio to query * * Determine if the second mark is set for the folio in the specified slot in a * folio queue segment. */ static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot) { return test_bit(slot, &folioq->marks2); } /** * folioq_mark2: Set the second mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Set the second mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot) { set_bit(slot, &folioq->marks2); } /** * folioq_unmark2: Clear the second mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Clear the second mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot) { clear_bit(slot, &folioq->marks2); } /** * folioq_is_marked3: Check third folio mark in a folio queue segment * @folioq: The segment to query * @slot: The slot number of the folio to query * * Determine if the third mark is set for the folio in the specified slot in a * folio queue segment. */ static inline bool folioq_is_marked3(const struct folio_queue *folioq, unsigned int slot) { return test_bit(slot, &folioq->marks3); } /** * folioq_mark3: Set the third mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Set the third mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_mark3(struct folio_queue *folioq, unsigned int slot) { set_bit(slot, &folioq->marks3); } /** * folioq_unmark3: Clear the third mark on a folio in a folio queue segment * @folioq: The segment to modify * @slot: The slot number of the folio to modify * * Clear the third mark for the folio in the specified slot in a folio queue * segment. */ static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot) { clear_bit(slot, &folioq->marks3); } static inline unsigned int __folio_order(struct folio *folio) { if (!folio_test_large(folio)) return 0; return folio->_flags_1 & 0xff; } /** * folioq_append: Add a folio to a folio queue segment * @folioq: The segment to add to * @folio: The folio to add * * Add a folio to the tail of the sequence in a folio queue segment, increasing * the occupancy count and returning the slot number for the folio just added. * The folio size is extracted and stored in the queue and the marks are left * unmodified. * * Note that it's left up to the caller to check that the segment capacity will * not be exceeded and to extend the queue. */ static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio) { unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; folioq->orders[slot] = __folio_order(folio); return slot; } /** * folioq_append_mark: Add a folio to a folio queue segment * @folioq: The segment to add to * @folio: The folio to add * * Add a folio to the tail of the sequence in a folio queue segment, increasing * the occupancy count and returning the slot number for the folio just added. * The folio size is extracted and stored in the queue, the first mark is set * and and the second and third marks are left unmodified. * * Note that it's left up to the caller to check that the segment capacity will * not be exceeded and to extend the queue. */ static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio) { unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; folioq->orders[slot] = __folio_order(folio); folioq_mark(folioq, slot); return slot; } /** * folioq_folio: Get a folio from a folio queue segment * @folioq: The segment to access * @slot: The folio slot to access * * Retrieve the folio in the specified slot from a folio queue segment. Note * that no bounds check is made and if the slot hasn't been added into yet, the * pointer will be undefined. If the slot has been cleared, NULL will be * returned. */ static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot) { return folioq->vec.folios[slot]; } /** * folioq_folio_order: Get the order of a folio from a folio queue segment * @folioq: The segment to access * @slot: The folio slot to access * * Retrieve the order of the folio in the specified slot from a folio queue * segment. Note that no bounds check is made and if the slot hasn't been * added into yet, the order returned will be 0. */ static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot) { return folioq->orders[slot]; } /** * folioq_folio_size: Get the size of a folio from a folio queue segment * @folioq: The segment to access * @slot: The folio slot to access * * Retrieve the size of the folio in the specified slot from a folio queue * segment. Note that no bounds check is made and if the slot hasn't been * added into yet, the size returned will be PAGE_SIZE. */ static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot) { return PAGE_SIZE << folioq_folio_order(folioq, slot); } /** * folioq_clear: Clear a folio from a folio queue segment * @folioq: The segment to clear * @slot: The folio slot to clear * * Clear a folio from a sequence in a folio queue segment and clear its marks. * The occupancy count is left unchanged. */ static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot) { folioq->vec.folios[slot] = NULL; folioq_unmark(folioq, slot); folioq_unmark2(folioq, slot); folioq_unmark3(folioq, slot); } #endif /* _LINUX_FOLIO_QUEUE_H */ |
132 132 132 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 | // SPDX-License-Identifier: GPL-2.0-or-later /* client.c: NFS client sharing and management code * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/xprtsock.h> #include <linux/sunrpc/xprtrdma.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> #include <linux/slab.h> #include <linux/idr.h> #include <net/ipv6.h> #include <linux/nfs_xdr.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include <linux/nfslocalio.h> #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "iostat.h" #include "internal.h" #include "fscache.h" #include "pnfs.h" #include "nfs.h" #include "netns.h" #include "sysfs.h" #include "nfs42.h" #define NFSDBG_FACILITY NFSDBG_CLIENT static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); static DEFINE_RWLOCK(nfs_version_lock); static struct nfs_subversion *nfs_version_mods[5] = { [2] = NULL, [3] = NULL, [4] = NULL, }; /* * RPC cruft for NFS */ static const struct rpc_version *nfs_version[5] = { [2] = NULL, [3] = NULL, [4] = NULL, }; const struct rpc_program nfs_program = { .name = "nfs", .number = NFS_PROGRAM, .nrvers = ARRAY_SIZE(nfs_version), .version = nfs_version, .pipe_dir_name = NFS_PIPE_DIRNAME, }; static struct nfs_subversion *__find_nfs_version(unsigned int version) { struct nfs_subversion *nfs; read_lock(&nfs_version_lock); nfs = nfs_version_mods[version]; read_unlock(&nfs_version_lock); return nfs; } struct nfs_subversion *find_nfs_version(unsigned int version) { struct nfs_subversion *nfs = __find_nfs_version(version); if (!nfs && request_module("nfsv%d", version) == 0) nfs = __find_nfs_version(version); if (!nfs) return ERR_PTR(-EPROTONOSUPPORT); if (!get_nfs_version(nfs)) return ERR_PTR(-EAGAIN); return nfs; } int get_nfs_version(struct nfs_subversion *nfs) { return try_module_get(nfs->owner); } EXPORT_SYMBOL_GPL(get_nfs_version); void put_nfs_version(struct nfs_subversion *nfs) { module_put(nfs->owner); } void register_nfs_version(struct nfs_subversion *nfs) { write_lock(&nfs_version_lock); nfs_version_mods[nfs->rpc_ops->version] = nfs; nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(register_nfs_version); void unregister_nfs_version(struct nfs_subversion *nfs) { write_lock(&nfs_version_lock); nfs_version[nfs->rpc_ops->version] = NULL; nfs_version_mods[nfs->rpc_ops->version] = NULL; write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(unregister_nfs_version); /* * Allocate a shared client record * * Since these are allocated/deallocated very rarely, we don't * bother putting them in a slab cache... */ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp; int err = -ENOMEM; if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) goto error_0; clp->cl_minorversion = cl_init->minorversion; clp->cl_nfs_mod = cl_init->nfs_mod; if (!get_nfs_version(clp->cl_nfs_mod)) goto error_dealloc; clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; refcount_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); clp->cl_addrlen = cl_init->addrlen; if (cl_init->hostname) { err = -ENOMEM; clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); if (!clp->cl_hostname) goto error_cleanup; } INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_flags = cl_init->init_flags; clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; clp->cl_net = get_net(cl_init->net); #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); ktime_get_real_ts64(&clp->cl_nfssvc_boot); nfs_uuid_init(&clp->cl_uuid); INIT_WORK(&clp->cl_local_probe_work, nfs_local_probe_async_work); #endif /* CONFIG_NFS_LOCALIO */ clp->cl_principal = "*"; clp->cl_xprtsec = cl_init->xprtsec; return clp; error_cleanup: put_nfs_version(clp->cl_nfs_mod); error_dealloc: kfree(clp); error_0: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(nfs_alloc_client); #if IS_ENABLED(CONFIG_NFS_V4) static void nfs_cleanup_cb_ident_idr(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); idr_destroy(&nn->cb_ident_idr); } /* nfs_client_lock held */ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); if (clp->cl_cb_ident) idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident); } static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); } #else static void nfs_cleanup_cb_ident_idr(struct net *net) { } static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { } static void pnfs_init_server(struct nfs_server *server) { } #endif /* CONFIG_NFS_V4 */ /* * Destroy a shared client record */ void nfs_free_client(struct nfs_client *clp) { nfs_localio_disable_client(clp); /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); put_net(clp->cl_net); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); kfree_rcu(clp, rcu); } EXPORT_SYMBOL_GPL(nfs_free_client); /* * Release a reference to a shared client record */ void nfs_put_client(struct nfs_client *clp) { struct nfs_net *nn; if (!clp) return; nn = net_generic(clp->cl_net, nfs_net_id); if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { list_del(&clp->cl_share_link); nfs_cb_idr_remove_locked(clp); spin_unlock(&nn->nfs_client_lock); WARN_ON_ONCE(!list_empty(&clp->cl_superblocks)); clp->rpc_ops->free_client(clp); } } EXPORT_SYMBOL_GPL(nfs_put_client); /* * Find an nfs_client on the list that matches the initialisation data * that is supplied. */ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; const struct sockaddr *sap = (struct sockaddr *)data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); int error; again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; /* If a client is still initializing then we need to wait */ if (clp->cl_cons_state > NFS_CS_READY) { refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); error = nfs_wait_client_init_complete(clp); nfs_put_client(clp); spin_lock(&nn->nfs_client_lock); if (error < 0) return ERR_PTR(error); goto again; } /* Different NFS versions cannot share the same nfs_client */ if (clp->rpc_ops != data->nfs_mod->rpc_ops) continue; if (clp->cl_proto != data->proto) continue; /* Match nfsv4 minorversion */ if (clp->cl_minorversion != data->minorversion) continue; /* Match request for a dedicated DS */ if (test_bit(NFS_CS_DS, &data->init_flags) != test_bit(NFS_CS_DS, &clp->cl_flags)) continue; /* Match the full socket address */ if (!rpc_cmp_addr_port(sap, clap)) /* Match all xprt_switch full socket addresses */ if (IS_ERR(clp->cl_rpcclient) || !rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient, sap)) continue; /* Match the xprt security policy */ if (clp->cl_xprtsec.policy != data->xprtsec.policy) continue; refcount_inc(&clp->cl_count); return clp; } return NULL; } /* * Return true if @clp is done initializing, false if still working on it. * * Use nfs_client_init_status to check if it was successful. */ bool nfs_client_init_is_complete(const struct nfs_client *clp) { return clp->cl_cons_state <= NFS_CS_READY; } EXPORT_SYMBOL_GPL(nfs_client_init_is_complete); /* * Return 0 if @clp was successfully initialized, -errno otherwise. * * This must be called *after* nfs_client_init_is_complete() returns true, * otherwise it will pop WARN_ON_ONCE and return -EINVAL */ int nfs_client_init_status(const struct nfs_client *clp) { /* called without checking nfs_client_init_is_complete */ if (clp->cl_cons_state > NFS_CS_READY) { WARN_ON_ONCE(1); return -EINVAL; } return clp->cl_cons_state; } EXPORT_SYMBOL_GPL(nfs_client_init_status); int nfs_wait_client_init_complete(const struct nfs_client *clp) { return wait_event_killable(nfs_client_active_wq, nfs_client_init_is_complete(clp)); } EXPORT_SYMBOL_GPL(nfs_wait_client_init_complete); /* * Found an existing client. Make sure it's ready before returning. */ static struct nfs_client * nfs_found_client(const struct nfs_client_initdata *cl_init, struct nfs_client *clp) { int error; error = nfs_wait_client_init_complete(clp); if (error < 0) { nfs_put_client(clp); return ERR_PTR(-ERESTARTSYS); } if (clp->cl_cons_state < NFS_CS_READY) { error = clp->cl_cons_state; nfs_put_client(clp); return ERR_PTR(error); } smp_rmb(); return clp; } /* * Look up a client by IP address and protocol version * - creates a new record if one doesn't yet exist */ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp, *new = NULL; struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops; if (cl_init->hostname == NULL) { WARN_ON(1); return ERR_PTR(-EINVAL); } /* see if the client already exists */ do { spin_lock(&nn->nfs_client_lock); clp = nfs_match_client(cl_init); if (clp) { spin_unlock(&nn->nfs_client_lock); if (new) new->rpc_ops->free_client(new); if (IS_ERR(clp)) return clp; return nfs_found_client(cl_init, clp); } if (new) { list_add_tail(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new = rpc_ops->init_client(new, cl_init); if (!IS_ERR(new)) nfs_local_probe(new); return new; } spin_unlock(&nn->nfs_client_lock); new = rpc_ops->alloc_client(cl_init); } while (!IS_ERR(new)); return new; } EXPORT_SYMBOL_GPL(nfs_get_client); /* * Mark a server as ready or failed */ void nfs_mark_client_ready(struct nfs_client *clp, int state) { smp_wmb(); clp->cl_cons_state = state; wake_up_all(&nfs_client_active_wq); } EXPORT_SYMBOL_GPL(nfs_mark_client_ready); /* * Initialise the timeout values for a connection */ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans) { to->to_initval = timeo * HZ / 10; to->to_retries = retrans; switch (proto) { case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_TCP_TLS: case XPRT_TRANSPORT_RDMA: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; to->to_increment = to->to_initval; to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); if (to->to_maxval > NFS_MAX_TCP_TIMEOUT) to->to_maxval = NFS_MAX_TCP_TIMEOUT; if (to->to_maxval < to->to_initval) to->to_maxval = to->to_initval; to->to_exponential = 0; break; case XPRT_TRANSPORT_UDP: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_UDP_RETRANS; if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_UDP_TIMEOUT) to->to_initval = NFS_MAX_UDP_TIMEOUT; to->to_maxval = NFS_MAX_UDP_TIMEOUT; to->to_exponential = 1; break; default: BUG(); } } EXPORT_SYMBOL_GPL(nfs_init_timeout_values); /* * Create an RPC client handle */ int nfs_create_rpc_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init, rpc_authflavor_t flavor) { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { .net = clp->cl_net, .protocol = clp->cl_proto, .nconnect = clp->cl_nconnect, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, .timeout = cl_init->timeparms, .servername = clp->cl_hostname, .nodename = cl_init->nodename, .program = &nfs_program, .stats = &nn->rpcstats, .version = clp->rpc_ops->version, .authflavor = flavor, .cred = cl_init->cred, .xprtsec = cl_init->xprtsec, .connect_timeout = cl_init->connect_timeout, .reconnect_timeout = cl_init->reconnect_timeout, }; if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_DISCRTRY; if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT; if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS; if (test_bit(NFS_CS_NOPING, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NOPING; if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_REUSEPORT; if (!IS_ERR(clp->cl_rpcclient)) return 0; clnt = rpc_create(&args); if (IS_ERR(clnt)) { dprintk("%s: cannot create RPC client. Error = %ld\n", __func__, PTR_ERR(clnt)); return PTR_ERR(clnt); } clnt->cl_principal = clp->cl_principal; clp->cl_rpcclient = clnt; clnt->cl_max_connect = clp->cl_max_connect; return 0; } EXPORT_SYMBOL_GPL(nfs_create_rpc_client); /* * Version 2 or 3 client destruction */ static void nfs_destroy_server(struct nfs_server *server) { if (server->nlm_host) nlmclnt_done(server->nlm_host); } /* * Version 2 or 3 lockd setup */ static int nfs_start_lockd(struct nfs_server *server) { struct nlm_host *host; struct nfs_client *clp = server->nfs_client; struct nlmclnt_initdata nlm_init = { .hostname = clp->cl_hostname, .address = (struct sockaddr *)&clp->cl_addr, .addrlen = clp->cl_addrlen, .nfs_version = clp->rpc_ops->version, .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 1 : 0, .net = clp->cl_net, .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, .cred = server->cred, }; if (nlm_init.nfs_version > 3) return 0; if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) && (server->flags & NFS_MOUNT_LOCAL_FCNTL)) return 0; switch (clp->cl_proto) { default: nlm_init.protocol = IPPROTO_TCP; break; #ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT case XPRT_TRANSPORT_UDP: nlm_init.protocol = IPPROTO_UDP; #endif } host = nlmclnt_init(&nlm_init); if (IS_ERR(host)) return PTR_ERR(host); server->nlm_host = host; server->destroy = nfs_destroy_server; nfs_sysfs_link_rpc_client(server, nlmclnt_rpc_clnt(host), NULL); return 0; } /* * Create a general RPC client */ int nfs_init_server_rpcclient(struct nfs_server *server, const struct rpc_timeout *timeo, rpc_authflavor_t pseudoflavour) { struct nfs_client *clp = server->nfs_client; server->client = rpc_clone_client_set_auth(clp->cl_rpcclient, pseudoflavour); if (IS_ERR(server->client)) { dprintk("%s: couldn't create rpc_client!\n", __func__); return PTR_ERR(server->client); } memcpy(&server->client->cl_timeout_default, timeo, sizeof(server->client->cl_timeout_default)); server->client->cl_timeout = &server->client->cl_timeout_default; server->client->cl_softrtry = 0; if (server->flags & NFS_MOUNT_SOFTERR) server->client->cl_softerr = 1; if (server->flags & NFS_MOUNT_SOFT) server->client->cl_softrtry = 1; nfs_sysfs_link_rpc_client(server, server->client, NULL); return 0; } EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); /** * nfs_init_client - Initialise an NFS2 or NFS3 client * * @clp: nfs_client to initialise * @cl_init: Initialisation parameters * * Returns pointer to an NFS client, or an ERR_PTR value. */ struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init) { int error; /* the client is already initialised */ if (clp->cl_cons_state == NFS_CS_READY) return clp; /* * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); nfs_mark_client_ready(clp, error == 0 ? NFS_CS_READY : error); if (error < 0) { nfs_put_client(clp); clp = ERR_PTR(error); } return clp; } EXPORT_SYMBOL_GPL(nfs_init_client); /* * Create a version 2 or 3 client */ static int nfs_init_server(struct nfs_server *server, const struct fs_context *fc) { const struct nfs_fs_context *ctx = nfs_fc2context(fc); struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { .hostname = ctx->nfs_server.hostname, .addr = &ctx->nfs_server._address, .addrlen = ctx->nfs_server.addrlen, .nfs_mod = ctx->nfs_mod, .proto = ctx->nfs_server.protocol, .net = fc->net_ns, .timeparms = &timeparms, .cred = server->cred, .nconnect = ctx->nfs_server.nconnect, .init_flags = (1UL << NFS_CS_REUSEPORT), .xprtsec = ctx->xprtsec, }; struct nfs_client *clp; int error; nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol, ctx->timeo, ctx->retrans); if (ctx->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) return PTR_ERR(clp); server->nfs_client = clp; nfs_sysfs_add_server(server); nfs_sysfs_link_rpc_client(server, clp->cl_rpcclient, "_state"); /* Initialise the client representation from the mount data */ server->flags = ctx->flags; server->options = ctx->options; server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; switch (clp->rpc_ops->version) { case 2: server->fattr_valid = NFS_ATTR_FATTR_V2; break; case 3: server->fattr_valid = NFS_ATTR_FATTR_V3; break; default: server->fattr_valid = NFS_ATTR_FATTR_V4; } if (ctx->rsize) server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto); if (ctx->wsize) server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto); server->acregmin = ctx->acregmin * HZ; server->acregmax = ctx->acregmax * HZ; server->acdirmin = ctx->acdirmin * HZ; server->acdirmax = ctx->acdirmax * HZ; /* Start lockd here, before we might error out */ error = nfs_start_lockd(server); if (error < 0) goto error; server->port = ctx->nfs_server.port; server->auth_info = ctx->auth_info; error = nfs_init_server_rpcclient(server, &timeparms, ctx->selected_flavor); if (error < 0) goto error; /* Preserve the values of mount_server-related mount options */ if (ctx->mount_server.addrlen) { memcpy(&server->mountd_address, &ctx->mount_server.address, ctx->mount_server.addrlen); server->mountd_addrlen = ctx->mount_server.addrlen; } server->mountd_version = ctx->mount_server.version; server->mountd_port = ctx->mount_server.port; server->mountd_protocol = ctx->mount_server.protocol; server->namelen = ctx->namlen; return 0; error: server->nfs_client = NULL; nfs_put_client(clp); return error; } /* * Load up the server record from information gained in an fsinfo record */ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) { struct nfs_client *clp = server->nfs_client; unsigned long max_rpc_payload, raw_max_rpc_payload; /* Work out a lot of parameters */ if (server->rsize == 0) server->rsize = nfs_io_size(fsinfo->rtpref, clp->cl_proto); if (server->wsize == 0) server->wsize = nfs_io_size(fsinfo->wtpref, clp->cl_proto); if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax) server->rsize = nfs_io_size(fsinfo->rtmax, clp->cl_proto); if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) server->wsize = nfs_io_size(fsinfo->wtmax, clp->cl_proto); raw_max_rpc_payload = rpc_max_payload(server->client); max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL); if (server->rsize > max_rpc_payload) server->rsize = max_rpc_payload; if (server->rsize > NFS_MAX_FILE_IO_SIZE) server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT; server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); if (server->dtsize > NFS_MAX_FILE_IO_SIZE) server->dtsize = NFS_MAX_FILE_IO_SIZE; if (server->dtsize > server->rsize) server->dtsize = server->rsize; if (server->flags & NFS_MOUNT_NOAC) { server->acregmin = server->acregmax = 0; server->acdirmin = server->acdirmax = 0; } server->maxfilesize = fsinfo->maxfilesize; server->time_delta = fsinfo->time_delta; server->change_attr_type = fsinfo->change_attr_type; server->clone_blksize = fsinfo->clone_blksize; /* We're airborne Set socket buffersize */ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); #ifdef CONFIG_NFS_V4_2 /* * Defaults until limited by the session parameters. */ server->gxasize = min_t(unsigned int, raw_max_rpc_payload, XATTR_SIZE_MAX); server->sxasize = min_t(unsigned int, raw_max_rpc_payload, XATTR_SIZE_MAX); server->lxasize = min_t(unsigned int, raw_max_rpc_payload, nfs42_listxattr_xdrsize(XATTR_LIST_MAX)); if (fsinfo->xattr_support) server->caps |= NFS_CAP_XATTR; #endif } /* * Probe filesystem information, including the FSID on v2/v3 */ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) { struct nfs_fsinfo fsinfo; struct nfs_client *clp = server->nfs_client; int error; if (clp->rpc_ops->set_capabilities != NULL) { error = clp->rpc_ops->set_capabilities(server, mntfh); if (error < 0) return error; } fsinfo.fattr = fattr; fsinfo.nlayouttypes = 0; memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) return error; nfs_server_set_fsinfo(server, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { struct nfs_pathconf pathinfo; pathinfo.fattr = fattr; nfs_fattr_init(fattr); if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0) server->namelen = pathinfo.max_namelen; } if (clp->rpc_ops->discover_trunking != NULL && (server->caps & NFS_CAP_FS_LOCATIONS && (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) { error = clp->rpc_ops->discover_trunking(server, mntfh); if (error < 0) return error; } return 0; } /* * Grab the destination's particulars, including lease expiry time. * * Returns zero if probe succeeded and retrieved FSID matches the FSID * we have cached. */ int nfs_probe_server(struct nfs_server *server, struct nfs_fh *mntfh) { struct nfs_fattr *fattr; int error; fattr = nfs_alloc_fattr(); if (fattr == NULL) return -ENOMEM; /* Sanity: the probe won't work if the destination server * does not recognize the migrated FH. */ error = nfs_probe_fsinfo(server, mntfh, fattr); nfs_free_fattr(fattr); return error; } EXPORT_SYMBOL_GPL(nfs_probe_server); /* * Copy useful information when duplicating a server record */ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) { target->flags = source->flags; target->rsize = source->rsize; target->wsize = source->wsize; target->acregmin = source->acregmin; target->acregmax = source->acregmax; target->acdirmin = source->acdirmin; target->acdirmax = source->acdirmax; target->caps = source->caps; target->options = source->options; target->auth_info = source->auth_info; target->port = source->port; } EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); void nfs_server_insert_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); list_add_tail(&server->master_link, &nn->nfs_volume_list); clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); spin_unlock(&nn->nfs_client_lock); } EXPORT_SYMBOL_GPL(nfs_server_insert_lists); void nfs_server_remove_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn; if (clp == NULL) return; nn = net_generic(clp->cl_net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_del_rcu(&server->client_link); if (list_empty(&clp->cl_superblocks)) set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); list_del(&server->master_link); spin_unlock(&nn->nfs_client_lock); synchronize_rcu(); } EXPORT_SYMBOL_GPL(nfs_server_remove_lists); static DEFINE_IDA(s_sysfs_ids); /* * Allocate and initialise a server record */ struct nfs_server *nfs_alloc_server(void) { struct nfs_server *server; server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); if (!server) return NULL; server->s_sysfs_id = ida_alloc(&s_sysfs_ids, GFP_KERNEL); if (server->s_sysfs_id < 0) { kfree(server); return NULL; } server->client = server->client_acl = ERR_PTR(-EINVAL); /* Zero out the NFS state stuff */ INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); INIT_LIST_HEAD(&server->delegations); INIT_LIST_HEAD(&server->layouts); INIT_LIST_HEAD(&server->state_owners_lru); INIT_LIST_HEAD(&server->ss_copies); INIT_LIST_HEAD(&server->ss_src_copies); atomic_set(&server->active, 0); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { kfree(server); return NULL; } server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; init_waitqueue_head(&server->write_congestion_wait); atomic_long_set(&server->writeback, 0); atomic64_set(&server->owner_ctr, 0); pnfs_init_server(server); rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); return server; } EXPORT_SYMBOL_GPL(nfs_alloc_server); static void delayed_free(struct rcu_head *p) { struct nfs_server *server = container_of(p, struct nfs_server, rcu); nfs_free_iostats(server->io_stats); kfree(server); } /* * Free up a server record */ void nfs_free_server(struct nfs_server *server) { nfs_server_remove_lists(server); if (server->destroy != NULL) server->destroy(server); if (!IS_ERR(server->client_acl)) rpc_shutdown_client(server->client_acl); if (!IS_ERR(server->client)) rpc_shutdown_client(server->client); nfs_put_client(server->nfs_client); if (server->kobj.state_initialized) { nfs_sysfs_remove_server(server); kobject_put(&server->kobj); } ida_free(&s_sysfs_ids, server->s_sysfs_id); put_cred(server->cred); nfs_release_automount_timer(); call_rcu(&server->rcu, delayed_free); } EXPORT_SYMBOL_GPL(nfs_free_server); /* * Create a version 2 or 3 volume record * - keyed on server and FSID */ struct nfs_server *nfs_create_server(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_server *server; struct nfs_fattr *fattr; int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); server->cred = get_cred(fc->cred); error = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) goto error; /* Get a client representation */ error = nfs_init_server(server, fc); if (error < 0) goto error; /* Probe the root fh to retrieve its FSID */ error = nfs_probe_fsinfo(server, ctx->mntfh, fattr); if (error < 0) goto error; if (server->nfs_client->rpc_ops->version == 3) { if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) server->namelen = NFS3_MAXNAMLEN; if (!(ctx->flags & NFS_MOUNT_NORDIRPLUS)) server->caps |= NFS_CAP_READDIRPLUS; } else { if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } if (!(fattr->valid & NFS_ATTR_FATTR)) { error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, fattr, NULL); if (error < 0) { dprintk("nfs_create_server: getattr error = %d\n", -error); goto error; } } memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); nfs_server_insert_lists(server); server->mount_time = jiffies; nfs_free_fattr(fattr); return server; error: nfs_free_fattr(fattr); nfs_free_server(server); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_create_server); /* * Clone an NFS2, NFS3 or NFS4 server record */ struct nfs_server *nfs_clone_server(struct nfs_server *source, struct nfs_fh *fh, struct nfs_fattr *fattr, rpc_authflavor_t flavor) { struct nfs_server *server; int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); server->cred = get_cred(source->cred); /* Copy data from the source */ server->nfs_client = source->nfs_client; server->destroy = source->destroy; refcount_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); server->fsid = fattr->fsid; nfs_sysfs_add_server(server); nfs_sysfs_link_rpc_client(server, server->nfs_client->cl_rpcclient, "_state"); error = nfs_init_server_rpcclient(server, source->client->cl_timeout, flavor); if (error < 0) goto out_free_server; /* probe the filesystem info for this server filesystem */ error = nfs_probe_server(server, fh); if (error < 0) goto out_free_server; if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; error = nfs_start_lockd(server); if (error < 0) goto out_free_server; nfs_server_insert_lists(server); server->mount_time = jiffies; return server; out_free_server: nfs_free_server(server); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_clone_server); void nfs_clients_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); INIT_LIST_HEAD(&nn->nfs_client_list); INIT_LIST_HEAD(&nn->nfs_volume_list); #if IS_ENABLED(CONFIG_NFS_V4) idr_init(&nn->cb_ident_idr); #endif spin_lock_init(&nn->nfs_client_lock); nn->boot_time = ktime_get_real(); memset(&nn->rpcstats, 0, sizeof(nn->rpcstats)); nn->rpcstats.program = &nfs_program; nfs_netns_sysfs_setup(nn, net); } void nfs_clients_exit(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); nfs_netns_sysfs_destroy(nn); nfs_cleanup_cb_ident_idr(net); WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); } #ifdef CONFIG_PROC_FS static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_server_list_stop(struct seq_file *p, void *v); static int nfs_server_list_show(struct seq_file *m, void *v); static const struct seq_operations nfs_server_list_ops = { .start = nfs_server_list_start, .next = nfs_server_list_next, .stop = nfs_server_list_stop, .show = nfs_server_list_show, }; static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_volume_list_stop(struct seq_file *p, void *v); static int nfs_volume_list_show(struct seq_file *m, void *v); static const struct seq_operations nfs_volume_list_ops = { .start = nfs_volume_list_start, .next = nfs_volume_list_next, .stop = nfs_volume_list_stop, .show = nfs_volume_list_show, }; /* * set up the iterator to start reading from the server list and return the first item */ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); return seq_list_start_head(&nn->nfs_client_list, *_pos); } /* * move to next server */ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_client_list, pos); } /* * clean up after reading from the transports list */ static void nfs_server_list_stop(struct seq_file *p, void *v) __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } /* * display a header line followed by a load of call lines */ static int nfs_server_list_show(struct seq_file *m, void *v) { struct nfs_client *clp; struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_client_list) { seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); return 0; } /* display one transport per line on subsequent lines */ clp = list_entry(v, struct nfs_client, cl_share_link); /* Check if the client is initialized */ if (clp->cl_cons_state != NFS_CS_READY) return 0; rcu_read_lock(); seq_printf(m, "v%u %s %s %3d %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), refcount_read(&clp->cl_count), clp->cl_hostname); rcu_read_unlock(); return 0; } /* * set up the iterator to start reading from the volume list and return the first item */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); return seq_list_start_head(&nn->nfs_volume_list, *_pos); } /* * move to next volume */ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_volume_list, pos); } /* * clean up after reading from the transports list */ static void nfs_volume_list_stop(struct seq_file *p, void *v) __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } /* * display a header line followed by a load of call lines */ static int nfs_volume_list_show(struct seq_file *m, void *v) { struct nfs_server *server; struct nfs_client *clp; char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0' char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0' struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_volume_list) { seq_puts(m, "NV SERVER PORT DEV FSID" " FSC\n"); return 0; } /* display one transport per line on subsequent lines */ server = list_entry(v, struct nfs_server, master_link); clp = server->nfs_client; snprintf(dev, sizeof(dev), "%u:%u", MAJOR(server->s_dev), MINOR(server->s_dev)); snprintf(fsid, sizeof(fsid), "%llx:%llx", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); rcu_read_lock(); seq_printf(m, "v%u %s %s %-12s %-33s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), dev, fsid, nfs_server_fscache_state(server)); rcu_read_unlock(); return 0; } int nfs_fs_proc_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); struct proc_dir_entry *p; nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net); if (!nn->proc_nfsfs) goto error_0; /* a file of servers with which we're dealing */ p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs, &nfs_server_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; /* a file of volumes that we have mounted */ p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs, &nfs_volume_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; return 0; error_1: remove_proc_subtree("nfsfs", net->proc_net); error_0: return -ENOMEM; } void nfs_fs_proc_net_exit(struct net *net) { remove_proc_subtree("nfsfs", net->proc_net); } /* * initialise the /proc/fs/nfsfs/ directory */ int __init nfs_fs_proc_init(void) { if (!proc_mkdir("fs/nfsfs", NULL)) goto error_0; /* a file of servers with which we're dealing */ if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers")) goto error_1; /* a file of volumes that we have mounted */ if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes")) goto error_1; return 0; error_1: remove_proc_subtree("fs/nfsfs", NULL); error_0: return -ENOMEM; } /* * clean up the /proc/fs/nfsfs/ directory */ void nfs_fs_proc_exit(void) { remove_proc_subtree("fs/nfsfs", NULL); ida_destroy(&s_sysfs_ids); } #endif /* CONFIG_PROC_FS */ |
98 31 70 363 362 10 323 19 29 30 67 105 1219 1082 450 264 4 161 105 105 265 875 1087 18 1003 309 5 5 962 895 16 243 956 963 980 7 6 1 1 1268 182 1104 53 1073 158 68 69 2 8 59 19 134 12 2 2 10 4 129 72 1132 1255 1260 869 194 702 411 297 371 300 300 6 295 190 186 187 21 20 2 7 20 20 7 20 7 7 20 20 10 10 2 20 2 20 20 21 21 9 11 1 9 5 12 12 3 12 3 3 7 205 205 205 201 201 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 | // SPDX-License-Identifier: GPL-2.0 /* * SUCS NET3: * * Generic datagram handling routines. These are generic for all * protocols. Possibly a generic IP version on top of these would * make sense. Not tonight however 8-). * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and * NetROM layer all have identical poll code and mostly * identical recvmsg() code. So we share it here. The poll was * shared before but buried in udp.c so I moved it. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old * udp.c code) * * Fixes: * Alan Cox : NULL return from skb_peek_copy() * understood * Alan Cox : Rewrote skb_read_datagram to avoid the * skb_peek_copy stuff. * Alan Cox : Added support for SOCK_SEQPACKET. * IPX can no longer use the SO_TYPE hack * but AX.25 now works right, and SPX is * feasible. * Alan Cox : Fixed write poll of non IP protocol * crash. * Florian La Roche: Changed for my new skbuff handling. * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET. * Linus Torvalds : BSD semantic fixes. * Alan Cox : Datagram iovec handling * Darryl Miles : Fixed non-blocking SOCK_STREAM. * Alan Cox : POSIXisms * Pete Wyckoff : Unconnected accept() fix. * */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/uaccess.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/poll.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/iov_iter.h> #include <linux/indirect_call_wrapper.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/checksum.h> #include <net/sock.h> #include <net/tcp_states.h> #include <trace/events/skb.h> #include <net/busy_poll.h> #include <crypto/hash.h> /* * Is a socket 'connection oriented' ? */ static inline int connection_based(struct sock *sk) { return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { /* * Avoid a wakeup if event not interesting for us */ if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR))) return 0; return autoremove_wake_function(wait, mode, sync, key); } /* * Wait for the last received packet to be different from skb */ int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb) { int error; DEFINE_WAIT_FUNC(wait, receiver_wake_function); prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* Socket errors? */ error = sock_error(sk); if (error) goto out_err; if (READ_ONCE(queue->prev) != skb) goto out; /* Socket shut down? */ if (sk->sk_shutdown & RCV_SHUTDOWN) goto out_noerr; /* Sequenced packets can come disconnected. * If so we report the problem */ error = -ENOTCONN; if (connection_based(sk) && !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN)) goto out_err; /* handle signals */ if (signal_pending(current)) goto interrupted; error = 0; *timeo_p = schedule_timeout(*timeo_p); out: finish_wait(sk_sleep(sk), &wait); return error; interrupted: error = sock_intr_errno(*timeo_p); out_err: *err = error; goto out; out_noerr: *err = 0; error = 1; goto out; } EXPORT_SYMBOL(__skb_wait_for_more_packets); static struct sk_buff *skb_set_peeked(struct sk_buff *skb) { struct sk_buff *nskb; if (skb->peeked) return skb; /* We have to unshare an skb before modifying it. */ if (!skb_shared(skb)) goto done; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return ERR_PTR(-ENOMEM); skb->prev->next = nskb; skb->next->prev = nskb; nskb->prev = skb->prev; nskb->next = skb->next; consume_skb(skb); skb = nskb; done: skb->peeked = 1; return skb; } struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) { bool peek_at_off = false; struct sk_buff *skb; int _off = 0; if (unlikely(flags & MSG_PEEK && *off >= 0)) { peek_at_off = true; _off = *off; } *last = queue->prev; skb_queue_walk(queue, skb) { if (flags & MSG_PEEK) { if (peek_at_off && _off >= skb->len && (_off || skb->peeked)) { _off -= skb->len; continue; } if (!skb->len) { skb = skb_set_peeked(skb); if (IS_ERR(skb)) { *err = PTR_ERR(skb); return NULL; } } refcount_inc(&skb->users); } else { __skb_unlink(skb, queue); } *off = _off; return skb; } return NULL; } /** * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket * @queue: socket queue from which to receive * @flags: MSG\_ flags * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts * @err: error code returned * @last: set to last peeked message to inform the wait function * what to look for when peeking * * Get a datagram skbuff, understands the peeking, nonblocking wakeups * and possible races. This replaces identical code in packet, raw and * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes * the long standing peek and read race for datagram sockets. If you * alter this routine remember it must be re-entrant. * * This function will lock the socket if a skb is returned, so * the caller needs to unlock the socket in that case (usually by * calling skb_free_datagram). Returns NULL with @err set to * -EAGAIN if no data was available or to some other value if an * error was detected. * * * It does not lock socket since today. This function is * * free of race conditions. This measure should/can improve * * significantly datagram socket latencies at high loads, * * when data copying to user space takes lots of time. * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet * * 8) Great win.) * * --ANK (980729) * * The order of the tests when we find no data waiting are specified * quite explicitly by POSIX 1003.1g, don't change them without having * the standard around please. */ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) { struct sk_buff *skb; unsigned long cpu_flags; /* * Caller is allowed not to check sk->sk_err before skb_recv_datagram() */ int error = sock_error(sk); if (error) goto no_packet; do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. * * Look at current nfs client by the way... * However, this function was correct in any case. 8) */ spin_lock_irqsave(&queue->lock, cpu_flags); skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) goto no_packet; if (skb) return skb; if (!sk_can_busy_loop(sk)) break; sk_busy_loop(sk, flags & MSG_DONTWAIT); } while (READ_ONCE(queue->prev) != *last); error = -EAGAIN; no_packet: *err = error; return NULL; } EXPORT_SYMBOL(__skb_try_recv_datagram); struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err) { struct sk_buff *skb, *last; long timeo; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err, &last); if (skb) return skb; if (*err != -EAGAIN) break; } while (timeo && !__skb_wait_for_more_packets(sk, sk_queue, err, &timeo, last)); return NULL; } EXPORT_SYMBOL(__skb_recv_datagram); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err) { int off = 0; return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { consume_skb(skb); } EXPORT_SYMBOL(skb_free_datagram); int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)) { int err = 0; if (flags & MSG_PEEK) { err = -ENOENT; spin_lock_bh(&sk_queue->lock); if (skb->next) { __skb_unlink(skb, sk_queue); refcount_dec(&skb->users); if (destructor) destructor(sk, skb); err = 0; } spin_unlock_bh(&sk_queue->lock); } atomic_inc(&sk->sk_drops); return err; } EXPORT_SYMBOL(__sk_queue_drop_skb); /** * skb_kill_datagram - Free a datagram skbuff forcibly * @sk: socket * @skb: datagram skbuff * @flags: MSG\_ flags * * This function frees a datagram skbuff that was received by * skb_recv_datagram. The flags argument must match the one * used for skb_recv_datagram. * * If the MSG_PEEK flag is set, and the packet is still on the * receive queue of the socket, it will be taken off the queue * before it is freed. * * This function currently only disables BH when acquiring the * sk_receive_queue lock. Therefore it must not be used in a * context where that lock is acquired in an IRQ context. * * It returns 0 if the packet was removed by us. */ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) { int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags, NULL); kfree_skb(skb); return err; } EXPORT_SYMBOL(skb_kill_datagram); INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i)); static int __skb_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, bool fault_short, size_t (*cb)(const void *, size_t, void *, struct iov_iter *), void *data) { int start = skb_headlen(skb); int i, copy = start - offset, start_off = offset, n; struct sk_buff *frag_iter; /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; n = INDIRECT_CALL_1(cb, simple_copy_to_iter, skb->data + offset, copy, data, to); offset += n; if (n != copy) goto short_copy; if ((len -= copy) == 0) return 0; } if (!skb_frags_readable(skb)) goto short_copy; /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { u32 p_off, p_len, copied; struct page *p; u8 *vaddr; if (copy > len) copy = len; n = 0; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_local_page(p); n += INDIRECT_CALL_1(cb, simple_copy_to_iter, vaddr + p_off, p_len, data, to); kunmap_local(vaddr); } offset += n; if (n != copy) goto short_copy; if (!(len -= copy)) return 0; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (__skb_datagram_iter(frag_iter, offset - start, to, copy, fault_short, cb, data)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } start = end; } if (!len) return 0; /* This is not really a user copy fault, but rather someone * gave us a bogus length on the skb. We should probably * print a warning here as it may indicate a kernel bug. */ fault: iov_iter_revert(to, offset - start_off); return -EFAULT; short_copy: if (fault_short || iov_iter_count(to)) goto fault; return 0; } static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, struct iov_iter *i) { #ifdef CONFIG_CRYPTO_HASH struct ahash_request *hash = hashp; struct scatterlist sg; size_t copied; copied = copy_to_iter(addr, bytes, i); sg_init_one(&sg, addr, copied); ahash_request_set_crypt(hash, &sg, NULL, copied); crypto_ahash_update(hash); return copied; #else return 0; #endif } /** * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator * and update a hash. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec * @hash: hash request to update */ int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, struct ahash_request *hash) { return __skb_datagram_iter(skb, offset, to, len, true, hash_and_copy_to_iter, hash); } EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i) { return copy_to_iter(addr, bytes, i); } /** * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec */ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len) { trace_skb_copy_datagram_iovec(skb, len); return __skb_datagram_iter(skb, offset, to, len, false, simple_copy_to_iter, NULL); } EXPORT_SYMBOL(skb_copy_datagram_iter); /** * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter. * @skb: buffer to copy * @offset: offset in the buffer to start copying to * @from: the copy source * @len: amount of data to copy to buffer from iovec * * Returns 0 or -EFAULT. */ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; if (copy_from_iter(skb->data + offset, copy, from) != copy) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { size_t copied; if (copy > len) copy = len; copied = copy_page_from_iter(skb_frag_page(frag), skb_frag_off(frag) + offset - start, copy, from); if (copied != copy) goto fault; if (!(len -= copy)) return 0; offset += copy; } start = end; } skb_walk_frags(skb, frag_iter) { int end; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { if (copy > len) copy = len; if (skb_copy_datagram_from_iter(frag_iter, offset - start, from, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; } start = end; } if (!len) return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_copy_datagram_from_iter); int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length) { int frag = skb_shinfo(skb)->nr_frags; if (!skb_frags_readable(skb)) return -EFAULT; while (length && iov_iter_count(from)) { struct page *head, *last_head = NULL; struct page *pages[MAX_SKB_FRAGS]; int refs, order, n = 0; size_t start; ssize_t copied; if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; copied = iov_iter_get_pages2(from, pages, length, MAX_SKB_FRAGS - frag, &start); if (copied < 0) return -EFAULT; length -= copied; skb->data_len += copied; skb->len += copied; skb->truesize += PAGE_ALIGN(copied + start); head = compound_head(pages[n]); order = compound_order(head); for (refs = 0; copied != 0; start = 0) { int size = min_t(int, copied, PAGE_SIZE - start); if (pages[n] - head > (1UL << order) - 1) { head = compound_head(pages[n]); order = compound_order(head); } start += (pages[n] - head) << PAGE_SHIFT; copied -= size; n++; if (frag) { skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1]; if (head == skb_frag_page(last) && start == skb_frag_off(last) + skb_frag_size(last)) { skb_frag_size_add(last, size); /* We combined this page, we need to release * a reference. Since compound pages refcount * is shared among many pages, batch the refcount * adjustments to limit false sharing. */ last_head = head; refs++; continue; } } if (refs) { page_ref_sub(last_head, refs); refs = 0; } skb_fill_page_desc_noacc(skb, frag++, head, start, size); } if (refs) page_ref_sub(last_head, refs); } return 0; } int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length) { unsigned long orig_size = skb->truesize; unsigned long truesize; int ret; if (msg && msg->msg_ubuf && msg->sg_from_iter) ret = msg->sg_from_iter(skb, from, length); else ret = zerocopy_fill_skb_from_iter(skb, from, length); truesize = skb->truesize - orig_size; if (sk && sk->sk_type == SOCK_STREAM) { sk_wmem_queued_add(sk, truesize); if (!skb_zcopy_pure(skb)) sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } return ret; } EXPORT_SYMBOL(__zerocopy_sg_from_iter); /** * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter * @skb: buffer to copy * @from: the source to copy from * * The function will first copy up to headlen, and then pin the userspace * pages and build frags through them. * * Returns 0, -EFAULT or -EMSGSIZE. */ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) { int copy = min_t(int, skb_headlen(skb), iov_iter_count(from)); /* copy up to skb headlen */ if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); } EXPORT_SYMBOL(zerocopy_sg_from_iter); static __always_inline size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { __wsum next, *csum = priv2; next = csum_and_copy_to_user(from + progress, iter_to, len); *csum = csum_block_add(*csum, next, progress); return next ? 0 : len; } static __always_inline size_t memcpy_to_iter_csum(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { __wsum *csum = priv2; __wsum next = csum_partial_copy_nocheck(from + progress, iter_to, len); *csum = csum_block_add(*csum, next, progress); return 0; } struct csum_state { __wsum csum; size_t off; }; static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, struct iov_iter *i) { struct csum_state *csstate = _csstate; __wsum sum; if (WARN_ON_ONCE(i->data_source)) return 0; if (unlikely(iov_iter_is_discard(i))) { // can't use csum_memcpy() for that one - data is not copied csstate->csum = csum_block_add(csstate->csum, csum_partial(addr, bytes, 0), csstate->off); csstate->off += bytes; return bytes; } sum = csum_shift(csstate->csum, csstate->off); bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum, copy_to_user_iter_csum, memcpy_to_iter_csum); csstate->csum = csum_shift(sum, csstate->off); csstate->off += bytes; return bytes; } /** * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator * and update a checksum. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec * @csump: checksum pointer */ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, __wsum *csump) { struct csum_state csdata = { .csum = *csump }; int ret; ret = __skb_datagram_iter(skb, offset, to, len, true, csum_and_copy_to_iter, &csdata); if (ret) return ret; *csump = csdata.csum; return 0; } /** * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. * @skb: skbuff * @hlen: hardware length * @msg: destination * * Caller _must_ check that skb will fit to this iovec. * * Returns: 0 - success. * -EINVAL - checksum failure. * -EFAULT - fault during copy. */ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg) { __wsum csum; int chunk = skb->len - hlen; if (!chunk) return 0; if (msg_data_left(msg) < chunk) { if (__skb_checksum_complete(skb)) return -EINVAL; if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) goto fault; } else { csum = csum_partial(skb->data, hlen, skb->csum); if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, chunk, &csum)) goto fault; if (csum_fold(csum)) { iov_iter_revert(&msg->msg_iter, chunk); return -EINVAL; } if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(NULL, skb); } return 0; fault: return -EFAULT; } EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** * datagram_poll - generic datagram poll * @file: file struct * @sock: socket * @wait: poll table * * Datagram poll: Again totally generic. This also handles * sequenced packet sockets providing the socket receive queue * is only ever holding data ready to receive. * * Note: when you *don't* use this routine for this protocol, * and you use a different write policy from sock_writeable() * then please supply your own write_space callback. */ __poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask; u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (connection_based(sk)) { int state = READ_ONCE(sk->sk_state); if (state == TCP_CLOSE) mask |= EPOLLHUP; /* connection hasn't started yet? */ if (state == TCP_SYN_SENT) return mask; } /* writable? */ if (sock_writeable(sk)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } EXPORT_SYMBOL(datagram_poll); |
24 8 12 1 1 1 2 1 4 6 3 29 1 1 9 17 1 26 1 1 1 12 2 9 2 9 7 8 2 19 18 1 6 3 3 3 5 1 4 2 8 1 1 1 2 2 2 2 2 2 1 3 4 4 2 132 132 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_pedit.c Generic packet editor * * Authors: Jamal Hadi Salim (2002-4) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/slab.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_pedit.h> #include <net/tc_act/tc_pedit.h> #include <uapi/linux/tc_act/tc_pedit.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, [TCA_PEDIT_PARMS_EX] = { .len = sizeof(struct tc_pedit) }, [TCA_PEDIT_KEYS_EX] = { .type = NLA_NESTED }, }; static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = { [TCA_PEDIT_KEY_EX_HTYPE] = NLA_POLICY_MAX(NLA_U16, TCA_PEDIT_HDR_TYPE_MAX), [TCA_PEDIT_KEY_EX_CMD] = NLA_POLICY_MAX(NLA_U16, TCA_PEDIT_CMD_MAX), }; static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, u8 n, struct netlink_ext_ack *extack) { struct tcf_pedit_key_ex *keys_ex; struct tcf_pedit_key_ex *k; const struct nlattr *ka; int err = -EINVAL; int rem; if (!nla) return NULL; keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL); if (!keys_ex) return ERR_PTR(-ENOMEM); k = keys_ex; nla_for_each_nested(ka, nla, rem) { struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1]; if (!n) { NL_SET_ERR_MSG_MOD(extack, "Can't parse more extended keys than requested"); err = -EINVAL; goto err_out; } n--; if (nla_type(ka) != TCA_PEDIT_KEY_EX) { NL_SET_ERR_MSG_ATTR(extack, ka, "Unknown attribute, expected extended key"); err = -EINVAL; goto err_out; } err = nla_parse_nested_deprecated(tb, TCA_PEDIT_KEY_EX_MAX, ka, pedit_key_ex_policy, NULL); if (err) goto err_out; if (NL_REQ_ATTR_CHECK(extack, nla, tb, TCA_PEDIT_KEY_EX_HTYPE)) { NL_SET_ERR_MSG(extack, "Missing required attribute"); err = -EINVAL; goto err_out; } if (NL_REQ_ATTR_CHECK(extack, nla, tb, TCA_PEDIT_KEY_EX_CMD)) { NL_SET_ERR_MSG(extack, "Missing required attribute"); err = -EINVAL; goto err_out; } k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]); k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]); k++; } if (n) { NL_SET_ERR_MSG_MOD(extack, "Not enough extended keys to parse"); err = -EINVAL; goto err_out; } return keys_ex; err_out: kfree(keys_ex); return ERR_PTR(err); } static int tcf_pedit_key_ex_dump(struct sk_buff *skb, struct tcf_pedit_key_ex *keys_ex, int n) { struct nlattr *keys_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEYS_EX); if (!keys_start) goto nla_failure; for (; n > 0; n--) { struct nlattr *key_start; key_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEY_EX); if (!key_start) goto nla_failure; if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) || nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) goto nla_failure; nla_nest_end(skb, key_start); keys_ex++; } nla_nest_end(skb, keys_start); return 0; nla_failure: nla_nest_cancel(skb, keys_start); return -EINVAL; } static void tcf_pedit_cleanup_rcu(struct rcu_head *head) { struct tcf_pedit_parms *parms = container_of(head, struct tcf_pedit_parms, rcu); kfree(parms->tcfp_keys_ex); kfree(parms->tcfp_keys); kfree(parms); } static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; struct tcf_pedit_parms *oparms, *nparms; struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; struct nlattr *pattr; struct tcf_pedit *p; int ret = 0, err; int i, ksize; u32 index; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); if (err < 0) return err; pattr = tb[TCA_PEDIT_PARMS]; if (!pattr) pattr = tb[TCA_PEDIT_PARMS_EX]; if (!pattr) { NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute"); return -EINVAL; } parm = nla_data(pattr); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_pedit_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { ret = -EEXIST; goto out_release; } } else { return err; } if (!parm->nkeys) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); ret = -EINVAL; goto out_release; } ksize = parm->nkeys * sizeof(struct tc_pedit_key); if (nla_len(pattr) < sizeof(*parm) + ksize) { NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); ret = -EINVAL; goto out_release; } nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); if (!nparms) { ret = -ENOMEM; goto out_release; } nparms->tcfp_keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys, extack); if (IS_ERR(nparms->tcfp_keys_ex)) { ret = PTR_ERR(nparms->tcfp_keys_ex); goto out_free; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) { ret = err; goto out_free_ex; } nparms->tcfp_off_max_hint = 0; nparms->tcfp_flags = parm->flags; nparms->tcfp_nkeys = parm->nkeys; nparms->tcfp_keys = kmemdup(parm->keys, ksize, GFP_KERNEL); if (!nparms->tcfp_keys) { ret = -ENOMEM; goto put_chain; } for (i = 0; i < nparms->tcfp_nkeys; ++i) { u32 offmask = nparms->tcfp_keys[i].offmask; u32 cur = nparms->tcfp_keys[i].off; /* The AT option can be added to static offsets in the datapath */ if (!offmask && cur % 4) { NL_SET_ERR_MSG_MOD(extack, "Offsets must be on 32bit boundaries"); ret = -EINVAL; goto out_free_keys; } /* sanitize the shift value for any later use */ nparms->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1, nparms->tcfp_keys[i].shift); /* The AT option can read a single byte, we can bound the actual * value with uchar max. */ cur += (0xff & offmask) >> nparms->tcfp_keys[i].shift; /* Each key touches 4 bytes starting from the computed offset */ nparms->tcfp_off_max_hint = max(nparms->tcfp_off_max_hint, cur + 4); } p = to_pedit(*a); spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(p->parms, nparms, 1); spin_unlock_bh(&p->tcf_lock); if (oparms) call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; out_free_keys: kfree(nparms->tcfp_keys); put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); out_free_ex: kfree(nparms->tcfp_keys_ex); out_free: kfree(nparms); out_release: tcf_idr_release(*a, bind); return ret; } static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; parms = rcu_dereference_protected(p->parms, 1); if (parms) call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu); } static bool offset_valid(struct sk_buff *skb, int offset) { if (offset > 0 && offset > skb->len) return false; if (offset < 0 && -offset > skb_headroom(skb)) return false; return true; } static int pedit_l4_skb_offset(struct sk_buff *skb, int *hoffset, const int header_type) { const int noff = skb_network_offset(skb); int ret = -EINVAL; struct iphdr _iph; switch (skb->protocol) { case htons(ETH_P_IP): { const struct iphdr *iph = skb_header_pointer(skb, noff, sizeof(_iph), &_iph); if (!iph) goto out; *hoffset = noff + iph->ihl * 4; ret = 0; break; } case htons(ETH_P_IPV6): ret = ipv6_find_hdr(skb, hoffset, header_type, NULL, NULL) == header_type ? 0 : -EINVAL; break; } out: return ret; } static int pedit_skb_hdr_offset(struct sk_buff *skb, enum pedit_header_type htype, int *hoffset) { int ret = -EINVAL; /* 'htype' is validated in the netlink parsing */ switch (htype) { case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH: if (skb_mac_header_was_set(skb)) { *hoffset = skb_mac_offset(skb); ret = 0; } break; case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK: case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4: case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6: *hoffset = skb_network_offset(skb); ret = 0; break; case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP: ret = pedit_l4_skb_offset(skb, hoffset, IPPROTO_TCP); break; case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP: ret = pedit_l4_skb_offset(skb, hoffset, IPPROTO_UDP); break; default: break; } return ret; } TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_key_ex *tkey_ex; struct tcf_pedit_parms *parms; struct tc_pedit_key *tkey; u32 max_offset; int i; parms = rcu_dereference_bh(p->parms); max_offset = (skb_transport_header_was_set(skb) ? skb_transport_offset(skb) : skb_network_offset(skb)) + parms->tcfp_off_max_hint; if (skb_ensure_writable(skb, min(skb->len, max_offset))) goto done; tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); tkey = parms->tcfp_keys; tkey_ex = parms->tcfp_keys_ex; for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) { int offset = tkey->off; int hoffset = 0; u32 *ptr, hdata; u32 val; int rc; if (tkey_ex) { htype = tkey_ex->htype; cmd = tkey_ex->cmd; tkey_ex++; } rc = pedit_skb_hdr_offset(skb, htype, &hoffset); if (rc) { pr_info_ratelimited("tc action pedit unable to extract header offset for header type (0x%x)\n", htype); goto bad; } if (tkey->offmask) { u8 *d, _d; if (!offset_valid(skb, hoffset + tkey->at)) { pr_info_ratelimited("tc action pedit 'at' offset %d out of bounds\n", hoffset + tkey->at); goto bad; } d = skb_header_pointer(skb, hoffset + tkey->at, sizeof(_d), &_d); if (!d) goto bad; offset += (*d & tkey->offmask) >> tkey->shift; if (offset % 4) { pr_info_ratelimited("tc action pedit offset must be on 32 bit boundaries\n"); goto bad; } } if (!offset_valid(skb, hoffset + offset)) { pr_info_ratelimited("tc action pedit offset %d out of bounds\n", hoffset + offset); goto bad; } ptr = skb_header_pointer(skb, hoffset + offset, sizeof(hdata), &hdata); if (!ptr) goto bad; /* just do it, baby */ switch (cmd) { case TCA_PEDIT_KEY_EX_CMD_SET: val = tkey->val; break; case TCA_PEDIT_KEY_EX_CMD_ADD: val = (*ptr + tkey->val) & ~tkey->mask; break; default: pr_info_ratelimited("tc action pedit bad command (%d)\n", cmd); goto bad; } *ptr = ((*ptr & tkey->mask) ^ val); if (ptr == &hdata) skb_store_bits(skb, hoffset + offset, ptr, 4); } goto done; bad: tcf_action_inc_overlimit_qstats(&p->common); done: return p->tcf_action; } static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_pedit *d = to_pedit(a); struct tcf_t *tm = &d->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; spin_lock_bh(&p->tcf_lock); parms = rcu_dereference_protected(p->parms, 1); s = struct_size(opt, keys, parms->tcfp_nkeys); opt = kzalloc(s, GFP_ATOMIC); if (unlikely(!opt)) { spin_unlock_bh(&p->tcf_lock); return -ENOBUFS; } opt->nkeys = parms->tcfp_nkeys; memcpy(opt->keys, parms->tcfp_keys, flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; opt->flags = parms->tcfp_flags; opt->action = p->tcf_action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; if (parms->tcfp_keys_ex) { if (tcf_pedit_key_ex_dump(skb, parms->tcfp_keys_ex, parms->tcfp_nkeys)) goto nla_put_failure; if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt)) goto nla_put_failure; } else { if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) goto nla_put_failure; } tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; spin_unlock_bh(&p->tcf_lock); kfree(opt); return skb->len; nla_put_failure: spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); kfree(opt); return -1; } static int tcf_pedit_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; int k; for (k = 0; k < tcf_pedit_nkeys(act); k++) { switch (tcf_pedit_cmd(act, k)) { case TCA_PEDIT_KEY_EX_CMD_SET: entry->id = FLOW_ACTION_MANGLE; break; case TCA_PEDIT_KEY_EX_CMD_ADD: entry->id = FLOW_ACTION_ADD; break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); return -EOPNOTSUPP; } entry->mangle.htype = tcf_pedit_htype(act, k); entry->mangle.mask = tcf_pedit_mask(act, k); entry->mangle.val = tcf_pedit_val(act, k); entry->mangle.offset = tcf_pedit_offset(act, k); entry->hw_stats = tc_act_hw_stats(act->hw_stats); entry++; } *index_inc = k; } else { struct flow_offload_action *fl_action = entry_data; u32 cmd = tcf_pedit_cmd(act, 0); int k; switch (cmd) { case TCA_PEDIT_KEY_EX_CMD_SET: fl_action->id = FLOW_ACTION_MANGLE; break; case TCA_PEDIT_KEY_EX_CMD_ADD: fl_action->id = FLOW_ACTION_ADD; break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); return -EOPNOTSUPP; } for (k = 1; k < tcf_pedit_nkeys(act); k++) { if (cmd != tcf_pedit_cmd(act, k)) { NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); return -EOPNOTSUPP; } } } return 0; } static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .id = TCA_ID_PEDIT, .owner = THIS_MODULE, .act = tcf_pedit_act, .stats_update = tcf_pedit_stats_update, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, .offload_act_setup = tcf_pedit_offload_act_setup, .size = sizeof(struct tcf_pedit), }; MODULE_ALIAS_NET_ACT("pedit"); static __net_init int pedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); return tc_action_net_init(net, tn, &act_pedit_ops); } static void __net_exit pedit_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_pedit_ops.net_id); } static struct pernet_operations pedit_net_ops = { .init = pedit_init_net, .exit_batch = pedit_exit_net, .id = &act_pedit_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); MODULE_DESCRIPTION("Generic Packet Editor actions"); MODULE_LICENSE("GPL"); static int __init pedit_init_module(void) { return tcf_register_action(&act_pedit_ops, &pedit_net_ops); } static void __exit pedit_cleanup_module(void) { tcf_unregister_action(&act_pedit_ops, &pedit_net_ops); } module_init(pedit_init_module); module_exit(pedit_cleanup_module); |
17 2554 2551 17 18 17 18 18 18 18 18 18 12 6 12 6 18 18 18 18 6 6 6 6 2 2 12 12 11 12 7 5 4 4 4 4 4 4 17 17 17 17 17 17 17 17 17 12 12 12 3473 3465 6 6 6 6 6 6 6 6 6 6 6 6 6 6 2555 2551 2558 2554 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 | // SPDX-License-Identifier: GPL-2.0 /* * Common Block IO controller cgroup interface * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> * * For policy-specific per-blkcg data: * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> * Arianna Avanzini <avanzini.arianna@gmail.com> */ #include <linux/ioprio.h> #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/err.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/delay.h> #include <linux/atomic.h> #include <linux/ctype.h> #include <linux/resume_user_mode.h> #include <linux/psi.h> #include <linux/part_stat.h> #include "blk.h" #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-throttle.h" static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu); /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. * blkcg_pol_register_mutex nests outside of it and synchronizes entire * policy [un]register operations including cgroup file additions / * removals. Putting cgroup file registration outside blkcg_pol_mutex * allows grabbing it from cgroup callbacks. */ static DEFINE_MUTEX(blkcg_pol_register_mutex); static DEFINE_MUTEX(blkcg_pol_mutex); struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; EXPORT_SYMBOL_GPL(blkcg_root_css); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ bool blkcg_debug_stats = false; static DEFINE_RAW_SPINLOCK(blkg_stat_lock); #define BLKG_DESTROY_BATCH_SIZE 64 /* * Lockless lists for tracking IO stats update * * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg). * There are multiple blkg's (one for each block device) attached to each * blkcg. The rstat code keeps track of which cpu has IO stats updated, * but it doesn't know which blkg has the updated stats. If there are many * block devices in a system, the cost of iterating all the blkg's to flush * out the IO stats can be high. To reduce such overhead, a set of percpu * lockless lists (lhead) per blkcg are used to track the set of recently * updated iostat_cpu's since the last flush. An iostat_cpu will be put * onto the lockless list on the update side [blk_cgroup_bio_start()] if * not there yet and then removed when being flushed [blkcg_rstat_flush()]. * References to blkg are gotten and then put back in the process to * protect against blkg removal. * * Return: 0 if successful or -ENOMEM if allocation fails. */ static int init_blkcg_llists(struct blkcg *blkcg) { int cpu; blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL); if (!blkcg->lhead) return -ENOMEM; for_each_possible_cpu(cpu) init_llist_head(per_cpu_ptr(blkcg->lhead, cpu)); return 0; } /** * blkcg_css - find the current css * * Find the css associated with either the kthread or the current task. * This may return a dying css, so it is up to the caller to use tryget logic * to confirm it is alive and well. */ static struct cgroup_subsys_state *blkcg_css(void) { struct cgroup_subsys_state *css; css = kthread_blkcg(); if (css) return css; return task_css(current, io_cgrp_id); } static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { return pol && test_bit(pol->plid, q->blkcg_pols); } static void blkg_free_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, free_work); struct request_queue *q = blkg->q; int i; /* * pd_free_fn() can also be called from blkcg_deactivate_policy(), * in order to make sure pd_free_fn() is called in order, the deletion * of the list blkg->q_node is delayed to here from blkg_destroy(), and * blkcg_mutex is used to synchronize blkg_free_workfn() and * blkcg_deactivate_policy(). */ mutex_lock(&q->blkcg_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); if (blkg->parent) blkg_put(blkg->parent); spin_lock_irq(&q->queue_lock); list_del_init(&blkg->q_node); spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex); blk_put_queue(q); free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); } /** * blkg_free - free a blkg * @blkg: blkg to free * * Free @blkg which may be partially allocated. */ static void blkg_free(struct blkcg_gq *blkg) { if (!blkg) return; /* * Both ->pd_free_fn() and request queue's release handler may * sleep, so free us by scheduling one work func */ INIT_WORK(&blkg->free_work, blkg_free_workfn); schedule_work(&blkg->free_work); } static void __blkg_release(struct rcu_head *rcu) { struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); struct blkcg *blkcg = blkg->blkcg; int cpu; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO WARN_ON(!bio_list_empty(&blkg->async_bios)); #endif /* * Flush all the non-empty percpu lockless lists before releasing * us, given these stat belongs to us. * * blkg_stat_lock is for serializing blkg stat update */ for_each_possible_cpu(cpu) __blkcg_rstat_flush(blkcg, cpu); /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); blkg_free(blkg); } /* * A group is RCU protected, but having an rcu lock does not mean that one * can access all the fields of blkg and assume these are valid. For * example, don't try to follow throtl_data and request queue links. * * Having a reference to blkg under an rcu allows accesses to only values * local to groups like group stats and group rate limits. */ static void blkg_release(struct percpu_ref *ref) { struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); call_rcu(&blkg->rcu_head, __blkg_release); } #ifdef CONFIG_BLK_CGROUP_PUNT_BIO static struct workqueue_struct *blkcg_punt_bio_wq; static void blkg_async_bio_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, async_bio_work); struct bio_list bios = BIO_EMPTY_LIST; struct bio *bio; struct blk_plug plug; bool need_plug = false; /* as long as there are pending bios, @blkg can't go away */ spin_lock(&blkg->async_bio_lock); bio_list_merge_init(&bios, &blkg->async_bios); spin_unlock(&blkg->async_bio_lock); /* start plug only when bio_list contains at least 2 bios */ if (bios.head && bios.head->bi_next) { need_plug = true; blk_start_plug(&plug); } while ((bio = bio_list_pop(&bios))) submit_bio(bio); if (need_plug) blk_finish_plug(&plug); } /* * When a shared kthread issues a bio for a cgroup, doing so synchronously can * lead to priority inversions as the kthread can be trapped waiting for that * cgroup. Use this helper instead of submit_bio to punt the actual issuing to * a dedicated per-blkcg work item to avoid such priority inversions. */ void blkcg_punt_bio_submit(struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; if (blkg->parent) { spin_lock(&blkg->async_bio_lock); bio_list_add(&blkg->async_bios, bio); spin_unlock(&blkg->async_bio_lock); queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); } else { /* never bounce for the root cgroup */ submit_bio(bio); } } EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit); static int __init blkcg_punt_bio_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND | WQ_SYSFS, 0); if (!blkcg_punt_bio_wq) return -ENOMEM; return 0; } subsys_initcall(blkcg_punt_bio_init); #endif /* CONFIG_BLK_CGROUP_PUNT_BIO */ /** * bio_blkcg_css - return the blkcg CSS associated with a bio * @bio: target bio * * This returns the CSS for the blkcg associated with a bio, or %NULL if not * associated. Callers are expected to either handle %NULL or know association * has been done prior to calling this. */ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { if (!bio || !bio->bi_blkg) return NULL; return &bio->bi_blkg->blkcg->css; } EXPORT_SYMBOL_GPL(bio_blkcg_css); /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest * * Return the parent blkcg of @blkcg. Can be called anytime. */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { return css_to_blkcg(blkcg->css.parent); } /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * * Allocate a new blkg associating @blkcg and @disk. */ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) { struct blkcg_gq *blkg; int i, cpu; /* alloc and init base part */ blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); if (!blkg) return NULL; if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) goto out_free_blkg; blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); if (!blkg->iostat_cpu) goto out_exit_refcnt; if (!blk_get_queue(disk->queue)) goto out_free_iostat; blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; blkg->iostat.blkg = blkg; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); #endif u64_stats_init(&blkg->iostat.sync); for_each_possible_cpu(cpu) { u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg; } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; if (!blkcg_policy_enabled(disk->queue, pol)) continue; /* alloc per-policy data and attach it to blkg */ pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask); if (!pd) goto out_free_pds; blkg->pd[i] = pd; pd->blkg = blkg; pd->plid = i; pd->online = false; } return blkg; out_free_pds: while (--i >= 0) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); blk_put_queue(disk->queue); out_free_iostat: free_percpu(blkg->iostat_cpu); out_exit_refcnt: percpu_ref_exit(&blkg->refcnt); out_free_blkg: kfree(blkg); return NULL; } /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int i, ret; lockdep_assert_held(&disk->queue->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ if (blk_queue_dying(disk->queue)) { ret = -ENODEV; goto err_free_blkg; } /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { ret = -ENODEV; goto err_free_blkg; } /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; } } blkg = new_blkg; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; goto err_put_css; } blkg_get(blkg->parent); } /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) pol->pd_init_fn(blkg->pd[i]); } /* insert */ spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); list_add(&blkg->q_node, &disk->queue->blkg_list); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i]) { if (pol->pd_online_fn) pol->pd_online_fn(blkg->pd[i]); blkg->pd[i]->online = true; } } } blkg->online = true; spin_unlock(&blkcg->lock); if (!ret) return blkg; /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); return ERR_PTR(ret); err_put_css: css_put(&blkcg->css); err_free_blkg: if (new_blkg) blkg_free(new_blkg); return ERR_PTR(ret); } /** * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @disk: gendisk of interest * * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and takes @disk->queue->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned long flags; WARN_ON_ONCE(!rcu_read_lock_held()); blkg = blkg_lookup(blkcg, q); if (blkg) return blkg; spin_lock_irqsave(&q->queue_lock, flags); blkg = blkg_lookup(blkcg, q); if (blkg) { if (blkcg != &blkcg_root && blkg != rcu_dereference(blkcg->blkg_hint)) rcu_assign_pointer(blkcg->blkg_hint, blkg); goto found; } /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. Returns the closest * blkg to the intended blkg should blkg_create() fail. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); struct blkcg_gq *ret_blkg = q->root_blkg; while (parent) { blkg = blkg_lookup(parent, q); if (blkg) { /* remember closest blkg */ ret_blkg = blkg; break; } pos = parent; parent = blkcg_parent(parent); } blkg = blkg_create(pos, disk, NULL); if (IS_ERR(blkg)) { blkg = ret_blkg; break; } if (pos == blkcg) break; } found: spin_unlock_irqrestore(&q->queue_lock, flags); return blkg; } static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; int i; lockdep_assert_held(&blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); /* * blkg stays on the queue list until blkg_free_workfn(), see details in * blkg_free_workfn(), hence this function can be called from * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before * blkg_free_workfn(). */ if (hlist_unhashed(&blkg->blkcg_node)) return; for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && blkg->pd[i]->online) { blkg->pd[i]->online = false; if (pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[i]); } } blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); hlist_del_init_rcu(&blkg->blkcg_node); /* * Both setting lookup hint to and clearing it from @blkg are done * under queue_lock. If it's not pointing to @blkg now, it never * will. Hint assignment itself can race safely. */ if (rcu_access_pointer(blkcg->blkg_hint) == blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ percpu_ref_kill(&blkg->refcnt); } static void blkg_destroy_all(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; int count = BLKG_DESTROY_BATCH_SIZE; int i; restart: spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; if (hlist_unhashed(&blkg->blkcg_node)) continue; spin_lock(&blkcg->lock); blkg_destroy(blkg); spin_unlock(&blkcg->lock); /* * in order to avoid holding the spin lock for too long, release * it when a batch of blkgs are destroyed. */ if (!(--count)) { count = BLKG_DESTROY_BATCH_SIZE; spin_unlock_irq(&q->queue_lock); cond_resched(); goto restart; } } /* * Mark policy deactivated since policy offline has been done, and * the free is scheduled, so future blkcg_deactivate_policy() can * be bypassed */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (pol) __clear_bit(pol->plid, q->blkcg_pols); } q->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); } static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] = src->bytes[i]; dst->ios[i] = src->ios[i]; } } static void __blkg_clear_stat(struct blkg_iostat_set *bis) { struct blkg_iostat cur = {0}; unsigned long flags; flags = u64_stats_update_begin_irqsave(&bis->sync); blkg_iostat_set(&bis->cur, &cur); blkg_iostat_set(&bis->last, &cur); u64_stats_update_end_irqrestore(&bis->sync, flags); } static void blkg_clear_stat(struct blkcg_gq *blkg) { int cpu; for_each_possible_cpu(cpu) { struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu); __blkg_clear_stat(s); } __blkg_clear_stat(&blkg->iostat); } static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; int i; mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); /* * Note that stat reset is racy - it doesn't synchronize against * stat updates. This is a debug feature which shouldn't exist * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { blkg_clear_stat(blkg); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_reset_stats_fn) pol->pd_reset_stats_fn(blkg->pd[i]); } } spin_unlock_irq(&blkcg->lock); mutex_unlock(&blkcg_pol_mutex); return 0; } const char *blkg_dev_name(struct blkcg_gq *blkg) { if (!blkg->q->disk) return NULL; return bdi_dev_name(blkg->q->disk->bdi); } /** * blkcg_print_blkgs - helper for printing per-blkg data * @sf: seq_file to print to * @blkcg: blkcg of interest * @prfill: fill function to print out a blkg * @pol: policy in question * @data: data to be passed to @prfill * @show_total: to print out sum of prfill return values or not * * This function invokes @prfill on each blkg of @blkcg if pd for the * policy specified by @pol exists. @prfill is invoked with @sf, the * policy data and @data and the matching queue lock held. If @show_total * is %true, the sum of the return values from @prfill is printed with * "Total" label at the end. * * This is to be used to construct print functions for * cftype->read_seq_string method. */ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total) { struct blkcg_gq *blkg; u64 total = 0; rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); if (blkcg_policy_enabled(blkg->q, pol)) total += prfill(sf, blkg->pd[pol->plid], data); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); if (show_total) seq_printf(sf, "Total %llu\n", (unsigned long long)total); } EXPORT_SYMBOL_GPL(blkcg_print_blkgs); /** * __blkg_prfill_u64 - prfill helper for a single u64 value * @sf: seq_file to print to * @pd: policy private data of interest * @v: value to print * * Print @v to @sf for the device associated with @pd. */ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { const char *dname = blkg_dev_name(pd->blkg); if (!dname) return 0; seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); return v; } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); /** * blkg_conf_init - initialize a blkg_conf_ctx * @ctx: blkg_conf_ctx to initialize * @input: input string * * Initialize @ctx which can be used to parse blkg config input string @input. * Once initialized, @ctx can be used with blkg_conf_open_bdev() and * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). */ void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) { *ctx = (struct blkg_conf_ctx){ .input = input }; } EXPORT_SYMBOL_GPL(blkg_conf_init); /** * blkg_conf_open_bdev - parse and open bdev for per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is * set to point past the device node prefix. * * This function may be called multiple times on @ctx and the extra calls become * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function * explicitly if bdev access is needed without resolving the blkcg / policy part * of @ctx->input. Returns -errno on error. */ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) { char *input = ctx->input; unsigned int major, minor; struct block_device *bdev; int key_len; if (ctx->bdev) return 0; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; input += key_len; if (!isspace(*input)) return -EINVAL; input = skip_spaces(input); bdev = blkdev_get_no_open(MKDEV(major, minor)); if (!bdev) return -ENODEV; if (bdev_is_partition(bdev)) { blkdev_put_no_open(bdev); return -ENODEV; } mutex_lock(&bdev->bd_queue->rq_qos_mutex); if (!disk_live(bdev->bd_disk)) { blkdev_put_no_open(bdev); mutex_unlock(&bdev->bd_queue->rq_qos_mutex); return -ENODEV; } ctx->body = input; ctx->bdev = bdev; return 0; } /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse per-blkg config update from @ctx->input and initialize @ctx * accordingly. On success, @ctx->body points to the part of @ctx->input * following MAJ:MIN, @ctx->bdev points to the target block device and * @ctx->blkg to the blkg being configured. * * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this * function returns with queue lock held and must be followed by * blkg_conf_exit(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx) __acquires(&bdev->bd_queue->queue_lock) { struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; int ret; ret = blkg_conf_open_bdev(ctx); if (ret) return ret; disk = ctx->bdev->bd_disk; q = disk->queue; /* * blkcg_deactivate_policy() requires queue to be frozen, we can grab * q_usage_counter to prevent concurrent with blkcg_deactivate_policy(). */ ret = blk_queue_enter(q, 0); if (ret) goto fail; spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { ret = -EOPNOTSUPP; goto fail_unlock; } blkg = blkg_lookup(blkcg, q); if (blkg) goto success; /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent; struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); while (parent && !blkg_lookup(parent, q)) { pos = parent; parent = blkcg_parent(parent); } /* Drop locks to do new blkg allocation with GFP_KERNEL. */ spin_unlock_irq(&q->queue_lock); new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto fail_exit_queue; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; goto fail_exit_queue; } spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { blkg_free(new_blkg); ret = -EOPNOTSUPP; goto fail_preloaded; } blkg = blkg_lookup(pos, q); if (blkg) { blkg_free(new_blkg); } else { blkg = blkg_create(pos, disk, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_preloaded; } } radix_tree_preload_end(); if (pos == blkcg) goto success; } success: blk_queue_exit(q); ctx->blkg = blkg; return 0; fail_preloaded: radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); fail_exit_queue: blk_queue_exit(q); fail: /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue * can be bypassing for some time and it's always nice to * avoid busy looping. */ if (ret == -EBUSY) { msleep(10); ret = restart_syscall(); } return ret; } EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_exit - clean up per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Clean up after per-blkg config update. This function must be called on all * blkg_conf_ctx's initialized with blkg_conf_init(). */ void blkg_conf_exit(struct blkg_conf_ctx *ctx) __releases(&ctx->bdev->bd_queue->queue_lock) __releases(&ctx->bdev->bd_queue->rq_qos_mutex) { if (ctx->blkg) { spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); ctx->blkg = NULL; } if (ctx->bdev) { mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); blkdev_put_no_open(ctx->bdev); ctx->body = NULL; ctx->bdev = NULL; } } EXPORT_SYMBOL_GPL(blkg_conf_exit); static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] += src->bytes[i]; dst->ios[i] += src->ios[i]; } } static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] -= src->bytes[i]; dst->ios[i] -= src->ios[i]; } } static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, struct blkg_iostat *last) { struct blkg_iostat delta; unsigned long flags; /* propagate percpu delta to global */ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&delta, cur); blkg_iostat_sub(&delta, last); blkg_iostat_add(&blkg->iostat.cur, &delta); blkg_iostat_add(last, &delta); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) { struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); struct llist_node *lnode; struct blkg_iostat_set *bisc, *next_bisc; unsigned long flags; rcu_read_lock(); lnode = llist_del_all(lhead); if (!lnode) goto out; /* * For covering concurrent parent blkg update from blkg_release(). * * When flushing from cgroup, cgroup_rstat_lock is always held, so * this lock won't cause contention most of time. */ raw_spin_lock_irqsave(&blkg_stat_lock, flags); /* * Iterate only the iostat_cpu's queued in the lockless list. */ llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) { struct blkcg_gq *blkg = bisc->blkg; struct blkcg_gq *parent = blkg->parent; struct blkg_iostat cur; unsigned int seq; /* * Order assignment of `next_bisc` from `bisc->lnode.next` in * llist_for_each_entry_safe and clearing `bisc->lqueued` for * avoiding to assign `next_bisc` with new next pointer added * in blk_cgroup_bio_start() in case of re-ordering. * * The pair barrier is implied in llist_add() in blk_cgroup_bio_start(). */ smp_mb(); WRITE_ONCE(bisc->lqueued, false); if (bisc == &blkg->iostat) goto propagate_up; /* propagate up to parent only */ /* fetch the current per-cpu values */ do { seq = u64_stats_fetch_begin(&bisc->sync); blkg_iostat_set(&cur, &bisc->cur); } while (u64_stats_fetch_retry(&bisc->sync, seq)); blkcg_iostat_update(blkg, &cur, &bisc->last); propagate_up: /* propagate global delta to parent (unless that's root) */ if (parent && parent->parent) { blkcg_iostat_update(parent, &blkg->iostat.cur, &blkg->iostat.last); /* * Queue parent->iostat to its blkcg's lockless * list to propagate up to the grandparent if the * iostat hasn't been queued yet. */ if (!parent->iostat.lqueued) { struct llist_head *plhead; plhead = per_cpu_ptr(parent->blkcg->lhead, cpu); llist_add(&parent->iostat.lnode, plhead); parent->iostat.lqueued = true; } } } raw_spin_unlock_irqrestore(&blkg_stat_lock, flags); out: rcu_read_unlock(); } static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) { /* Root-level stats are sourced from system-wide IO stats */ if (cgroup_parent(css->cgroup)) __blkcg_rstat_flush(css_to_blkcg(css), cpu); } /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no * cgroups are defined. For that reason, cgroup_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. * * However, we would like to re-use the printing code between the root and * non-root cgroups to the extent possible. For that reason, we simulate * flushing the root cgroup's stats by explicitly filling in the iostat * with disk level statistics. */ static void blkcg_fill_root_iostats(void) { struct class_dev_iter iter; struct device *dev; class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; struct blkg_iostat tmp; int cpu; unsigned long flags; memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += cpu_dkstats->ios[STAT_WRITE]; tmp.ios[BLKG_IOSTAT_DISCARD] += cpu_dkstats->ios[STAT_DISCARD]; // convert sectors to bytes tmp.bytes[BLKG_IOSTAT_READ] += cpu_dkstats->sectors[STAT_READ] << 9; tmp.bytes[BLKG_IOSTAT_WRITE] += cpu_dkstats->sectors[STAT_WRITE] << 9; tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; } flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&blkg->iostat.cur, &tmp); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } class_dev_iter_exit(&iter); } static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { struct blkg_iostat_set *bis = &blkg->iostat; u64 rbytes, wbytes, rios, wios, dbytes, dios; const char *dname; unsigned seq; int i; if (!blkg->online) return; dname = blkg_dev_name(blkg); if (!dname) return; seq_printf(s, "%s ", dname); do { seq = u64_stats_fetch_begin(&bis->sync); rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; rios = bis->cur.ios[BLKG_IOSTAT_READ]; wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { seq_printf(s, " use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (!blkg->pd[i] || !pol->pd_stat_fn) continue; pol->pd_stat_fn(blkg->pd[i], s); } seq_puts(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; if (!seq_css(sf)->parent) blkcg_fill_root_iostats(); else cgroup_rstat_flush(blkcg->css.cgroup); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); return 0; } static struct cftype blkcg_files[] = { { .name = "stat", .seq_show = blkcg_print_stat, }, { } /* terminate */ }; static struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, { } /* terminate */ }; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) { return &css_to_blkcg(css)->cgwb_list; } #endif /* * blkcg destruction is a three-stage process. * * 1. Destruction starts. The blkcg_css_offline() callback is invoked * which offlines writeback. Here we tie the next stage of blkg destruction * to the completion of writeback associated with the blkcg. This lets us * avoid punting potentially large amounts of outstanding writeback to root * while maintaining any ongoing policies. The next stage is triggered when * the nr_cgwbs count goes to zero. * * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called * and handles the destruction of blkgs. Here the css reference held by * the blkg is put back eventually allowing blkcg_css_free() to be called. * This work may occur in cgwb_release_workfn() on the cgwb_release * workqueue. Any submitted ios that fail to get the blkg ref will be * punted to the root_blkg. * * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. * This finally frees the blkcg. */ /** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest * * blkgs should be removed while holding both q and blkcg locks. As blkcg lock * is nested inside q lock, this function performs reverse double lock dancing. * Destroying the blkgs releases the reference held on the blkcg's css allowing * blkcg_css_free to eventually be called. * * This is the blkcg counterpart of ioc_release_fn(). */ static void blkcg_destroy_blkgs(struct blkcg *blkcg) { might_sleep(); spin_lock_irq(&blkcg->lock); while (!hlist_empty(&blkcg->blkg_list)) { struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq, blkcg_node); struct request_queue *q = blkg->q; if (need_resched() || !spin_trylock(&q->queue_lock)) { /* * Given that the system can accumulate a huge number * of blkgs in pathological cases, check to see if we * need to rescheduling to avoid softlockup. */ spin_unlock_irq(&blkcg->lock); cond_resched(); spin_lock_irq(&blkcg->lock); continue; } blkg_destroy(blkg); spin_unlock(&q->queue_lock); } spin_unlock_irq(&blkcg->lock); } /** * blkcg_pin_online - pin online state * @blkcg_css: blkcg of interest * * While pinned, a blkcg is kept online. This is primarily used to * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline * while an associated cgwb is still active. */ void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css) { refcount_inc(&css_to_blkcg(blkcg_css)->online_pin); } /** * blkcg_unpin_online - unpin online state * @blkcg_css: blkcg of interest * * This is primarily used to impedance-match blkg and cgwb lifetimes so * that blkg doesn't go offline while an associated cgwb is still active. * When this count goes to zero, all active cgwbs have finished so the * blkcg can continue destruction by calling blkcg_destroy_blkgs(). */ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) { struct blkcg *blkcg = css_to_blkcg(blkcg_css); do { struct blkcg *parent; if (!refcount_dec_and_test(&blkcg->online_pin)) break; parent = blkcg_parent(blkcg); blkcg_destroy_blkgs(blkcg); blkcg = parent; } while (blkcg); } /** * blkcg_css_offline - cgroup css_offline callback * @css: css of interest * * This function is called when @css is about to go away. Here the cgwbs are * offlined first and only once writeback associated with the blkcg has * finished do we start step 2 (see above). */ static void blkcg_css_offline(struct cgroup_subsys_state *css) { /* this prevents anyone from attaching or migrating to this blkcg */ wb_blkcg_offline(css); /* put the base online pin allowing step 2 to be triggered */ blkcg_unpin_online(css); } static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); int i; mutex_lock(&blkcg_pol_mutex); list_del(&blkcg->all_blkcgs_node); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); mutex_unlock(&blkcg_pol_mutex); free_percpu(blkcg->lhead); kfree(blkcg); } static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; int i; mutex_lock(&blkcg_pol_mutex); if (!parent_css) { blkcg = &blkcg_root; } else { blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) goto unlock; } if (init_blkcg_llists(blkcg)) goto free_blkcg; for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; /* * If the policy hasn't been attached yet, wait for it * to be attached before doing anything else. Otherwise, * check if the policy requires any specific per-cgroup * data: if it does, allocate and initialize it. */ if (!pol || !pol->cpd_alloc_fn) continue; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) goto free_pd_blkcg; blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; } spin_lock_init(&blkcg->lock); refcount_set(&blkcg->online_pin, 1); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); #endif list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); mutex_unlock(&blkcg_pol_mutex); return &blkcg->css; free_pd_blkcg: for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); free_percpu(blkcg->lhead); free_blkcg: if (blkcg != &blkcg_root) kfree(blkcg); unlock: mutex_unlock(&blkcg_pol_mutex); return ERR_PTR(-ENOMEM); } static int blkcg_css_online(struct cgroup_subsys_state *css) { struct blkcg *parent = blkcg_parent(css_to_blkcg(css)); /* * blkcg_pin_online() is used to delay blkcg offline so that blkgs * don't go offline while cgwbs are still active on them. Pin the * parent so that offline always happens towards the root. */ if (parent) blkcg_pin_online(&parent->css); return 0; } void blkg_init_queue(struct request_queue *q) { INIT_LIST_HEAD(&q->blkg_list); mutex_init(&q->blkcg_mutex); } int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; bool preloaded; new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; preloaded = !radix_tree_preload(GFP_KERNEL); /* Make sure the root blkg exists. */ /* spin_lock_irq can serve as RCU read-side critical section. */ spin_lock_irq(&q->queue_lock); blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; spin_unlock_irq(&q->queue_lock); if (preloaded) radix_tree_preload_end(); return 0; err_unlock: spin_unlock_irq(&q->queue_lock); if (preloaded) radix_tree_preload_end(); return PTR_ERR(blkg); } void blkcg_exit_disk(struct gendisk *disk) { blkg_destroy_all(disk); blk_throtl_exit(disk); } static void blkcg_exit(struct task_struct *tsk) { if (tsk->throttle_disk) put_disk(tsk->throttle_disk); tsk->throttle_disk = NULL; } struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_online = blkcg_css_online, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .css_rstat_flush = blkcg_rstat_flush, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", .exit = blkcg_exit, #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled * together on the default hierarchy so that the owner cgroup can * be retrieved from writeback pages. */ .depends_on = 1 << memory_cgrp_id, #endif }; EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a gendisk * @disk: gendisk of interest * @pol: blkcg policy to activate * * Activate @pol on @disk. Requires %GFP_KERNEL context. @disk goes through * bypass mode to populate its blkgs with policy_data for @pol. * * Activation happens with @disk bypassed, so nobody would be accessing blkgs * from IO path. Update of each blkg is protected by both queue and blkcg * locks so that holding either lock and testing blkcg_policy_enabled() is * always enough for dereferencing policy data. * * The caller is responsible for synchronizing [de]activations and policy * [un]registerations. Returns 0 on success, -errno on failure. */ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { struct request_queue *q = disk->queue; struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg, *pinned_blkg = NULL; unsigned int memflags; int ret; if (blkcg_policy_enabled(q, pol)) return 0; /* * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn, * for example, ioprio. Such policy will work on blkcg level, not disk * level, and don't need to be activated. */ if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn)) return -EINVAL; if (queue_is_mq(q)) memflags = blk_mq_freeze_queue(q); retry: spin_lock_irq(&q->queue_lock); /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; if (blkg->pd[pol->plid]) continue; /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ if (blkg == pinned_blkg) { pd = pd_prealloc; pd_prealloc = NULL; } else { pd = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_NOWAIT | __GFP_NOWARN); } if (!pd) { /* * GFP_NOWAIT failed. Free the existing one and * prealloc for @blkg w/ GFP_KERNEL. */ if (pinned_blkg) blkg_put(pinned_blkg); blkg_get(blkg); pinned_blkg = blkg; spin_unlock_irq(&q->queue_lock); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_KERNEL); if (pd_prealloc) goto retry; else goto enomem; } spin_lock(&blkg->blkcg->lock); pd->blkg = blkg; pd->plid = pol->plid; blkg->pd[pol->plid] = pd; if (pol->pd_init_fn) pol->pd_init_fn(pd); if (pol->pd_online_fn) pol->pd_online_fn(pd); pd->online = true; spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; spin_unlock_irq(&q->queue_lock); out: if (queue_is_mq(q)) blk_mq_unfreeze_queue(q, memflags); if (pinned_blkg) blkg_put(pinned_blkg); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; enomem: /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; struct blkg_policy_data *pd; spin_lock(&blkcg->lock); pd = blkg->pd[pol->plid]; if (pd) { if (pd->online && pol->pd_offline_fn) pol->pd_offline_fn(pd); pd->online = false; pol->pd_free_fn(pd); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); ret = -ENOMEM; goto out; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); /** * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk * @disk: gendisk of interest * @pol: blkcg policy to deactivate * * Deactivate @pol on @disk. Follows the same synchronization rules as * blkcg_activate_policy(). */ void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned int memflags; if (!blkcg_policy_enabled(q, pol)) return; if (queue_is_mq(q)) memflags = blk_mq_freeze_queue(q); mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); if (blkg->pd[pol->plid]) { if (blkg->pd[pol->plid]->online && pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) blk_mq_unfreeze_queue(q, memflags); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); static void blkcg_free_all_cpd(struct blkcg_policy *pol) { struct blkcg *blkcg; list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); blkcg->cpd[pol->plid] = NULL; } } } /** * blkcg_policy_register - register a blkcg policy * @pol: blkcg policy to register * * Register @pol with blkcg core. Might sleep and @pol may be modified on * successful registration. Returns 0 on success and -errno on failure. */ int blkcg_policy_register(struct blkcg_policy *pol) { struct blkcg *blkcg; int i, ret; mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ ret = -ENOSPC; for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); goto err_unlock; } /* * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy * without pd_alloc_fn/pd_free_fn can't be activated. */ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) goto err_unlock; /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) goto err_free_cpds; blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; } } mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ if (pol->dfl_cftypes) WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; err_free_cpds: if (pol->cpd_free_fn) blkcg_free_all_cpd(pol); blkcg_policy[pol->plid] = NULL; err_unlock: mutex_unlock(&blkcg_pol_mutex); mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); /** * blkcg_policy_unregister - unregister a blkcg policy * @pol: blkcg policy to unregister * * Undo blkcg_policy_register(@pol). Might sleep. */ void blkcg_policy_unregister(struct blkcg_policy *pol) { mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; /* kill the intf files first */ if (pol->dfl_cftypes) cgroup_rm_cftypes(pol->dfl_cftypes); if (pol->legacy_cftypes) cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); if (pol->cpd_free_fn) blkcg_free_all_cpd(pol); blkcg_policy[pol->plid] = NULL; mutex_unlock(&blkcg_pol_mutex); out_unlock: mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); /* * Scale the accumulated delay based on how long it has been since we updated * the delay. We only call this when we are adding delay, in case it's been a * while since we added delay, and when we are checking to see if we need to * delay a task, to account for any delays that may have occurred. */ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) { u64 old = atomic64_read(&blkg->delay_start); /* negative use_delay means no scaling, see blkcg_set_delay() */ if (atomic_read(&blkg->use_delay) < 0) return; /* * We only want to scale down every second. The idea here is that we * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain * time window. We only want to throttle tasks for recent delay that * has occurred, in 1 second time windows since that's the maximum * things can be throttled. We save the current delay window in * blkg->last_delay so we know what amount is still left to be charged * to the blkg from this point onward. blkg->last_use keeps track of * the use_delay counter. The idea is if we're unthrottling the blkg we * are ok with whatever is happening now, and we can take away more of * the accumulated delay as we've already throttled enough that * everybody is happy with their IO latencies. */ if (time_before64(old + NSEC_PER_SEC, now) && atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) { u64 cur = atomic64_read(&blkg->delay_nsec); u64 sub = min_t(u64, blkg->last_delay, now - old); int cur_use = atomic_read(&blkg->use_delay); /* * We've been unthrottled, subtract a larger chunk of our * accumulated delay. */ if (cur_use < blkg->last_use) sub = max_t(u64, sub, blkg->last_delay >> 1); /* * This shouldn't happen, but handle it anyway. Our delay_nsec * should only ever be growing except here where we subtract out * min(last_delay, 1 second), but lord knows bugs happen and I'd * rather not end up with negative numbers. */ if (unlikely(cur < sub)) { atomic64_set(&blkg->delay_nsec, 0); blkg->last_delay = 0; } else { atomic64_sub(sub, &blkg->delay_nsec); blkg->last_delay = cur - sub; } blkg->last_use = cur_use; } } /* * This is called when we want to actually walk up the hierarchy and check to * see if we need to throttle, and then actually throttle if there is some * accumulated delay. This should only be called upon return to user space so * we're not holding some lock that would induce a priority inversion. */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; bool clamp; u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; while (blkg->parent) { int use_delay = atomic_read(&blkg->use_delay); if (use_delay) { u64 this_delay; blkcg_scale_delay(blkg, now); this_delay = atomic64_read(&blkg->delay_nsec); if (this_delay > delay_nsec) { delay_nsec = this_delay; clamp = use_delay > 0; } } blkg = blkg->parent; } if (!delay_nsec) return; /* * Let's not sleep for all eternity if we've amassed a huge delay. * Swapping or metadata IO can accumulate 10's of seconds worth of * delay, and we want userspace to be able to do _something_ so cap the * delays at 0.25s. If there's 10's of seconds worth of delay then the * tasks will be delayed for 0.25 second for every syscall. If * blkcg_set_delay() was used as indicated by negative use_delay, the * caller is responsible for regulating the range. */ if (clamp) delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (use_memdelay) psi_memstall_enter(&pflags); exp = ktime_add_ns(now, delay_nsec); tok = io_schedule_prepare(); do { __set_current_state(TASK_KILLABLE); if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS)) break; } while (!fatal_signal_pending(current)); io_schedule_finish(tok); if (use_memdelay) psi_memstall_leave(&pflags); } /** * blkcg_maybe_throttle_current - throttle the current task if it has been marked * * This is only called if we've been marked with set_notify_resume(). Obviously * we can be set_notify_resume() for reasons other than blkcg throttling, so we * check to see if current->throttle_disk is set and if not this doesn't do * anything. This should only ever be called by the resume code, it's not meant * to be called by people willy-nilly as it will actually do the work to * throttle the task if it is setup for throttling. */ void blkcg_maybe_throttle_current(void) { struct gendisk *disk = current->throttle_disk; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; if (!disk) return; current->throttle_disk = NULL; current->use_memdelay = false; rcu_read_lock(); blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; blkg = blkg_lookup(blkcg, disk->queue); if (!blkg) goto out; if (!blkg_tryget(blkg)) goto out; rcu_read_unlock(); blkcg_maybe_throttle_blkg(blkg, use_memdelay); blkg_put(blkg); put_disk(disk); return; out: rcu_read_unlock(); } /** * blkcg_schedule_throttle - this task needs to check for throttling * @disk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for * instance will only have a block_device at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * * We will only schedule once per syscall. You can call this over and over * again and it will only do the check once upon return to user space, and only * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. */ void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { if (unlikely(current->flags & PF_KTHREAD)) return; if (current->throttle_disk != disk) { if (test_bit(GD_DEAD, &disk->state)) return; get_device(disk_to_dev(disk)); if (current->throttle_disk) put_disk(current->throttle_disk); current->throttle_disk = disk; } if (use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); } /** * blkcg_add_delay - add delay to this blkg * @blkg: blkg of interest * @now: the current time in nanoseconds * @delta: how many nanoseconds of delay to add * * Charge @delta to the blkg's current delay accumulation. This is used to * throttle tasks if an IO controller thinks we need more throttling. */ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; blkcg_scale_delay(blkg, now); atomic64_add(delta, &blkg->delay_nsec); } /** * blkg_tryget_closest - try and get a blkg ref on the closet blkg * @bio: target bio * @css: target css * * As the failure mode here is to walk up the blkg tree, this ensure that the * blkg->parent pointers are always valid. This returns the blkg that it ended * up taking a reference on or %NULL if no reference was taken. */ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct cgroup_subsys_state *css) { struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; break; } blkg = blkg->parent; } rcu_read_unlock(); return ret_blkg; } /** * bio_associate_blkg_from_css - associate a bio with a specified css * @bio: target bio * @css: target css * * Associate @bio with the blkg found by combining the css's blkg and the * request_queue of the @bio. An association failure is handled by walking up * the blkg tree. Therefore, the blkg associated can be anything between @blkg * and q->root_blkg. This situation only happens when a cgroup is dying and * then the remaining bios will spill to the closest alive blkg. * * A reference will be taken on the blkg and will be released when @bio is * freed. */ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { if (bio->bi_blkg) blkg_put(bio->bi_blkg); if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg); bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); /** * bio_associate_blkg - associate a bio with a blkg * @bio: target bio * * Associate @bio with the blkg found from the bio's css and request_queue. * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is * already associated, the css is reused and association redone as the * request_queue may have changed. */ void bio_associate_blkg(struct bio *bio) { struct cgroup_subsys_state *css; if (blk_op_is_passthrough(bio->bi_opf)) return; rcu_read_lock(); if (bio->bi_blkg) css = bio_blkcg_css(bio); else css = blkcg_css(); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(bio_associate_blkg); /** * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ void bio_clone_blkg_association(struct bio *dst, struct bio *src) { if (src->bi_blkg) bio_associate_blkg_from_css(dst, bio_blkcg_css(src)); } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); static int blk_cgroup_io_type(struct bio *bio) { if (op_is_discard(bio->bi_opf)) return BLKG_IOSTAT_DISCARD; if (op_is_write(bio->bi_opf)) return BLKG_IOSTAT_WRITE; return BLKG_IOSTAT_READ; } void blk_cgroup_bio_start(struct bio *bio) { struct blkcg *blkcg = bio->bi_blkg->blkcg; int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; unsigned long flags; if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) return; /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(blkcg->css.cgroup)) return; cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); flags = u64_stats_update_begin_irqsave(&bis->sync); /* * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split * bio and we would have already accounted for the size of the bio. */ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); bis->cur.bytes[rwd] += bio->bi_iter.bi_size; } bis->cur.ios[rwd]++; /* * If the iostat_cpu isn't in a lockless list, put it into the * list to indicate that a stat update is pending. */ if (!READ_ONCE(bis->lqueued)) { struct llist_head *lhead = this_cpu_ptr(blkcg->lhead); llist_add(&bis->lnode, lhead); WRITE_ONCE(bis->lqueued, true); } u64_stats_update_end_irqrestore(&bis->sync, flags); cgroup_rstat_updated(blkcg->css.cgroup, cpu); put_cpu(); } bool blk_cgroup_congested(void) { struct blkcg *blkcg; bool ret = false; rcu_read_lock(); for (blkcg = css_to_blkcg(blkcg_css()); blkcg; blkcg = blkcg_parent(blkcg)) { if (atomic_read(&blkcg->congestion_count)) { ret = true; break; } } rcu_read_unlock(); return ret; } module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); |
33 51 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "timers.h" #include "device.h" #include "peer.h" #include "queueing.h" #include "socket.h" /* * - Timer for retransmitting the handshake if we don't hear back after * `REKEY_TIMEOUT + jitter` ms. * * - Timer for sending empty packet if we have received a packet but after have * not sent one for `KEEPALIVE_TIMEOUT` ms. * * - Timer for initiating new handshake if we have sent a packet but after have * not received one (even empty) for `(KEEPALIVE_TIMEOUT + REKEY_TIMEOUT) + * jitter` ms. * * - Timer for zeroing out all ephemeral keys after `(REJECT_AFTER_TIME * 3)` ms * if no new keys have been received. * * - Timer for, if enabled, sending an empty authenticated packet every user- * specified seconds. */ static inline void mod_peer_timer(struct wg_peer *peer, struct timer_list *timer, unsigned long expires) { rcu_read_lock_bh(); if (likely(netif_running(peer->device->dev) && !READ_ONCE(peer->is_dead))) mod_timer(timer, expires); rcu_read_unlock_bh(); } static void wg_expired_retransmit_handshake(struct timer_list *timer) { struct wg_peer *peer = from_timer(peer, timer, timer_retransmit_handshake); if (peer->timer_handshake_attempts > MAX_TIMER_HANDSHAKES) { pr_debug("%s: Handshake for peer %llu (%pISpfsc) did not complete after %d attempts, giving up\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)MAX_TIMER_HANDSHAKES + 2); del_timer(&peer->timer_send_keepalive); /* We drop all packets without a keypair and don't try again, * if we try unsuccessfully for too long to make a handshake. */ wg_packet_purge_staged_packets(peer); /* We set a timer for destroying any residue that might be left * of a partial exchange. */ if (!timer_pending(&peer->timer_zero_key_material)) mod_peer_timer(peer, &peer->timer_zero_key_material, jiffies + REJECT_AFTER_TIME * 3 * HZ); } else { ++peer->timer_handshake_attempts; pr_debug("%s: Handshake for peer %llu (%pISpfsc) did not complete after %d seconds, retrying (try %d)\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)REKEY_TIMEOUT, peer->timer_handshake_attempts + 1); /* We clear the endpoint address src address, in case this is * the cause of trouble. */ wg_socket_clear_peer_endpoint_src(peer); wg_packet_send_queued_handshake_initiation(peer, true); } } static void wg_expired_send_keepalive(struct timer_list *timer) { struct wg_peer *peer = from_timer(peer, timer, timer_send_keepalive); wg_packet_send_keepalive(peer); if (peer->timer_need_another_keepalive) { peer->timer_need_another_keepalive = false; mod_peer_timer(peer, &peer->timer_send_keepalive, jiffies + KEEPALIVE_TIMEOUT * HZ); } } static void wg_expired_new_handshake(struct timer_list *timer) { struct wg_peer *peer = from_timer(peer, timer, timer_new_handshake); pr_debug("%s: Retrying handshake with peer %llu (%pISpfsc) because we stopped hearing back after %d seconds\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)(KEEPALIVE_TIMEOUT + REKEY_TIMEOUT)); /* We clear the endpoint address src address, in case this is the cause * of trouble. */ wg_socket_clear_peer_endpoint_src(peer); wg_packet_send_queued_handshake_initiation(peer, false); } static void wg_expired_zero_key_material(struct timer_list *timer) { struct wg_peer *peer = from_timer(peer, timer, timer_zero_key_material); rcu_read_lock_bh(); if (!READ_ONCE(peer->is_dead)) { wg_peer_get(peer); if (!queue_work(peer->device->handshake_send_wq, &peer->clear_peer_work)) /* If the work was already on the queue, we want to drop * the extra reference. */ wg_peer_put(peer); } rcu_read_unlock_bh(); } static void wg_queued_expired_zero_key_material(struct work_struct *work) { struct wg_peer *peer = container_of(work, struct wg_peer, clear_peer_work); pr_debug("%s: Zeroing out all keys for peer %llu (%pISpfsc), since we haven't received a new one in %d seconds\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr, (int)REJECT_AFTER_TIME * 3); wg_noise_handshake_clear(&peer->handshake); wg_noise_keypairs_clear(&peer->keypairs); wg_peer_put(peer); } static void wg_expired_send_persistent_keepalive(struct timer_list *timer) { struct wg_peer *peer = from_timer(peer, timer, timer_persistent_keepalive); if (likely(peer->persistent_keepalive_interval)) wg_packet_send_keepalive(peer); } /* Should be called after an authenticated data packet is sent. */ void wg_timers_data_sent(struct wg_peer *peer) { if (!timer_pending(&peer->timer_new_handshake)) mod_peer_timer(peer, &peer->timer_new_handshake, jiffies + (KEEPALIVE_TIMEOUT + REKEY_TIMEOUT) * HZ + get_random_u32_below(REKEY_TIMEOUT_JITTER_MAX_JIFFIES)); } /* Should be called after an authenticated data packet is received. */ void wg_timers_data_received(struct wg_peer *peer) { if (likely(netif_running(peer->device->dev))) { if (!timer_pending(&peer->timer_send_keepalive)) mod_peer_timer(peer, &peer->timer_send_keepalive, jiffies + KEEPALIVE_TIMEOUT * HZ); else peer->timer_need_another_keepalive = true; } } /* Should be called after any type of authenticated packet is sent, whether * keepalive, data, or handshake. */ void wg_timers_any_authenticated_packet_sent(struct wg_peer *peer) { del_timer(&peer->timer_send_keepalive); } /* Should be called after any type of authenticated packet is received, whether * keepalive, data, or handshake. */ void wg_timers_any_authenticated_packet_received(struct wg_peer *peer) { del_timer(&peer->timer_new_handshake); } /* Should be called after a handshake initiation message is sent. */ void wg_timers_handshake_initiated(struct wg_peer *peer) { mod_peer_timer(peer, &peer->timer_retransmit_handshake, jiffies + REKEY_TIMEOUT * HZ + get_random_u32_below(REKEY_TIMEOUT_JITTER_MAX_JIFFIES)); } /* Should be called after a handshake response message is received and processed * or when getting key confirmation via the first data message. */ void wg_timers_handshake_complete(struct wg_peer *peer) { del_timer(&peer->timer_retransmit_handshake); peer->timer_handshake_attempts = 0; peer->sent_lastminute_handshake = false; ktime_get_real_ts64(&peer->walltime_last_handshake); } /* Should be called after an ephemeral key is created, which is before sending a * handshake response or after receiving a handshake response. */ void wg_timers_session_derived(struct wg_peer *peer) { mod_peer_timer(peer, &peer->timer_zero_key_material, jiffies + REJECT_AFTER_TIME * 3 * HZ); } /* Should be called before a packet with authentication, whether * keepalive, data, or handshakem is sent, or after one is received. */ void wg_timers_any_authenticated_packet_traversal(struct wg_peer *peer) { if (peer->persistent_keepalive_interval) mod_peer_timer(peer, &peer->timer_persistent_keepalive, jiffies + peer->persistent_keepalive_interval * HZ); } void wg_timers_init(struct wg_peer *peer) { timer_setup(&peer->timer_retransmit_handshake, wg_expired_retransmit_handshake, 0); timer_setup(&peer->timer_send_keepalive, wg_expired_send_keepalive, 0); timer_setup(&peer->timer_new_handshake, wg_expired_new_handshake, 0); timer_setup(&peer->timer_zero_key_material, wg_expired_zero_key_material, 0); timer_setup(&peer->timer_persistent_keepalive, wg_expired_send_persistent_keepalive, 0); INIT_WORK(&peer->clear_peer_work, wg_queued_expired_zero_key_material); peer->timer_handshake_attempts = 0; peer->sent_lastminute_handshake = false; peer->timer_need_another_keepalive = false; } void wg_timers_stop(struct wg_peer *peer) { timer_delete_sync(&peer->timer_retransmit_handshake); timer_delete_sync(&peer->timer_send_keepalive); timer_delete_sync(&peer->timer_new_handshake); timer_delete_sync(&peer->timer_zero_key_material); timer_delete_sync(&peer->timer_persistent_keepalive); flush_work(&peer->clear_peer_work); } |
1 1 2 1 1 1 1 1 3 3 20 22 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 | // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2018 Oracle and/or its affiliates. All rights reserved. */ #include <crypto/aead.h> #include <linux/debugfs.h> #include <net/xfrm.h> #include "netdevsim.h" #define NSIM_IPSEC_AUTH_BITS 128 static ssize_t nsim_dbg_netdev_ops_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct netdevsim *ns = filp->private_data; struct nsim_ipsec *ipsec = &ns->ipsec; size_t bufsize; char *buf, *p; int len; int i; /* the buffer needed is * (num SAs * 3 lines each * ~60 bytes per line) + one more line */ bufsize = (ipsec->count * 4 * 60) + 60; buf = kzalloc(bufsize, GFP_KERNEL); if (!buf) return -ENOMEM; p = buf; p += scnprintf(p, bufsize - (p - buf), "SA count=%u tx=%u\n", ipsec->count, ipsec->tx); for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) { struct nsim_sa *sap = &ipsec->sa[i]; if (!sap->used) continue; if (sap->xs->props.family == AF_INET6) p += scnprintf(p, bufsize - (p - buf), "sa[%i] %cx ipaddr=%pI6c\n", i, (sap->rx ? 'r' : 't'), &sap->ipaddr); else p += scnprintf(p, bufsize - (p - buf), "sa[%i] %cx ipaddr=%pI4\n", i, (sap->rx ? 'r' : 't'), &sap->ipaddr[3]); p += scnprintf(p, bufsize - (p - buf), "sa[%i] spi=0x%08x proto=0x%x salt=0x%08x crypt=%d\n", i, be32_to_cpu(sap->xs->id.spi), sap->xs->id.proto, sap->salt, sap->crypt); p += scnprintf(p, bufsize - (p - buf), "sa[%i] key=0x%08x %08x %08x %08x\n", i, sap->key[0], sap->key[1], sap->key[2], sap->key[3]); } len = simple_read_from_buffer(buffer, count, ppos, buf, p - buf); kfree(buf); return len; } static const struct file_operations ipsec_dbg_fops = { .owner = THIS_MODULE, .open = simple_open, .read = nsim_dbg_netdev_ops_read, }; static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec) { u32 i; if (ipsec->count == NSIM_IPSEC_MAX_SA_COUNT) return -ENOSPC; /* search sa table */ for (i = 0; i < NSIM_IPSEC_MAX_SA_COUNT; i++) { if (!ipsec->sa[i].used) return i; } return -ENOSPC; } static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs, u32 *mykey, u32 *mysalt) { const char aes_gcm_name[] = "rfc4106(gcm(aes))"; struct net_device *dev = xs->xso.real_dev; unsigned char *key_data; char *alg_name = NULL; int key_len; if (!xs->aead) { netdev_err(dev, "Unsupported IPsec algorithm\n"); return -EINVAL; } if (xs->aead->alg_icv_len != NSIM_IPSEC_AUTH_BITS) { netdev_err(dev, "IPsec offload requires %d bit authentication\n", NSIM_IPSEC_AUTH_BITS); return -EINVAL; } key_data = &xs->aead->alg_key[0]; key_len = xs->aead->alg_key_len; alg_name = xs->aead->alg_name; if (strcmp(alg_name, aes_gcm_name)) { netdev_err(dev, "Unsupported IPsec algorithm - please use %s\n", aes_gcm_name); return -EINVAL; } /* 160 accounts for 16 byte key and 4 byte salt */ if (key_len > NSIM_IPSEC_AUTH_BITS) { *mysalt = ((u32 *)key_data)[4]; } else if (key_len == NSIM_IPSEC_AUTH_BITS) { *mysalt = 0; } else { netdev_err(dev, "IPsec hw offload only supports 128 bit keys with optional 32 bit salt\n"); return -EINVAL; } memcpy(mykey, key_data, 16); return 0; } static int nsim_ipsec_add_sa(struct xfrm_state *xs, struct netlink_ext_ack *extack) { struct nsim_ipsec *ipsec; struct net_device *dev; struct netdevsim *ns; struct nsim_sa sa; u16 sa_idx; int ret; dev = xs->xso.real_dev; ns = netdev_priv(dev); ipsec = &ns->ipsec; if (xs->id.proto != IPPROTO_ESP && xs->id.proto != IPPROTO_AH) { NL_SET_ERR_MSG_MOD(extack, "Unsupported protocol for ipsec offload"); return -EINVAL; } if (xs->calg) { NL_SET_ERR_MSG_MOD(extack, "Compression offload not supported"); return -EINVAL; } if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { NL_SET_ERR_MSG_MOD(extack, "Unsupported ipsec offload type"); return -EINVAL; } /* find the first unused index */ ret = nsim_ipsec_find_empty_idx(ipsec); if (ret < 0) { NL_SET_ERR_MSG_MOD(extack, "No space for SA in Rx table!"); return ret; } sa_idx = (u16)ret; memset(&sa, 0, sizeof(sa)); sa.used = true; sa.xs = xs; if (sa.xs->id.proto & IPPROTO_ESP) sa.crypt = xs->ealg || xs->aead; /* get the key and salt */ ret = nsim_ipsec_parse_proto_keys(xs, sa.key, &sa.salt); if (ret) { NL_SET_ERR_MSG_MOD(extack, "Failed to get key data for SA table"); return ret; } if (xs->xso.dir == XFRM_DEV_OFFLOAD_IN) sa.rx = true; if (xs->props.family == AF_INET6) memcpy(sa.ipaddr, &xs->id.daddr.a6, 16); else memcpy(&sa.ipaddr[3], &xs->id.daddr.a4, 4); /* the preparations worked, so save the info */ memcpy(&ipsec->sa[sa_idx], &sa, sizeof(sa)); /* the XFRM stack doesn't like offload_handle == 0, * so add a bitflag in case our array index is 0 */ xs->xso.offload_handle = sa_idx | NSIM_IPSEC_VALID; ipsec->count++; return 0; } static void nsim_ipsec_del_sa(struct xfrm_state *xs) { struct netdevsim *ns = netdev_priv(xs->xso.real_dev); struct nsim_ipsec *ipsec = &ns->ipsec; u16 sa_idx; sa_idx = xs->xso.offload_handle & ~NSIM_IPSEC_VALID; if (!ipsec->sa[sa_idx].used) { netdev_err(ns->netdev, "Invalid SA for delete sa_idx=%d\n", sa_idx); return; } memset(&ipsec->sa[sa_idx], 0, sizeof(struct nsim_sa)); ipsec->count--; } static bool nsim_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs) { struct netdevsim *ns = netdev_priv(xs->xso.real_dev); struct nsim_ipsec *ipsec = &ns->ipsec; ipsec->ok++; return true; } static const struct xfrmdev_ops nsim_xfrmdev_ops = { .xdo_dev_state_add = nsim_ipsec_add_sa, .xdo_dev_state_delete = nsim_ipsec_del_sa, .xdo_dev_offload_ok = nsim_ipsec_offload_ok, }; bool nsim_ipsec_tx(struct netdevsim *ns, struct sk_buff *skb) { struct sec_path *sp = skb_sec_path(skb); struct nsim_ipsec *ipsec = &ns->ipsec; struct xfrm_state *xs; struct nsim_sa *tsa; u32 sa_idx; /* do we even need to check this packet? */ if (!sp) return true; if (unlikely(!sp->len)) { netdev_err(ns->netdev, "no xfrm state len = %d\n", sp->len); return false; } xs = xfrm_input_state(skb); if (unlikely(!xs)) { netdev_err(ns->netdev, "no xfrm_input_state() xs = %p\n", xs); return false; } sa_idx = xs->xso.offload_handle & ~NSIM_IPSEC_VALID; if (unlikely(sa_idx >= NSIM_IPSEC_MAX_SA_COUNT)) { netdev_err(ns->netdev, "bad sa_idx=%d max=%d\n", sa_idx, NSIM_IPSEC_MAX_SA_COUNT); return false; } tsa = &ipsec->sa[sa_idx]; if (unlikely(!tsa->used)) { netdev_err(ns->netdev, "unused sa_idx=%d\n", sa_idx); return false; } if (xs->id.proto != IPPROTO_ESP && xs->id.proto != IPPROTO_AH) { netdev_err(ns->netdev, "unexpected proto=%d\n", xs->id.proto); return false; } ipsec->tx++; return true; } void nsim_ipsec_init(struct netdevsim *ns) { ns->netdev->xfrmdev_ops = &nsim_xfrmdev_ops; #define NSIM_ESP_FEATURES (NETIF_F_HW_ESP | \ NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) ns->netdev->features |= NSIM_ESP_FEATURES; ns->netdev->hw_enc_features |= NSIM_ESP_FEATURES; ns->ipsec.pfile = debugfs_create_file("ipsec", 0400, ns->nsim_dev_port->ddir, ns, &ipsec_dbg_fops); } void nsim_ipsec_teardown(struct netdevsim *ns) { struct nsim_ipsec *ipsec = &ns->ipsec; if (ipsec->count) netdev_err(ns->netdev, "tearing down IPsec offload with %d SAs left\n", ipsec->count); debugfs_remove_recursive(ipsec->pfile); } |
1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API * * ARC4 Cipher Algorithm * * Jon Oberheide <jon@oberheide.org> */ #include <crypto/arc4.h> #include <linux/module.h> int arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len) { int i, j = 0, k = 0; ctx->x = 1; ctx->y = 0; for (i = 0; i < 256; i++) ctx->S[i] = i; for (i = 0; i < 256; i++) { u32 a = ctx->S[i]; j = (j + in_key[k] + a) & 0xff; ctx->S[i] = ctx->S[j]; ctx->S[j] = a; if (++k >= key_len) k = 0; } return 0; } EXPORT_SYMBOL(arc4_setkey); void arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len) { u32 *const S = ctx->S; u32 x, y, a, b; u32 ty, ta, tb; if (len == 0) return; x = ctx->x; y = ctx->y; a = S[x]; y = (y + a) & 0xff; b = S[y]; do { S[y] = a; a = (a + b) & 0xff; S[x] = b; x = (x + 1) & 0xff; ta = S[x]; ty = (y + ta) & 0xff; tb = S[ty]; *out++ = *in++ ^ S[a]; if (--len == 0) break; y = ty; a = ta; b = tb; } while (true); ctx->x = x; ctx->y = y; } EXPORT_SYMBOL(arc4_crypt); MODULE_DESCRIPTION("ARC4 Cipher Algorithm"); MODULE_LICENSE("GPL"); |
6 8 8 7 1 6 4 4 2 2 3 1 1 1 3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C)2003-2006 Helsinki University of Technology * Copyright (C)2003-2006 USAGI/WIDE Project */ /* * Authors: * Noriaki TAKAMIYA @USAGI * Masahide NAKAMURA @USAGI */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/time.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/ip6_checksum.h> #include <net/rawv6.h> #include <net/xfrm.h> #include <net/mip6.h> static inline unsigned int calc_padlen(unsigned int len, unsigned int n) { return (n - len + 16) & 0x7; } static inline void *mip6_padn(__u8 *data, __u8 padlen) { if (!data) return NULL; if (padlen == 1) { data[0] = IPV6_TLV_PAD1; } else if (padlen > 1) { data[0] = IPV6_TLV_PADN; data[1] = padlen - 2; if (padlen > 2) memset(data+2, 0, data[1]); } return data + padlen; } static inline void mip6_param_prob(struct sk_buff *skb, u8 code, int pos) { icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos); } static int mip6_mh_len(int type) { int len = 0; switch (type) { case IP6_MH_TYPE_BRR: len = 0; break; case IP6_MH_TYPE_HOTI: case IP6_MH_TYPE_COTI: case IP6_MH_TYPE_BU: case IP6_MH_TYPE_BACK: len = 1; break; case IP6_MH_TYPE_HOT: case IP6_MH_TYPE_COT: case IP6_MH_TYPE_BERROR: len = 2; break; } return len; } static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) { struct ip6_mh _hdr; const struct ip6_mh *mh; mh = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_hdr), &_hdr); if (!mh) return -1; if (((mh->ip6mh_hdrlen + 1) << 3) > skb->len) return -1; if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) { net_dbg_ratelimited("mip6: MH message too short: %d vs >=%d\n", mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type)); mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_hdrlen) + skb_network_header_len(skb)); return -1; } if (mh->ip6mh_proto != IPPROTO_NONE) { net_dbg_ratelimited("mip6: MH invalid payload proto = %d\n", mh->ip6mh_proto); mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_proto) + skb_network_header_len(skb)); return -1; } return 0; } struct mip6_report_rate_limiter { spinlock_t lock; ktime_t stamp; int iif; struct in6_addr src; struct in6_addr dst; }; static struct mip6_report_rate_limiter mip6_report_rl = { .lock = __SPIN_LOCK_UNLOCKED(mip6_report_rl.lock) }; static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data; int err = destopt->nexthdr; spin_lock(&x->lock); if (!ipv6_addr_equal(&iph->saddr, (struct in6_addr *)x->coaddr) && !ipv6_addr_any((struct in6_addr *)x->coaddr)) err = -ENOENT; spin_unlock(&x->lock); return err; } /* Destination Option Header is inserted. * IP Header's src address is replaced with Home Address Option in * Destination Option Header. */ static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *iph; struct ipv6_destopt_hdr *dstopt; struct ipv6_destopt_hao *hao; u8 nexthdr; int len; skb_push(skb, -skb_network_offset(skb)); iph = ipv6_hdr(skb); nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_DSTOPTS; dstopt = (struct ipv6_destopt_hdr *)skb_transport_header(skb); dstopt->nexthdr = nexthdr; hao = mip6_padn((char *)(dstopt + 1), calc_padlen(sizeof(*dstopt), 6)); hao->type = IPV6_TLV_HAO; BUILD_BUG_ON(sizeof(*hao) != 18); hao->length = sizeof(*hao) - 2; len = ((char *)hao - (char *)dstopt) + sizeof(*hao); memcpy(&hao->addr, &iph->saddr, sizeof(hao->addr)); spin_lock_bh(&x->lock); memcpy(&iph->saddr, x->coaddr, sizeof(iph->saddr)); spin_unlock_bh(&x->lock); WARN_ON(len != x->props.header_len); dstopt->hdrlen = (x->props.header_len >> 3) - 1; return 0; } static inline int mip6_report_rl_allow(ktime_t stamp, const struct in6_addr *dst, const struct in6_addr *src, int iif) { int allow = 0; spin_lock_bh(&mip6_report_rl.lock); if (mip6_report_rl.stamp != stamp || mip6_report_rl.iif != iif || !ipv6_addr_equal(&mip6_report_rl.src, src) || !ipv6_addr_equal(&mip6_report_rl.dst, dst)) { mip6_report_rl.stamp = stamp; mip6_report_rl.iif = iif; mip6_report_rl.src = *src; mip6_report_rl.dst = *dst; allow = 1; } spin_unlock_bh(&mip6_report_rl.lock); return allow; } static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, const struct flowi *fl) { struct net *net = xs_net(x); struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; const struct flowi6 *fl6 = &fl->u.ip6; struct ipv6_destopt_hao *hao = NULL; struct xfrm_selector sel; int offset; ktime_t stamp; int err = 0; if (unlikely(fl6->flowi6_proto == IPPROTO_MH && fl6->fl6_mh_type <= IP6_MH_TYPE_MAX)) goto out; if (likely(opt->dsthao)) { offset = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO); if (likely(offset >= 0)) hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + offset); } stamp = skb_get_ktime(skb); if (!mip6_report_rl_allow(stamp, &ipv6_hdr(skb)->daddr, hao ? &hao->addr : &ipv6_hdr(skb)->saddr, opt->iif)) goto out; memset(&sel, 0, sizeof(sel)); memcpy(&sel.daddr, (xfrm_address_t *)&ipv6_hdr(skb)->daddr, sizeof(sel.daddr)); sel.prefixlen_d = 128; memcpy(&sel.saddr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, sizeof(sel.saddr)); sel.prefixlen_s = 128; sel.family = AF_INET6; sel.proto = fl6->flowi6_proto; sel.dport = xfrm_flowi_dport(fl, &fl6->uli); if (sel.dport) sel.dport_mask = htons(~0); sel.sport = xfrm_flowi_sport(fl, &fl6->uli); if (sel.sport) sel.sport_mask = htons(~0); sel.ifindex = fl6->flowi6_oif; err = km_report(net, IPPROTO_DSTOPTS, &sel, (hao ? (xfrm_address_t *)&hao->addr : NULL)); out: return err; } static int mip6_destopt_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->id.spi) { NL_SET_ERR_MSG(extack, "SPI must be 0"); return -EINVAL; } if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION"); return -EINVAL; } x->props.header_len = sizeof(struct ipv6_destopt_hdr) + calc_padlen(sizeof(struct ipv6_destopt_hdr), 6) + sizeof(struct ipv6_destopt_hao); WARN_ON(x->props.header_len != 24); return 0; } /* * Do nothing about destroying since it has no specific operation for * destination options header unlike IPsec protocols. */ static void mip6_destopt_destroy(struct xfrm_state *x) { } static const struct xfrm_type mip6_destopt_type = { .owner = THIS_MODULE, .proto = IPPROTO_DSTOPTS, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR, .init_state = mip6_destopt_init_state, .destructor = mip6_destopt_destroy, .input = mip6_destopt_input, .output = mip6_destopt_output, .reject = mip6_destopt_reject, }; static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data; int err = rt2->rt_hdr.nexthdr; spin_lock(&x->lock); if (!ipv6_addr_equal(&iph->daddr, (struct in6_addr *)x->coaddr) && !ipv6_addr_any((struct in6_addr *)x->coaddr)) err = -ENOENT; spin_unlock(&x->lock); return err; } /* Routing Header type 2 is inserted. * IP Header's dst address is replaced with Routing Header's Home Address. */ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *iph; struct rt2_hdr *rt2; u8 nexthdr; skb_push(skb, -skb_network_offset(skb)); iph = ipv6_hdr(skb); nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_ROUTING; rt2 = (struct rt2_hdr *)skb_transport_header(skb); rt2->rt_hdr.nexthdr = nexthdr; rt2->rt_hdr.hdrlen = (x->props.header_len >> 3) - 1; rt2->rt_hdr.type = IPV6_SRCRT_TYPE_2; rt2->rt_hdr.segments_left = 1; memset(&rt2->reserved, 0, sizeof(rt2->reserved)); WARN_ON(rt2->rt_hdr.hdrlen != 2); memcpy(&rt2->addr, &iph->daddr, sizeof(rt2->addr)); spin_lock_bh(&x->lock); memcpy(&iph->daddr, x->coaddr, sizeof(iph->daddr)); spin_unlock_bh(&x->lock); return 0; } static int mip6_rthdr_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->id.spi) { NL_SET_ERR_MSG(extack, "SPI must be 0"); return -EINVAL; } if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION"); return -EINVAL; } x->props.header_len = sizeof(struct rt2_hdr); return 0; } /* * Do nothing about destroying since it has no specific operation for routing * header type 2 unlike IPsec protocols. */ static void mip6_rthdr_destroy(struct xfrm_state *x) { } static const struct xfrm_type mip6_rthdr_type = { .owner = THIS_MODULE, .proto = IPPROTO_ROUTING, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR, .init_state = mip6_rthdr_init_state, .destructor = mip6_rthdr_destroy, .input = mip6_rthdr_input, .output = mip6_rthdr_output, }; static int __init mip6_init(void) { pr_info("Mobile IPv6\n"); if (xfrm_register_type(&mip6_destopt_type, AF_INET6) < 0) { pr_info("%s: can't add xfrm type(destopt)\n", __func__); goto mip6_destopt_xfrm_fail; } if (xfrm_register_type(&mip6_rthdr_type, AF_INET6) < 0) { pr_info("%s: can't add xfrm type(rthdr)\n", __func__); goto mip6_rthdr_xfrm_fail; } if (rawv6_mh_filter_register(mip6_mh_filter) < 0) { pr_info("%s: can't add rawv6 mh filter\n", __func__); goto mip6_rawv6_mh_fail; } return 0; mip6_rawv6_mh_fail: xfrm_unregister_type(&mip6_rthdr_type, AF_INET6); mip6_rthdr_xfrm_fail: xfrm_unregister_type(&mip6_destopt_type, AF_INET6); mip6_destopt_xfrm_fail: return -EAGAIN; } static void __exit mip6_fini(void) { if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0) pr_info("%s: can't remove rawv6 mh filter\n", __func__); xfrm_unregister_type(&mip6_rthdr_type, AF_INET6); xfrm_unregister_type(&mip6_destopt_type, AF_INET6); } module_init(mip6_init); module_exit(mip6_fini); MODULE_DESCRIPTION("IPv6 Mobility driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_DSTOPTS); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ROUTING); |
18 17 3 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 | /* SPDX-License-Identifier: GPL-2.0-only */ /* L2TP internal definitions. * * Copyright (c) 2008,2009 Katalix Systems Ltd */ #include <linux/refcount.h> #ifndef _L2TP_CORE_H_ #define _L2TP_CORE_H_ #include <net/dst.h> #include <net/sock.h> #ifdef CONFIG_XFRM #include <net/xfrm.h> #endif /* Random numbers used for internal consistency checks of tunnel and session structures */ #define L2TP_SESSION_MAGIC 0x0C04EB7D struct sk_buff; struct l2tp_stats { atomic_long_t tx_packets; atomic_long_t tx_bytes; atomic_long_t tx_errors; atomic_long_t rx_packets; atomic_long_t rx_bytes; atomic_long_t rx_seq_discards; atomic_long_t rx_oos_packets; atomic_long_t rx_errors; atomic_long_t rx_cookie_discards; atomic_long_t rx_invalid; }; struct l2tp_tunnel; /* L2TP session configuration */ struct l2tp_session_cfg { enum l2tp_pwtype pw_type; unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */ unsigned int send_seq:1; /* send packets with sequence numbers? */ unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ int cookie_len; /* 0, 4 or 8 bytes */ u8 peer_cookie[8]; /* peer's cookie */ int peer_cookie_len; /* 0, 4 or 8 bytes */ int reorder_timeout; /* configured reorder timeout (in jiffies) */ char *ifname; }; struct l2tp_session_coll_list { spinlock_t lock; /* for access to list */ struct list_head list; refcount_t ref_count; }; /* Represents a session (pseudowire) instance. * Tracks runtime state including cookies, dataplane packet sequencing, and IO statistics. * Is linked into a per-tunnel session list and a per-net ("global") IDR tree. */ #define L2TP_SESSION_NAME_MAX 32 struct l2tp_session { int magic; /* should be L2TP_SESSION_MAGIC */ long dead; struct rcu_head rcu; struct l2tp_tunnel *tunnel; /* back pointer to tunnel context */ u32 session_id; u32 peer_session_id; u8 cookie[8]; int cookie_len; u8 peer_cookie[8]; int peer_cookie_len; u16 l2specific_type; u16 hdr_len; u32 nr; /* session NR state (receive) */ u32 ns; /* session NR state (send) */ struct sk_buff_head reorder_q; /* receive reorder queue */ u32 nr_max; /* max NR. Depends on tunnel */ u32 nr_window_size; /* NR window size */ u32 nr_oos; /* NR of last OOS packet */ int nr_oos_count; /* for OOS recovery */ int nr_oos_count_max; struct list_head list; /* per-tunnel list node */ refcount_t ref_count; struct hlist_node hlist; /* per-net session hlist */ unsigned long hlist_key; /* key for session hlist */ struct l2tp_session_coll_list *coll_list; /* session collision list */ struct list_head clist; /* for coll_list */ char name[L2TP_SESSION_NAME_MAX]; /* for logging */ char ifname[IFNAMSIZ]; unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */ unsigned int send_seq:1; /* send packets with sequence numbers? */ unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ int reorder_timeout; /* configured reorder timeout (in jiffies) */ int reorder_skip; /* set if skip to next nr */ enum l2tp_pwtype pwtype; struct l2tp_stats stats; struct work_struct del_work; /* Session receive handler for data packets. * Each pseudowire implementation should implement this callback in order to * handle incoming packets. Packets are passed to the pseudowire handler after * reordering, if data sequence numbers are enabled for the session. */ void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len); /* Session close handler. * Each pseudowire implementation may implement this callback in order to carry * out pseudowire-specific shutdown actions. * The callback is called by core after unlisting the session and purging its * reorder queue. */ void (*session_close)(struct l2tp_session *session); /* Session show handler. * Pseudowire-specific implementation of debugfs session rendering. * The callback is called by l2tp_debugfs.c after rendering core session * information. */ void (*show)(struct seq_file *m, void *priv); u8 priv[]; /* private data */ }; /* L2TP tunnel configuration */ struct l2tp_tunnel_cfg { enum l2tp_encap_type encap; /* Used only for kernel-created sockets */ struct in_addr local_ip; struct in_addr peer_ip; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr *local_ip6; struct in6_addr *peer_ip6; #endif u16 local_udp_port; u16 peer_udp_port; unsigned int use_udp_checksums:1, udp6_zero_tx_checksums:1, udp6_zero_rx_checksums:1; }; /* Represents a tunnel instance. * Tracks runtime state including IO statistics. * Holds the tunnel socket (either passed from userspace or directly created by the kernel). * Maintains a list of sessions belonging to the tunnel instance. * Is linked into a per-net list of tunnels. */ #define L2TP_TUNNEL_NAME_MAX 20 struct l2tp_tunnel { unsigned long dead; struct rcu_head rcu; spinlock_t list_lock; /* write-protection for session_list */ bool acpt_newsess; /* indicates whether this tunnel accepts * new sessions. Protected by list_lock. */ struct list_head session_list; /* list of sessions */ u32 tunnel_id; u32 peer_tunnel_id; int version; /* 2=>L2TPv2, 3=>L2TPv3 */ char name[L2TP_TUNNEL_NAME_MAX]; /* for logging */ enum l2tp_encap_type encap; struct l2tp_stats stats; struct net *l2tp_net; /* the net we belong to */ refcount_t ref_count; struct sock *sock; /* parent socket */ int fd; /* parent fd, if tunnel socket was created * by userspace */ struct work_struct del_work; }; /* Pseudowire ops callbacks for use with the l2tp genetlink interface */ struct l2tp_nl_cmd_ops { /* The pseudowire session create callback is responsible for creating a session * instance for a specific pseudowire type. * It must call l2tp_session_create and l2tp_session_register to register the * session instance, as well as carry out any pseudowire-specific initialisation. * It must return >= 0 on success, or an appropriate negative errno value on failure. */ int (*session_create)(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); /* The pseudowire session delete callback is responsible for initiating the deletion * of a session instance. * It must call l2tp_session_delete, as well as carry out any pseudowire-specific * teardown actions. */ void (*session_delete)(struct l2tp_session *session); }; static inline void *l2tp_session_priv(struct l2tp_session *session) { return &session->priv[0]; } /* Tunnel and session refcounts */ void l2tp_tunnel_put(struct l2tp_tunnel *tunnel); void l2tp_session_put(struct l2tp_session *session); /* Tunnel and session lookup. * These functions take a reference on the instances they return, so * the caller must ensure that the reference is dropped appropriately. */ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); struct l2tp_tunnel *l2tp_tunnel_get_next(const struct net *net, unsigned long *key); struct l2tp_session *l2tp_v3_session_get(const struct net *net, struct sock *sk, u32 session_id); struct l2tp_session *l2tp_v2_session_get(const struct net *net, u16 tunnel_id, u16 session_id); struct l2tp_session *l2tp_session_get(const struct net *net, struct sock *sk, int pver, u32 tunnel_id, u32 session_id); struct l2tp_session *l2tp_session_get_next(const struct net *net, struct sock *sk, int pver, u32 tunnel_id, unsigned long *key); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname); /* Tunnel and session lifetime management. * Creation of a new instance is a two-step process: create, then register. * Destruction is triggered using the *_delete functions, and completes asynchronously. */ int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg); void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg); int l2tp_session_register(struct l2tp_session *session, struct l2tp_tunnel *tunnel); void l2tp_session_delete(struct l2tp_session *session); /* Receive path helpers. If data sequencing is enabled for the session these * functions handle queuing and reordering prior to passing packets to the * pseudowire code to be passed to userspace. */ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length); int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); /* Transmit path helpers for sending packets over the tunnel socket. */ void l2tp_session_set_header_len(struct l2tp_session *session, int version, enum l2tp_encap_type encap); int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb); /* Pseudowire management. * Pseudowires should register with l2tp core on module init, and unregister * on module exit. */ int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops); void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); /* IOCTL helper for IP encap modules. */ int l2tp_ioctl(struct sock *sk, int cmd, int *karg); struct l2tp_tunnel *l2tp_sk_to_tunnel(const struct sock *sk); static inline int l2tp_get_l2specific_len(struct l2tp_session *session) { switch (session->l2specific_type) { case L2TP_L2SPECTYPE_DEFAULT: return 4; case L2TP_L2SPECTYPE_NONE: default: return 0; } } static inline u32 l2tp_tunnel_dst_mtu(const struct l2tp_tunnel *tunnel) { struct dst_entry *dst; u32 mtu; dst = sk_dst_get(tunnel->sock); if (!dst) return 0; mtu = dst_mtu(dst); dst_release(dst); return mtu; } #ifdef CONFIG_XFRM static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel) { struct sock *sk = tunnel->sock; return sk && (rcu_access_pointer(sk->sk_policy[0]) || rcu_access_pointer(sk->sk_policy[1])); } #else static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel) { return false; } #endif static inline int l2tp_v3_ensure_opt_in_linear(struct l2tp_session *session, struct sk_buff *skb, unsigned char **ptr, unsigned char **optr) { int opt_len = session->peer_cookie_len + l2tp_get_l2specific_len(session); if (opt_len > 0) { int off = *ptr - *optr; if (!pskb_may_pull(skb, off + opt_len)) return -1; if (skb->data != *optr) { *optr = skb->data; *ptr = skb->data + off; } } return 0; } #define MODULE_ALIAS_L2TP_PWTYPE(type) \ MODULE_ALIAS("net-l2tp-type-" __stringify(type)) #endif /* _L2TP_CORE_H_ */ |
9 9 9 11 9 11 11 9 11 9 9 8 1 9 9 9 9 9 9 9 9 6 9 5 8 32 31 30 15 17 17 182 179 179 4 178 179 171 13 11 23 23 3 166 163 166 43 45 41 7 44 39 5 3 15 41 3 3 151 3 168 168 166 166 17 2 15 7 7 7 5 7 7 7 7 7 7 6 6 1 3 4 4 4 3 4 3 55 55 55 4 55 13 163 155 163 13 13 13 13 1 1 13 13 13 13 13 9 9 1 9 9 9 1 1 1 5 5 16 16 13 13 9 5 151 151 151 151 151 9 9 9 9 9 9 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dccp/feat.c * * Feature negotiation for the DCCP protocol (RFC 4340, section 6) * * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> * Rewrote from scratch, some bits from earlier code by * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> * * ASSUMPTIONS * ----------- * o Feature negotiation is coordinated with connection setup (as in TCP), wild * changes of parameters of an established connection are not supported. * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN. * o All currently known SP features have 1-byte quantities. If in the future * extensions of RFCs 4340..42 define features with item lengths larger than * one byte, a feature-specific extension of the code will be required. */ #include <linux/module.h> #include <linux/slab.h> #include "ccid.h" #include "feat.h" /* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ unsigned long sysctl_dccp_sequence_window __read_mostly = 100; int sysctl_dccp_rx_ccid __read_mostly = 2, sysctl_dccp_tx_ccid __read_mostly = 2; /* * Feature activation handlers. * * These all use an u64 argument, to provide enough room for NN/SP features. At * this stage the negotiated values have been checked to be within their range. */ static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) { struct dccp_sock *dp = dccp_sk(sk); struct ccid *new_ccid = ccid_new(ccid, sk, rx); if (new_ccid == NULL) return -ENOMEM; if (rx) { ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); dp->dccps_hc_rx_ccid = new_ccid; } else { ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); dp->dccps_hc_tx_ccid = new_ccid; } return 0; } static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) { struct dccp_sock *dp = dccp_sk(sk); if (rx) { dp->dccps_r_seq_win = seq_win; /* propagate changes to update SWL/SWH */ dccp_update_gsr(sk, dp->dccps_gsr); } else { dp->dccps_l_seq_win = seq_win; /* propagate changes to update AWL */ dccp_update_gss(sk, dp->dccps_gss); } return 0; } static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) { if (rx) dccp_sk(sk)->dccps_r_ack_ratio = ratio; else dccp_sk(sk)->dccps_l_ack_ratio = ratio; return 0; } static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) { struct dccp_sock *dp = dccp_sk(sk); if (rx) { if (enable && dp->dccps_hc_rx_ackvec == NULL) { dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); if (dp->dccps_hc_rx_ackvec == NULL) return -ENOMEM; } else if (!enable) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } } return 0; } static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) { if (!rx) dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); return 0; } /* * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that * `rx' holds when the sending peer informs about his partial coverage via a * ChangeR() option. In the other case, we are the sender and the receiver * announces its coverage via ChangeL() options. The policy here is to honour * such communication by enabling the corresponding partial coverage - but only * if it has not been set manually before; the warning here means that all * packets will be dropped. */ static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) { struct dccp_sock *dp = dccp_sk(sk); if (rx) dp->dccps_pcrlen = cscov; else { if (dp->dccps_pcslen == 0) dp->dccps_pcslen = cscov; else if (cscov > dp->dccps_pcslen) DCCP_WARN("CsCov %u too small, peer requires >= %u\n", dp->dccps_pcslen, (u8)cscov); } return 0; } static const struct { u8 feat_num; /* DCCPF_xxx */ enum dccp_feat_type rxtx; /* RX or TX */ enum dccp_feat_type reconciliation; /* SP or NN */ u8 default_value; /* as in 6.4 */ int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); /* * Lookup table for location and type of features (from RFC 4340/4342) * +--------------------------+----+-----+----+----+---------+-----------+ * | Feature | Location | Reconc. | Initial | Section | * | | RX | TX | SP | NN | Value | Reference | * +--------------------------+----+-----+----+----+---------+-----------+ * | DCCPF_CCID | | X | X | | 2 | 10 | * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | * +--------------------------+----+-----+----+----+---------+-----------+ */ } dccp_feat_table[] = { { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, }; #define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) /** * dccp_feat_index - Hash function to map feature number into array position * @feat_num: feature to hash, one of %dccp_feature_numbers * * Returns consecutive array index or -1 if the feature is not understood. */ static int dccp_feat_index(u8 feat_num) { /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) return feat_num - 1; /* * Other features: add cases for new feature types here after adding * them to the above table. */ switch (feat_num) { case DCCPF_SEND_LEV_RATE: return DCCP_FEAT_SUPPORTED_MAX - 1; } return -1; } static u8 dccp_feat_type(u8 feat_num) { int idx = dccp_feat_index(feat_num); if (idx < 0) return FEAT_UNKNOWN; return dccp_feat_table[idx].reconciliation; } static int dccp_feat_default_value(u8 feat_num) { int idx = dccp_feat_index(feat_num); /* * There are no default values for unknown features, so encountering a * negative index here indicates a serious problem somewhere else. */ DCCP_BUG_ON(idx < 0); return idx < 0 ? 0 : dccp_feat_table[idx].default_value; } /* * Debugging and verbose-printing section */ static const char *dccp_feat_fname(const u8 feat) { static const char *const feature_names[] = { [DCCPF_RESERVED] = "Reserved", [DCCPF_CCID] = "CCID", [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", [DCCPF_ECN_INCAPABLE] = "ECN Incapable", [DCCPF_ACK_RATIO] = "Ack Ratio", [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", }; if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) return feature_names[DCCPF_RESERVED]; if (feat == DCCPF_SEND_LEV_RATE) return "Send Loss Event Rate"; if (feat >= DCCPF_MIN_CCID_SPECIFIC) return "CCID-specific"; return feature_names[feat]; } static const char *const dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE", }; #ifdef CONFIG_IP_DCCP_DEBUG static const char *dccp_feat_oname(const u8 opt) { switch (opt) { case DCCPO_CHANGE_L: return "Change_L"; case DCCPO_CONFIRM_L: return "Confirm_L"; case DCCPO_CHANGE_R: return "Change_R"; case DCCPO_CONFIRM_R: return "Confirm_R"; } return NULL; } static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) { u8 i, type = dccp_feat_type(feat_num); if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) dccp_pr_debug_cat("(NULL)"); else if (type == FEAT_SP) for (i = 0; i < val->sp.len; i++) dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); else if (type == FEAT_NN) dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); else dccp_pr_debug_cat("unknown type %u", type); } static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) { u8 type = dccp_feat_type(feat_num); dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; if (type == FEAT_NN) fval.nn = dccp_decode_value_var(list, len); dccp_feat_printval(feat_num, &fval); } static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) { dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", dccp_feat_fname(entry->feat_num)); dccp_feat_printval(entry->feat_num, &entry->val); dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], entry->needs_confirm ? "(Confirm pending)" : ""); } #define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ dccp_feat_printvals(feat, val, len); \ dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) #define dccp_feat_print_fnlist(fn_list) { \ const struct dccp_feat_entry *___entry; \ \ dccp_pr_debug("List Dump:\n"); \ list_for_each_entry(___entry, fn_list, node) \ dccp_feat_print_entry(___entry); \ } #else /* ! CONFIG_IP_DCCP_DEBUG */ #define dccp_feat_print_opt(opt, feat, val, len, mandatory) #define dccp_feat_print_fnlist(fn_list) #endif static int __dccp_feat_activate(struct sock *sk, const int idx, const bool is_local, dccp_feat_val const *fval) { bool rx; u64 val; if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) return -1; if (dccp_feat_table[idx].activation_hdlr == NULL) return 0; if (fval == NULL) { val = dccp_feat_table[idx].default_value; } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { if (fval->sp.vec == NULL) { /* * This can happen when an empty Confirm is sent * for an SP (i.e. known) feature. In this case * we would be using the default anyway. */ DCCP_CRIT("Feature #%d undefined: using default", idx); val = dccp_feat_table[idx].default_value; } else { val = fval->sp.vec[0]; } } else { val = fval->nn; } /* Location is RX if this is a local-RX or remote-TX feature */ rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", dccp_feat_fname(dccp_feat_table[idx].feat_num), fval ? "" : "default ", (unsigned long long)val); return dccp_feat_table[idx].activation_hdlr(sk, val, rx); } /** * dccp_feat_activate - Activate feature value on socket * @sk: fully connected DCCP socket (after handshake is complete) * @feat_num: feature to activate, one of %dccp_feature_numbers * @local: whether local (1) or remote (0) @feat_num is meant * @fval: the value (SP or NN) to activate, or NULL to use the default value * * For general use this function is preferable over __dccp_feat_activate(). */ static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, dccp_feat_val const *fval) { return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); } /* Test for "Req'd" feature (RFC 4340, 6.4) */ static inline int dccp_feat_must_be_understood(u8 feat_num) { return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || feat_num == DCCPF_SEQUENCE_WINDOW; } /* copy constructor, fval must not already contain allocated memory */ static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) { fval->sp.len = len; if (fval->sp.len > 0) { fval->sp.vec = kmemdup(val, len, gfp_any()); if (fval->sp.vec == NULL) { fval->sp.len = 0; return -ENOMEM; } } return 0; } static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) { if (unlikely(val == NULL)) return; if (dccp_feat_type(feat_num) == FEAT_SP) kfree(val->sp.vec); memset(val, 0, sizeof(*val)); } static struct dccp_feat_entry * dccp_feat_clone_entry(struct dccp_feat_entry const *original) { struct dccp_feat_entry *new; u8 type = dccp_feat_type(original->feat_num); if (type == FEAT_UNKNOWN) return NULL; new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); if (new == NULL) return NULL; if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, original->val.sp.vec, original->val.sp.len)) { kfree(new); return NULL; } return new; } static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) { if (entry != NULL) { dccp_feat_val_destructor(entry->feat_num, &entry->val); kfree(entry); } } /* * List management functions * * Feature negotiation lists rely on and maintain the following invariants: * - each feat_num in the list is known, i.e. we know its type and default value * - each feat_num/is_local combination is unique (old entries are overwritten) * - SP values are always freshly allocated * - list is sorted in increasing order of feature number (faster lookup) */ static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, u8 feat_num, bool is_local) { struct dccp_feat_entry *entry; list_for_each_entry(entry, fn_list, node) { if (entry->feat_num == feat_num && entry->is_local == is_local) return entry; else if (entry->feat_num > feat_num) break; } return NULL; } /** * dccp_feat_entry_new - Central list update routine (called by all others) * @head: list to add to * @feat: feature number * @local: whether the local (1) or remote feature with number @feat is meant * * This is the only constructor and serves to ensure the above invariants. */ static struct dccp_feat_entry * dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) { struct dccp_feat_entry *entry; list_for_each_entry(entry, head, node) if (entry->feat_num == feat && entry->is_local == local) { dccp_feat_val_destructor(entry->feat_num, &entry->val); return entry; } else if (entry->feat_num > feat) { head = &entry->node; break; } entry = kmalloc(sizeof(*entry), gfp_any()); if (entry != NULL) { entry->feat_num = feat; entry->is_local = local; list_add_tail(&entry->node, head); } return entry; } /** * dccp_feat_push_change - Add/overwrite a Change option in the list * @fn_list: feature-negotiation list to update * @feat: one of %dccp_feature_numbers * @local: whether local (1) or remote (0) @feat_num is meant * @mandatory: whether to use Mandatory feature negotiation options * @fval: pointer to NN/SP value to be inserted (will be copied) */ static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, u8 mandatory, dccp_feat_val *fval) { struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); if (new == NULL) return -ENOMEM; new->feat_num = feat; new->is_local = local; new->state = FEAT_INITIALISING; new->needs_confirm = false; new->empty_confirm = false; new->val = *fval; new->needs_mandatory = mandatory; return 0; } /** * dccp_feat_push_confirm - Add a Confirm entry to the FN list * @fn_list: feature-negotiation list to add to * @feat: one of %dccp_feature_numbers * @local: whether local (1) or remote (0) @feat_num is being confirmed * @fval: pointer to NN/SP value to be inserted or NULL * * Returns 0 on success, a Reset code for further processing otherwise. */ static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, dccp_feat_val *fval) { struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); if (new == NULL) return DCCP_RESET_CODE_TOO_BUSY; new->feat_num = feat; new->is_local = local; new->state = FEAT_STABLE; /* transition in 6.6.2 */ new->needs_confirm = true; new->empty_confirm = (fval == NULL); new->val.nn = 0; /* zeroes the whole structure */ if (!new->empty_confirm) new->val = *fval; new->needs_mandatory = false; return 0; } static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) { return dccp_feat_push_confirm(fn_list, feat, local, NULL); } static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) { list_del(&entry->node); dccp_feat_entry_destructor(entry); } void dccp_feat_list_purge(struct list_head *fn_list) { struct dccp_feat_entry *entry, *next; list_for_each_entry_safe(entry, next, fn_list, node) dccp_feat_entry_destructor(entry); INIT_LIST_HEAD(fn_list); } EXPORT_SYMBOL_GPL(dccp_feat_list_purge); /* generate @to as full clone of @from - @to must not contain any nodes */ int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) { struct dccp_feat_entry *entry, *new; INIT_LIST_HEAD(to); list_for_each_entry(entry, from, node) { new = dccp_feat_clone_entry(entry); if (new == NULL) goto cloning_failed; list_add_tail(&new->node, to); } return 0; cloning_failed: dccp_feat_list_purge(to); return -ENOMEM; } /** * dccp_feat_valid_nn_length - Enforce length constraints on NN options * @feat_num: feature to return length of, one of %dccp_feature_numbers * * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only, * incoming options are accepted as long as their values are valid. */ static u8 dccp_feat_valid_nn_length(u8 feat_num) { if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ return 2; if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ return 6; return 0; } static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) { switch (feat_num) { case DCCPF_ACK_RATIO: return val <= DCCPF_ACK_RATIO_MAX; case DCCPF_SEQUENCE_WINDOW: return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; } return 0; /* feature unknown - so we can't tell */ } /* check that SP values are within the ranges defined in RFC 4340 */ static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) { switch (feat_num) { case DCCPF_CCID: return val == DCCPC_CCID2 || val == DCCPC_CCID3; /* Type-check Boolean feature values: */ case DCCPF_SHORT_SEQNOS: case DCCPF_ECN_INCAPABLE: case DCCPF_SEND_ACK_VECTOR: case DCCPF_SEND_NDP_COUNT: case DCCPF_DATA_CHECKSUM: case DCCPF_SEND_LEV_RATE: return val < 2; case DCCPF_MIN_CSUM_COVER: return val < 16; } return 0; /* feature unknown */ } static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) { if (sp_list == NULL || sp_len < 1) return 0; while (sp_len--) if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) return 0; return 1; } /** * dccp_feat_insert_opts - Generate FN options from current list state * @skb: next sk_buff to be sent to the peer * @dp: for client during handshake and general negotiation * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) */ int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, struct sk_buff *skb) { struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; struct dccp_feat_entry *pos, *next; u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; bool rpt; /* put entries into @skb in the order they appear in the list */ list_for_each_entry_safe_reverse(pos, next, fn, node) { opt = dccp_feat_genopt(pos); type = dccp_feat_type(pos->feat_num); rpt = false; if (pos->empty_confirm) { len = 0; ptr = NULL; } else { if (type == FEAT_SP) { len = pos->val.sp.len; ptr = pos->val.sp.vec; rpt = pos->needs_confirm; } else if (type == FEAT_NN) { len = dccp_feat_valid_nn_length(pos->feat_num); ptr = nn_in_nbo; dccp_encode_value_var(pos->val.nn, ptr, len); } else { DCCP_BUG("unknown feature %u", pos->feat_num); return -1; } } dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) return -1; if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) return -1; if (skb->sk->sk_state == DCCP_OPEN && (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) { /* * Confirms don't get retransmitted (6.6.3) once the * connection is in state OPEN */ dccp_feat_list_pop(pos); } else { /* * Enter CHANGING after transmitting the Change * option (6.6.2). */ if (pos->state == FEAT_INITIALISING) pos->state = FEAT_CHANGING; } } return 0; } /** * __feat_register_nn - Register new NN value on socket * @fn: feature-negotiation list to register with * @feat: an NN feature from %dccp_feature_numbers * @mandatory: use Mandatory option if 1 * @nn_val: value to register (restricted to 4 bytes) * * Note that NN features are local by definition (RFC 4340, 6.3.2). */ static int __feat_register_nn(struct list_head *fn, u8 feat, u8 mandatory, u64 nn_val) { dccp_feat_val fval = { .nn = nn_val }; if (dccp_feat_type(feat) != FEAT_NN || !dccp_feat_is_valid_nn_val(feat, nn_val)) return -EINVAL; /* Don't bother with default values, they will be activated anyway. */ if (nn_val - (u64)dccp_feat_default_value(feat) == 0) return 0; return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); } /** * __feat_register_sp - Register new SP value/list on socket * @fn: feature-negotiation list to register with * @feat: an SP feature from %dccp_feature_numbers * @is_local: whether the local (1) or the remote (0) @feat is meant * @mandatory: use Mandatory option if 1 * @sp_val: SP value followed by optional preference list * @sp_len: length of @sp_val in bytes */ static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, u8 mandatory, u8 const *sp_val, u8 sp_len) { dccp_feat_val fval; if (dccp_feat_type(feat) != FEAT_SP || !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) return -EINVAL; /* Avoid negotiating alien CCIDs by only advertising supported ones */ if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) return -EOPNOTSUPP; if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) return -ENOMEM; if (dccp_feat_push_change(fn, feat, is_local, mandatory, &fval)) { kfree(fval.sp.vec); return -ENOMEM; } return 0; } /** * dccp_feat_register_sp - Register requests to change SP feature values * @sk: client or listening socket * @feat: one of %dccp_feature_numbers * @is_local: whether the local (1) or remote (0) @feat is meant * @list: array of preferred values, in descending order of preference * @len: length of @list in bytes */ int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len) { /* any changes must be registered before establishing the connection */ if (sk->sk_state != DCCP_CLOSED) return -EISCONN; if (dccp_feat_type(feat) != FEAT_SP) return -EINVAL; return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, 0, list, len); } /** * dccp_feat_nn_get - Query current/pending value of NN feature * @sk: DCCP socket of an established connection * @feat: NN feature number from %dccp_feature_numbers * * For a known NN feature, returns value currently being negotiated, or * current (confirmed) value if no negotiation is going on. */ u64 dccp_feat_nn_get(struct sock *sk, u8 feat) { if (dccp_feat_type(feat) == FEAT_NN) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_feat_entry *entry; entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1); if (entry != NULL) return entry->val.nn; switch (feat) { case DCCPF_ACK_RATIO: return dp->dccps_l_ack_ratio; case DCCPF_SEQUENCE_WINDOW: return dp->dccps_l_seq_win; } } DCCP_BUG("attempt to look up unsupported feature %u", feat); return 0; } EXPORT_SYMBOL_GPL(dccp_feat_nn_get); /** * dccp_feat_signal_nn_change - Update NN values for an established connection * @sk: DCCP socket of an established connection * @feat: NN feature number from %dccp_feature_numbers * @nn_val: the new value to use * * This function is used to communicate NN updates out-of-band. */ int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) { struct list_head *fn = &dccp_sk(sk)->dccps_featneg; dccp_feat_val fval = { .nn = nn_val }; struct dccp_feat_entry *entry; if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) return 0; if (dccp_feat_type(feat) != FEAT_NN || !dccp_feat_is_valid_nn_val(feat, nn_val)) return -EINVAL; if (nn_val == dccp_feat_nn_get(sk, feat)) return 0; /* already set or negotiation under way */ entry = dccp_feat_list_lookup(fn, feat, 1); if (entry != NULL) { dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n", (unsigned long long)entry->val.nn, (unsigned long long)nn_val); dccp_feat_list_pop(entry); } inet_csk_schedule_ack(sk); return dccp_feat_push_change(fn, feat, 1, 0, &fval); } EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); /* * Tracking features whose value depend on the choice of CCID * * This is designed with an extension in mind so that a list walk could be done * before activating any features. However, the existing framework was found to * work satisfactorily up until now, the automatic verification is left open. * When adding new CCIDs, add a corresponding dependency table here. */ static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) { static const struct ccid_dependency ccid2_dependencies[2][2] = { /* * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX * feature and Send Ack Vector is an RX feature, `is_local' * needs to be reversed. */ { /* Dependencies of the receiver-side (remote) CCID2 */ { .dependent_feat = DCCPF_SEND_ACK_VECTOR, .is_local = true, .is_mandatory = true, .val = 1 }, { 0, 0, 0, 0 } }, { /* Dependencies of the sender-side (local) CCID2 */ { .dependent_feat = DCCPF_SEND_ACK_VECTOR, .is_local = false, .is_mandatory = true, .val = 1 }, { 0, 0, 0, 0 } } }; static const struct ccid_dependency ccid3_dependencies[2][5] = { { /* * Dependencies of the receiver-side CCID3 */ { /* locally disable Ack Vectors */ .dependent_feat = DCCPF_SEND_ACK_VECTOR, .is_local = true, .is_mandatory = false, .val = 0 }, { /* see below why Send Loss Event Rate is on */ .dependent_feat = DCCPF_SEND_LEV_RATE, .is_local = true, .is_mandatory = true, .val = 1 }, { /* NDP Count is needed as per RFC 4342, 6.1.1 */ .dependent_feat = DCCPF_SEND_NDP_COUNT, .is_local = false, .is_mandatory = true, .val = 1 }, { 0, 0, 0, 0 }, }, { /* * CCID3 at the TX side: we request that the HC-receiver * will not send Ack Vectors (they will be ignored, so * Mandatory is not set); we enable Send Loss Event Rate * (Mandatory since the implementation does not support * the Loss Intervals option of RFC 4342, 8.6). * The last two options are for peer's information only. */ { .dependent_feat = DCCPF_SEND_ACK_VECTOR, .is_local = false, .is_mandatory = false, .val = 0 }, { .dependent_feat = DCCPF_SEND_LEV_RATE, .is_local = false, .is_mandatory = true, .val = 1 }, { /* this CCID does not support Ack Ratio */ .dependent_feat = DCCPF_ACK_RATIO, .is_local = true, .is_mandatory = false, .val = 0 }, { /* tell receiver we are sending NDP counts */ .dependent_feat = DCCPF_SEND_NDP_COUNT, .is_local = true, .is_mandatory = false, .val = 1 }, { 0, 0, 0, 0 } } }; switch (ccid) { case DCCPC_CCID2: return ccid2_dependencies[is_local]; case DCCPC_CCID3: return ccid3_dependencies[is_local]; default: return NULL; } } /** * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID * @fn: feature-negotiation list to update * @id: CCID number to track * @is_local: whether TX CCID (1) or RX CCID (0) is meant * * This function needs to be called after registering all other features. */ static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) { const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); int i, rc = (table == NULL); for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) rc = __feat_register_sp(fn, table[i].dependent_feat, table[i].is_local, table[i].is_mandatory, &table[i].val, 1); else rc = __feat_register_nn(fn, table[i].dependent_feat, table[i].is_mandatory, table[i].val); return rc; } /** * dccp_feat_finalise_settings - Finalise settings before starting negotiation * @dp: client or listening socket (settings will be inherited) * * This is called after all registrations (socket initialisation, sysctls, and * sockopt calls), and before sending the first packet containing Change options * (ie. client-Request or server-Response), to ensure internal consistency. */ int dccp_feat_finalise_settings(struct dccp_sock *dp) { struct list_head *fn = &dp->dccps_featneg; struct dccp_feat_entry *entry; int i = 2, ccids[2] = { -1, -1 }; /* * Propagating CCIDs: * 1) not useful to propagate CCID settings if this host advertises more * than one CCID: the choice of CCID may still change - if this is * the client, or if this is the server and the client sends * singleton CCID values. * 2) since is that propagate_ccid changes the list, we defer changing * the sorted list until after the traversal. */ list_for_each_entry(entry, fn, node) if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) ccids[entry->is_local] = entry->val.sp.vec[0]; while (i--) if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) return -1; dccp_feat_print_fnlist(fn); return 0; } /** * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features * @dreq: server socket to resolve * * It is the server which resolves the dependencies once the CCID has been * fully negotiated. If no CCID has been negotiated, it uses the default CCID. */ int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) { struct list_head *fn = &dreq->dreq_featneg; struct dccp_feat_entry *entry; u8 is_local, ccid; for (is_local = 0; is_local <= 1; is_local++) { entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); if (entry != NULL && !entry->empty_confirm) ccid = entry->val.sp.vec[0]; else ccid = dccp_feat_default_value(DCCPF_CCID); if (dccp_feat_propagate_ccid(fn, ccid, is_local)) return -1; } return 0; } /* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) { u8 c, s; for (s = 0; s < slen; s++) for (c = 0; c < clen; c++) if (servlist[s] == clilist[c]) return servlist[s]; return -1; } /** * dccp_feat_prefer - Move preferred entry to the start of array * @preferred_value: entry to move to start of array * @array: array of preferred entries * @array_len: size of the array * * Reorder the @array_len elements in @array so that @preferred_value comes * first. Returns >0 to indicate that @preferred_value does occur in @array. */ static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) { u8 i, does_occur = 0; if (array != NULL) { for (i = 0; i < array_len; i++) if (array[i] == preferred_value) { array[i] = array[0]; does_occur++; } if (does_occur) array[0] = preferred_value; } return does_occur; } /** * dccp_feat_reconcile - Reconcile SP preference lists * @fv: SP list to reconcile into * @arr: received SP preference list * @len: length of @arr in bytes * @is_server: whether this side is the server (and @fv is the server's list) * @reorder: whether to reorder the list in @fv after reconciling with @arr * When successful, > 0 is returned and the reconciled list is in @fval. * A value of 0 means that negotiation failed (no shared entry). */ static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, bool is_server, bool reorder) { int rc; if (!fv->sp.vec || !arr) { DCCP_CRIT("NULL feature value or array"); return 0; } if (is_server) rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); else rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); if (!reorder) return rc; if (rc < 0) return 0; /* * Reorder list: used for activating features and in dccp_insert_fn_opt. */ return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); } /** * dccp_feat_change_recv - Process incoming ChangeL/R options * @fn: feature-negotiation list to update * @is_mandatory: whether the Change was preceded by a Mandatory option * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R * @feat: one of %dccp_feature_numbers * @val: NN value or SP value/preference list * @len: length of @val in bytes * @server: whether this node is the server (1) or the client (0) */ static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, u8 feat, u8 *val, u8 len, const bool server) { u8 defval, type = dccp_feat_type(feat); const bool local = (opt == DCCPO_CHANGE_R); struct dccp_feat_entry *entry; dccp_feat_val fval; if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ goto unknown_feature_or_value; dccp_feat_print_opt(opt, feat, val, len, is_mandatory); /* * Negotiation of NN features: Change R is invalid, so there is no * simultaneous negotiation; hence we do not look up in the list. */ if (type == FEAT_NN) { if (local || len > sizeof(fval.nn)) goto unknown_feature_or_value; /* 6.3.2: "The feature remote MUST accept any valid value..." */ fval.nn = dccp_decode_value_var(val, len); if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) goto unknown_feature_or_value; return dccp_feat_push_confirm(fn, feat, local, &fval); } /* * Unidirectional/simultaneous negotiation of SP features (6.3.1) */ entry = dccp_feat_list_lookup(fn, feat, local); if (entry == NULL) { /* * No particular preferences have been registered. We deal with * this situation by assuming that all valid values are equally * acceptable, and apply the following checks: * - if the peer's list is a singleton, we accept a valid value; * - if we are the server, we first try to see if the peer (the * client) advertises the default value. If yes, we use it, * otherwise we accept the preferred value; * - else if we are the client, we use the first list element. */ if (dccp_feat_clone_sp_val(&fval, val, 1)) return DCCP_RESET_CODE_TOO_BUSY; if (len > 1 && server) { defval = dccp_feat_default_value(feat); if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) fval.sp.vec[0] = defval; } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { kfree(fval.sp.vec); goto unknown_feature_or_value; } /* Treat unsupported CCIDs like invalid values */ if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { kfree(fval.sp.vec); goto not_valid_or_not_known; } if (dccp_feat_push_confirm(fn, feat, local, &fval)) { kfree(fval.sp.vec); return DCCP_RESET_CODE_TOO_BUSY; } return 0; } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ return 0; } if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { entry->empty_confirm = false; } else if (is_mandatory) { return DCCP_RESET_CODE_MANDATORY_ERROR; } else if (entry->state == FEAT_INITIALISING) { /* * Failed simultaneous negotiation (server only): try to `save' * the connection by checking whether entry contains the default * value for @feat. If yes, send an empty Confirm to signal that * the received Change was not understood - which implies using * the default value. * If this also fails, we use Reset as the last resort. */ WARN_ON(!server); defval = dccp_feat_default_value(feat); if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) return DCCP_RESET_CODE_OPTION_ERROR; entry->empty_confirm = true; } entry->needs_confirm = true; entry->needs_mandatory = false; entry->state = FEAT_STABLE; return 0; unknown_feature_or_value: if (!is_mandatory) return dccp_push_empty_confirm(fn, feat, local); not_valid_or_not_known: return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR : DCCP_RESET_CODE_OPTION_ERROR; } /** * dccp_feat_confirm_recv - Process received Confirm options * @fn: feature-negotiation list to update * @is_mandatory: whether @opt was preceded by a Mandatory option * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R * @feat: one of %dccp_feature_numbers * @val: NN value or SP value/preference list * @len: length of @val in bytes * @server: whether this node is server (1) or client (0) */ static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, u8 feat, u8 *val, u8 len, const bool server) { u8 *plist, plen, type = dccp_feat_type(feat); const bool local = (opt == DCCPO_CONFIRM_R); struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); dccp_feat_print_opt(opt, feat, val, len, is_mandatory); if (entry == NULL) { /* nothing queued: ignore or handle error */ if (is_mandatory && type == FEAT_UNKNOWN) return DCCP_RESET_CODE_MANDATORY_ERROR; if (!local && type == FEAT_NN) /* 6.3.2 */ goto confirmation_failed; return 0; } if (entry->state != FEAT_CHANGING) /* 6.6.2 */ return 0; if (len == 0) { if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ goto confirmation_failed; /* * Empty Confirm during connection setup: this means reverting * to the `old' value, which in this case is the default. Since * we handle default values automatically when no other values * have been set, we revert to the old value by removing this * entry from the list. */ dccp_feat_list_pop(entry); return 0; } if (type == FEAT_NN) { if (len > sizeof(entry->val.nn)) goto confirmation_failed; if (entry->val.nn == dccp_decode_value_var(val, len)) goto confirmation_succeeded; DCCP_WARN("Bogus Confirm for non-existing value\n"); goto confirmation_failed; } /* * Parsing SP Confirms: the first element of @val is the preferred * SP value which the peer confirms, the remainder depends on @len. * Note that only the confirmed value need to be a valid SP value. */ if (!dccp_feat_is_valid_sp_val(feat, *val)) goto confirmation_failed; if (len == 1) { /* peer didn't supply a preference list */ plist = val; plen = len; } else { /* preferred value + preference list */ plist = val + 1; plen = len - 1; } /* Check whether the peer got the reconciliation right (6.6.8) */ if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { DCCP_WARN("Confirm selected the wrong value %u\n", *val); return DCCP_RESET_CODE_OPTION_ERROR; } entry->val.sp.vec[0] = *val; confirmation_succeeded: entry->state = FEAT_STABLE; return 0; confirmation_failed: DCCP_WARN("Confirmation failed\n"); return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR : DCCP_RESET_CODE_OPTION_ERROR; } /** * dccp_feat_handle_nn_established - Fast-path reception of NN options * @sk: socket of an established DCCP connection * @mandatory: whether @opt was preceded by a Mandatory option * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) * @feat: NN number, one of %dccp_feature_numbers * @val: NN value * @len: length of @val in bytes * * This function combines the functionality of change_recv/confirm_recv, with * the following differences (reset codes are the same): * - cleanup after receiving the Confirm; * - values are directly activated after successful parsing; * - deliberately restricted to NN features. * The restriction to NN features is essential since SP features can have non- * predictable outcomes (depending on the remote configuration), and are inter- * dependent (CCIDs for instance cause further dependencies). */ static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) { struct list_head *fn = &dccp_sk(sk)->dccps_featneg; const bool local = (opt == DCCPO_CONFIRM_R); struct dccp_feat_entry *entry; u8 type = dccp_feat_type(feat); dccp_feat_val fval; dccp_feat_print_opt(opt, feat, val, len, mandatory); /* Ignore non-mandatory unknown and non-NN features */ if (type == FEAT_UNKNOWN) { if (local && !mandatory) return 0; goto fast_path_unknown; } else if (type != FEAT_NN) { return 0; } /* * We don't accept empty Confirms, since in fast-path feature * negotiation the values are enabled immediately after sending * the Change option. * Empty Changes on the other hand are invalid (RFC 4340, 6.1). */ if (len == 0 || len > sizeof(fval.nn)) goto fast_path_unknown; if (opt == DCCPO_CHANGE_L) { fval.nn = dccp_decode_value_var(val, len); if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) goto fast_path_unknown; if (dccp_feat_push_confirm(fn, feat, local, &fval) || dccp_feat_activate(sk, feat, local, &fval)) return DCCP_RESET_CODE_TOO_BUSY; /* set the `Ack Pending' flag to piggyback a Confirm */ inet_csk_schedule_ack(sk); } else if (opt == DCCPO_CONFIRM_R) { entry = dccp_feat_list_lookup(fn, feat, local); if (entry == NULL || entry->state != FEAT_CHANGING) return 0; fval.nn = dccp_decode_value_var(val, len); /* * Just ignore a value that doesn't match our current value. * If the option changes twice within two RTTs, then at least * one CONFIRM will be received for the old value after a * new CHANGE was sent. */ if (fval.nn != entry->val.nn) return 0; /* Only activate after receiving the Confirm option (6.6.1). */ dccp_feat_activate(sk, feat, local, &fval); /* It has been confirmed - so remove the entry */ dccp_feat_list_pop(entry); } else { DCCP_WARN("Received illegal option %u\n", opt); goto fast_path_failed; } return 0; fast_path_unknown: if (!mandatory) return dccp_push_empty_confirm(fn, feat, local); fast_path_failed: return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR : DCCP_RESET_CODE_OPTION_ERROR; } /** * dccp_feat_parse_options - Process Feature-Negotiation Options * @sk: for general use and used by the client during connection setup * @dreq: used by the server during connection setup * @mandatory: whether @opt was preceded by a Mandatory option * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R * @feat: one of %dccp_feature_numbers * @val: value contents of @opt * @len: length of @val in bytes * * Returns 0 on success, a Reset code for ending the connection otherwise. */ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) { struct dccp_sock *dp = dccp_sk(sk); struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; bool server = false; switch (sk->sk_state) { /* * Negotiation during connection setup */ case DCCP_LISTEN: server = true; fallthrough; case DCCP_REQUESTING: switch (opt) { case DCCPO_CHANGE_L: case DCCPO_CHANGE_R: return dccp_feat_change_recv(fn, mandatory, opt, feat, val, len, server); case DCCPO_CONFIRM_R: case DCCPO_CONFIRM_L: return dccp_feat_confirm_recv(fn, mandatory, opt, feat, val, len, server); } break; /* * Support for exchanging NN options on an established connection. */ case DCCP_OPEN: case DCCP_PARTOPEN: return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, val, len); } return 0; /* ignore FN options in all other states */ } /** * dccp_feat_init - Seed feature negotiation with host-specific defaults * @sk: Socket to initialize. * * This initialises global defaults, depending on the value of the sysctls. * These can later be overridden by registering changes via setsockopt calls. * The last link in the chain is finalise_settings, to make sure that between * here and the start of actual feature negotiation no inconsistencies enter. * * All features not appearing below use either defaults or are otherwise * later adjusted through dccp_feat_finalise_settings(). */ int dccp_feat_init(struct sock *sk) { struct list_head *fn = &dccp_sk(sk)->dccps_featneg; u8 on = 1, off = 0; int rc; struct { u8 *val; u8 len; } tx, rx; /* Non-negotiable (NN) features */ rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, sysctl_dccp_sequence_window); if (rc) return rc; /* Server-priority (SP) features */ /* Advertise that short seqnos are not supported (7.6.1) */ rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); if (rc) return rc; /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); if (rc) return rc; /* * We advertise the available list of CCIDs and reorder according to * preferences, to avoid failure resulting from negotiating different * singleton values (which always leads to failure). * These settings can still (later) be overridden via sockopts. */ if (ccid_get_builtin_ccids(&tx.val, &tx.len)) return -ENOBUFS; if (ccid_get_builtin_ccids(&rx.val, &rx.len)) { kfree(tx.val); return -ENOBUFS; } if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) goto free_ccid_lists; rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); if (rc) goto free_ccid_lists; rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); free_ccid_lists: kfree(tx.val); kfree(rx.val); return rc; } int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_feat_entry *cur, *next; int idx; dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } }; list_for_each_entry(cur, fn_list, node) { /* * An empty Confirm means that either an unknown feature type * or an invalid value was present. In the first case there is * nothing to activate, in the other the default value is used. */ if (cur->empty_confirm) continue; idx = dccp_feat_index(cur->feat_num); if (idx < 0) { DCCP_BUG("Unknown feature %u", cur->feat_num); goto activation_failed; } if (cur->state != FEAT_STABLE) { DCCP_CRIT("Negotiation of %s %s failed in state %s", cur->is_local ? "local" : "remote", dccp_feat_fname(cur->feat_num), dccp_feat_sname[cur->state]); goto activation_failed; } fvals[idx][cur->is_local] = &cur->val; } /* * Activate in decreasing order of index, so that the CCIDs are always * activated as the last feature. This avoids the case where a CCID * relies on the initialisation of one or more features that it depends * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). */ for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { DCCP_CRIT("Could not activate %d", idx); goto activation_failed; } /* Clean up Change options which have been confirmed already */ list_for_each_entry_safe(cur, next, fn_list, node) if (!cur->needs_confirm) dccp_feat_list_pop(cur); dccp_pr_debug("Activation OK\n"); return 0; activation_failed: /* * We clean up everything that may have been allocated, since * it is difficult to track at which stage negotiation failed. * This is ok, since all allocation functions below are robust * against NULL arguments. */ ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; return -1; } |
20918 2525 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H #include <asm/processor.h> #if defined(__KERNEL__) && !defined(__ASSEMBLY__) #include <asm/asm.h> #include <linux/bitops.h> #include <asm/alternative.h> enum cpuid_leafs { CPUID_1_EDX = 0, CPUID_8000_0001_EDX, CPUID_8086_0001_EDX, CPUID_LNX_1, CPUID_1_ECX, CPUID_C000_0001_EDX, CPUID_8000_0001_ECX, CPUID_LNX_2, CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, CPUID_LNX_4, CPUID_7_1_EAX, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, CPUID_LNX_5, NR_CPUID_WORDS, }; #define X86_CAP_FMT_NUM "%d:%d" #define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; #define X86_CAP_FMT "%s" #define x86_cap_flag(flag) x86_cap_flags[flag] /* * In order to save room, we index into this array by doing * X86_BUG_<name> - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) /* * There are 32 bits/features in each mask word. The high bits * (selected with (bit>>5) give us the word number and the low 5 * bits give us the bit/feature number inside the word. * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can * see if it is set in the mask word. */ #define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) /* * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all * header macros which use NCAPINTS need to be changed. The duplicated macro * use causes the compiler to issue errors for all headers so that all usage * sites can be corrected. */ #define REQUIRED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \ REQUIRED_MASK_CHECK || \ BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \ DISABLED_MASK_CHECK || \ BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) #define this_cpu_has(bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) /* * This is the default CPU features testing macro to use in code. * * It is for detection of features which need kernel infrastructure to be * used. It may *not* directly test the CPU itself. Use the cpu_has() family * if you want true runtime testing of CPU features, like in hypervisor code * where you are supporting a possible guest feature where host support for it * is not relevant. */ #define cpu_feature_enabled(bit) \ (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) extern void setup_clear_cpu_cap(unsigned int bit); extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #define setup_force_cpu_cap(bit) do { \ \ if (!boot_cpu_has(bit)) \ WARN_ON(alternatives_patched); \ \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) /* * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know * that this is only used on a fallback path and will sometimes cause * it to manifest the address of boot_cpu_data in a register, fouling * the mainline (post-initialization) code. */ static __always_inline bool _static_cpu_has(u16 bit) { asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" ".popsection\n" : : [feature] "i" (bit), [bitnum] "i" (1 << (bit & 7)), [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); t_yes: return true; t_no: return false; } #define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) #define cpu_have_feature boot_cpu_has #define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" #define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ boot_cpu_data.x86_model #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ #endif /* _ASM_X86_CPUFEATURE_H */ |
693 687 686 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 | /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_MAPLE_TREE_H #define _LINUX_MAPLE_TREE_H /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle * Authors: Liam R. Howlett <Liam.Howlett@Oracle.com> * Matthew Wilcox <willy@infradead.org> */ #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> /* #define CONFIG_MAPLE_RCU_DISABLED */ /* * Allocated nodes are mutable until they have been inserted into the tree, * at which time they cannot change their type until they have been removed * from the tree and an RCU grace period has passed. * * Removed nodes have their ->parent set to point to themselves. RCU readers * check ->parent before relying on the value that they loaded from the * slots array. This lets us reuse the slots array for the RCU head. * * Nodes in the tree point to their parent unless bit 0 is set. */ #if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) /* 64bit sizes */ #define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) #else /* 32bit sizes */ #define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) #endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ #define MAPLE_NODE_MASK 255UL /* * The node->parent of the root node has bit 0 set and the rest of the pointer * is a pointer to the tree itself. No more bits are available in this pointer * (on m68k, the data structure may only be 2-byte aligned). * * Internal non-root nodes can only have maple_range_* nodes as parents. The * parent pointer is 256B aligned like all other tree nodes. When storing a 32 * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an * extra bit to store the offset. This extra bit comes from a reuse of the last * bit in the node type. This is possible by using bit 1 to indicate if bit 2 * is part of the type or the slot. * * Once the type is decided, the decision of an allocation range type or a * range type is done by examining the immutable tree flag for the * MT_FLAGS_ALLOC_RANGE flag. * * Node types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location * 0x??1 : Root * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 * 0x010 : 32 bit values, type in 0-2, slot in 3-6 * 0x110 : 64 bit values, type in 0-2, slot in 3-6 */ /* * This metadata is used to optimize the gap updating code and in reverse * searching for gaps or any other code that needs to find the end of the data. */ struct maple_metadata { unsigned char end; unsigned char gap; }; /* * Leaf nodes do not store pointers to nodes, they store user data. Users may * store almost any bit pattern. As noted above, the optimisation of storing an * entry at 0 in the root pointer cannot be done for data which have the bottom * two bits set to '10'. We also reserve values with the bottom two bits set to * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs * return errnos as a negative errno shifted right by two bits and the bottom * two bits set to '10', and while choosing to store these values in the array * is not an error, it may lead to confusion if you're testing for an error with * mas_is_err(). * * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits * 3-6), bit 2 is reserved. That leaves bits 0-1 unused for now. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges, Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. */ struct maple_range_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; union { void __rcu *slot[MAPLE_RANGE64_SLOTS]; struct { void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; struct maple_metadata meta; }; }; }; /* * At tree creation time, the user can specify that they're willing to trade off * storing fewer entries in a tree in return for storing more information in * each node. * * The maple tree supports recording the largest range of NULL entries available * in this node, also called gaps. This optimises the tree for allocating a * range. */ struct maple_arange_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; void __rcu *slot[MAPLE_ARANGE64_SLOTS]; unsigned long gap[MAPLE_ARANGE64_SLOTS]; struct maple_metadata meta; }; struct maple_alloc { unsigned long total; unsigned char node_count; unsigned int request_count; struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; }; struct maple_topiary { struct maple_pnode *parent; struct maple_enode *next; /* Overlaps the pivot */ }; enum maple_type { maple_dense, maple_leaf_64, maple_range_64, maple_arange_64, }; enum store_type { wr_invalid, wr_new_root, wr_store_root, wr_exact_fit, wr_spanning_store, wr_split_store, wr_rebalance, wr_append, wr_node_store, wr_slot_store, }; /** * DOC: Maple tree flags * * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree * * MT_FLAGS_USE_RCU - Operate in RCU mode * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value * * MT_FLAGS_LOCK_MASK - How the mt_lock is used * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe * * MT_FLAGS_LOCK_BH - Acquired bh-safe * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used * * MAPLE_HEIGHT_MAX The largest height that can be stored */ #define MT_FLAGS_ALLOC_RANGE 0x01 #define MT_FLAGS_USE_RCU 0x02 #define MT_FLAGS_HEIGHT_OFFSET 0x02 #define MT_FLAGS_HEIGHT_MASK 0x7C #define MT_FLAGS_LOCK_MASK 0x300 #define MT_FLAGS_LOCK_IRQ 0x100 #define MT_FLAGS_LOCK_BH 0x200 #define MT_FLAGS_LOCK_EXTERN 0x300 #define MT_FLAGS_ALLOC_WRAPPED 0x0800 #define MAPLE_HEIGHT_MAX 31 #define MAPLE_NODE_TYPE_MASK 0x0F #define MAPLE_NODE_TYPE_SHIFT 0x03 #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) #define mt_write_lock_is_held(mt) \ (!(mt)->ma_external_lock || \ lock_is_held_type((mt)->ma_external_lock, 0)) #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) #define mt_on_stack(mt) do { } while (0) #endif /* * If the tree contains a single entry at index 0, it is usually stored in * tree->ma_root. To optimise for the page cache, an entry which ends in '00', * '01' or '11' is stored in the root, but an entry which ends in '10' will be * stored in a node. Bits 3-6 are used to store enum maple_type. * * The flags are used both to store some immutable information about this tree * (set at tree creation time) and dynamic information set under the spinlock. * * Another use of flags are to indicate global states of the tree. This is the * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in * RCU mode. This mode was added to allow the tree to reuse nodes instead of * re-allocating and RCU freeing nodes when there is a single user. */ struct maple_tree { union { spinlock_t ma_lock; lockdep_map_p ma_external_lock; }; unsigned int ma_flags; void __rcu *ma_root; }; /** * MTREE_INIT() - Initialize a maple tree * @name: The maple tree name * @__flags: The maple tree flags * */ #define MTREE_INIT(name, __flags) { \ .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \ .ma_flags = __flags, \ .ma_root = NULL, \ } /** * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. * @name: The tree name * @__flags: The maple tree flags * @__lock: The external lock */ #ifdef CONFIG_LOCKDEP #define MTREE_INIT_EXT(name, __flags, __lock) { \ .ma_external_lock = &(__lock).dep_map, \ .ma_flags = (__flags), \ .ma_root = NULL, \ } #else #define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags) #endif #define DEFINE_MTREE(name) \ struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) #define mtree_lock_nested(mas, subclass) \ spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* * The Maple Tree squeezes various bits in at various points which aren't * necessarily obvious. Usually, this is done by observing that pointers are * N-byte aligned and thus the bottom log_2(N) bits are available for use. We * don't use the high bits of pointers to store additional information because * we don't know what bits are unused on any given architecture. * * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 * low bits for our own purposes. Nodes are currently of 4 types: * 1. Single pointer (Range is 0-0) * 2. Non-leaf Allocation Range nodes * 3. Non-leaf Range nodes * 4. Leaf Range nodes All nodes consist of a number of node slots, * pivots, and a parent pointer. */ struct maple_node { union { struct { struct maple_pnode *parent; void __rcu *slot[MAPLE_NODE_SLOTS]; }; struct { void *pad; struct rcu_head rcu; struct maple_enode *piv_parent; unsigned char parent_slot; enum maple_type type; unsigned char slot_len; unsigned int ma_flags; }; struct maple_range_64 mr64; struct maple_arange_64 ma64; struct maple_alloc alloc; }; }; /* * More complicated stores can cause two nodes to become one or three and * potentially alter the height of the tree. Either half of the tree may need * to be rebalanced against the other. The ma_topiary struct is used to track * which nodes have been 'cut' from the tree so that the change can be done * safely at a later date. This is done to support RCU. */ struct ma_topiary { struct maple_enode *head; struct maple_enode *tail; struct maple_tree *mtree; }; void *mtree_load(struct maple_tree *mt, unsigned long index); int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_store_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); /** * mtree_empty() - Determine if a tree has any present entries. * @mt: Maple Tree. * * Context: Any context. * Return: %true if the tree contains only NULL pointers. */ static inline bool mtree_empty(const struct maple_tree *mt) { return mt->ma_root == NULL; } /* Advanced API */ /* * Maple State Status * ma_active means the maple state is pointing to a node and offset and can * continue operating on the tree. * ma_start means we have not searched the tree. * ma_root means we have searched the tree and the entry we found lives in * the root of the tree (ie it has index 0, length 1 and is the only entry in * the tree). * ma_none means we have searched the tree and there is no node in the * tree for this entry. For example, we searched for index 1 in an empty * tree. Or we have a tree which points to a full leaf node and we * searched for an entry which is larger than can be contained in that * leaf node. * ma_pause means the data within the maple state may be stale, restart the * operation * ma_overflow means the search has reached the upper limit of the search * ma_underflow means the search has reached the lower limit of the search * ma_error means there was an error, check the node for the error number. */ enum maple_status { ma_active, ma_start, ma_root, ma_none, ma_pause, ma_overflow, ma_underflow, ma_error, }; /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the * advanced API. * * If state->node has bit 0 set then it references a tree location which is not * a node (eg the root). If bit 1 is set, the rest of the bits are a negative * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the * node type. * * state->alloc either has a request number of nodes or an allocated node. If * stat->alloc has a requested number of nodes, the first bit will be set (0x1) * and the remaining bits are the value. If state->alloc is a node, then the * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for * storing more allocated nodes, a total number of nodes allocated, and the * node_count in this node. node_count is the number of allocated nodes in this * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc * by removing a node from the state->alloc node until state->alloc->node_count * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted * to state->alloc. Nodes are pushed onto state->alloc by putting the current * state->alloc into the pushed node's slot[0]. * * The state also contains the implied min/max of the state->node, the depth of * this search, and the offset. The implied min/max are either from the parent * node or are 0-oo for the root node. The depth is incremented or decremented * every time a node is walked down or up. The offset is the slot/pivot of * interest in the node - either for reading or writing. * * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. * * The status of the state is used to determine how the next action should treat * the state. For instance, if the status is ma_start then the next action * should start at the root of the tree and walk down. If the status is * ma_pause then the node may be stale data and should be discarded. If the * status is ma_overflow, then the last action hit the upper limit. * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ unsigned long index; /* The index we're operating on - range start */ unsigned long last; /* The last index we're operating on - range end */ struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct maple_alloc *alloc; /* Allocated nodes for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; unsigned char end; /* The end of the node */ enum store_type store_type; /* The type of store needed for this operation */ }; struct ma_wr_state { struct ma_state *mas; struct maple_node *node; /* Decoded mas->node */ unsigned long r_min; /* range min */ unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) #define mas_lock_nested(mas, subclass) \ spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) /* * Special values for ma_state.node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. */ #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) #define MA_STATE(name, mt, first, end) \ struct ma_state name = { \ .tree = mt, \ .index = first, \ .last = end, \ .node = NULL, \ .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ struct ma_wr_state name = { \ .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ } #define MA_TOPIARY(name, tree) \ struct ma_topiary name = { \ .head = NULL, \ .tail = NULL, \ .mtree = tree, \ } void *mas_walk(struct ma_state *mas); void *mas_store(struct ma_state *mas, void *entry); void *mas_erase(struct ma_state *mas); int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); void maple_tree_init(void); void mas_destroy(struct ma_state *mas); int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_prev_range(struct ma_state *mas, unsigned long max); void *mas_next(struct ma_state *mas, unsigned long max); void *mas_next_range(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, unsigned long addr) { memset(mas, 0, sizeof(struct ma_state)); mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; mas->status = ma_start; mas->node = NULL; } static inline bool mas_is_active(struct ma_state *mas) { return mas->status == ma_active; } static inline bool mas_is_err(struct ma_state *mas) { return mas->status == ma_error; } /** * mas_reset() - Reset a Maple Tree operation state. * @mas: Maple Tree operation state. * * Resets the error or walk state of the @mas so future walks of the * array will start from the root. Use this if you have dropped the * lock and want to reuse the ma_state. * * Context: Any context. */ static __always_inline void mas_reset(struct ma_state *mas) { mas->status = ma_start; mas->node = NULL; } /** * mas_for_each() - Iterate over a range of the maple tree. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__max: maximum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) /** * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__min: minimum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each_rev(__mas, __entry, __min) \ while (((__entry) = mas_find_rev((__mas), (__min))) != NULL) #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { mt_dump_dec, mt_dump_hex, }; extern atomic_t maple_tree_tests_run; extern atomic_t maple_tree_tests_passed; void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); void mas_dump(const struct ma_state *mas); void mas_wr_dump(const struct ma_wr_state *wr_mas); void mt_validate(struct maple_tree *mt); void mt_cache_shrink(void); #define MT_BUG_ON(__tree, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_BUG_ON(__mas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_WR_BUG_ON(__wrmas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MT_WARN_ON(__tree, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WARN_ON(__mas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WR_WARN_ON(__wrmas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #else #define MT_BUG_ON(__tree, __x) BUG_ON(__x) #define MAS_BUG_ON(__mas, __x) BUG_ON(__x) #define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) #define MT_WARN_ON(__tree, __x) WARN_ON(__x) #define MAS_WARN_ON(__mas, __x) WARN_ON(__x) #define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) #endif /* CONFIG_DEBUG_MAPLE_TREE */ /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * set the internal maple state values to a sub-range. * Please use mas_set_range() if you do not know where you are in the tree. */ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { /* Ensure the range starts within the current slot */ MAS_WARN_ON(mas, mas_is_active(mas) && (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } /** * mas_set_range() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * Move the operation state to refer to a different range. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { mas_reset(mas); __mas_set_range(mas, start, last); } /** * mas_set() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @index: New index into the Maple Tree. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set(struct ma_state *mas, unsigned long index) { mas_set_range(mas, index, index); } static inline bool mt_external_lock(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; } /** * mt_init_flags() - Initialise an empty maple tree with flags. * @mt: Maple Tree * @flags: maple tree flags. * * If you need to initialise a Maple Tree with special flags (eg, an * allocation tree), use this function. * * Context: Any context. */ static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) { mt->ma_flags = flags; if (!mt_external_lock(mt)) spin_lock_init(&mt->ma_lock); rcu_assign_pointer(mt->ma_root, NULL); } /** * mt_init() - Initialise an empty maple tree. * @mt: Maple Tree * * An empty Maple Tree. * * Context: Any context. */ static inline void mt_init(struct maple_tree *mt) { mt_init_flags(mt, 0); } static inline bool mt_in_rcu(struct maple_tree *mt) { #ifdef CONFIG_MAPLE_RCU_DISABLED return false; #endif return mt->ma_flags & MT_FLAGS_USE_RCU; } /** * mt_clear_in_rcu() - Switch the tree to non-RCU mode. * @mt: The Maple Tree */ static inline void mt_clear_in_rcu(struct maple_tree *mt) { if (!mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags &= ~MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags &= ~MT_FLAGS_USE_RCU; mtree_unlock(mt); } } /** * mt_set_in_rcu() - Switch the tree to RCU safe mode. * @mt: The Maple Tree */ static inline void mt_set_in_rcu(struct maple_tree *mt) { if (mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags |= MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags |= MT_FLAGS_USE_RCU; mtree_unlock(mt); } } static inline unsigned int mt_height(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); /** * mt_for_each - Iterate over each entry starting at index until max. * @__tree: The Maple Tree * @__entry: The current entry * @__index: The index to start the search from. Subsequently used as iterator. * @__max: The maximum limit for @index * * This iterator skips all entries, which resolve to a NULL pointer, * e.g. entries which has been reserved with XA_ZERO_ENTRY. */ #define mt_for_each(__tree, __entry, __index, __max) \ for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) #endif /*_LINUX_MAPLE_TREE_H */ |
15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 | /* SPDX-License-Identifier: GPL-2.0 */ /* * XDR standard data types and function declarations * * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> * * Based on: * RFC 4506 "XDR: External Data Representation Standard", May 2006 */ #ifndef _SUNRPC_XDR_H_ #define _SUNRPC_XDR_H_ #include <linux/uio.h> #include <asm/byteorder.h> #include <linux/unaligned.h> #include <linux/scatterlist.h> struct bio_vec; struct rpc_rqst; /* * Size of an XDR encoding unit in bytes, i.e. 32 bits, * as defined in Section 3 of RFC 4506. All encoded * XDR data items are aligned on a boundary of 32 bits. */ #define XDR_UNIT sizeof(__be32) /* * Buffer adjustment */ #define XDR_QUADLEN(l) (((l) + 3) >> 2) /* * Generic opaque `network object.' */ #define XDR_MAX_NETOBJ 1024 struct xdr_netobj { unsigned int len; u8 * data; }; /* * Basic structure for transmission/reception of a client XDR message. * Features a header (for a linear buffer containing RPC headers * and the data payload for short messages), and then an array of * pages. * The tail iovec allows you to append data after the page array. Its * main interest is for appending padding to the pages in order to * satisfy the int_32-alignment requirements in RFC1832. * * For the future, we might want to string several of these together * in a list if anybody wants to make use of NFSv4 COMPOUND * operations and/or has a need for scatter/gather involving pages. */ struct xdr_buf { struct kvec head[1], /* RPC header + non-page data */ tail[1]; /* Appended after page data */ struct bio_vec *bvec; struct page ** pages; /* Array of pages */ unsigned int page_base, /* Start of page data */ page_len, /* Length of page data */ flags; /* Flags for data disposition */ #define XDRBUF_READ 0x01 /* target of file read */ #define XDRBUF_WRITE 0x02 /* source of file write */ #define XDRBUF_SPARSE_PAGES 0x04 /* Page array is sparse */ unsigned int buflen, /* Total length of storage buffer */ len; /* Length of XDR encoded message */ }; static inline void xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) { buf->head[0].iov_base = start; buf->head[0].iov_len = len; buf->tail[0].iov_len = 0; buf->pages = NULL; buf->page_len = 0; buf->flags = 0; buf->len = 0; buf->buflen = len; } /* * pre-xdr'ed macros. */ #define xdr_zero cpu_to_be32(0) #define xdr_one cpu_to_be32(1) #define xdr_two cpu_to_be32(2) #define rpc_auth_null cpu_to_be32(RPC_AUTH_NULL) #define rpc_auth_unix cpu_to_be32(RPC_AUTH_UNIX) #define rpc_auth_short cpu_to_be32(RPC_AUTH_SHORT) #define rpc_auth_gss cpu_to_be32(RPC_AUTH_GSS) #define rpc_auth_tls cpu_to_be32(RPC_AUTH_TLS) #define rpc_call cpu_to_be32(RPC_CALL) #define rpc_reply cpu_to_be32(RPC_REPLY) #define rpc_msg_accepted cpu_to_be32(RPC_MSG_ACCEPTED) #define rpc_success cpu_to_be32(RPC_SUCCESS) #define rpc_prog_unavail cpu_to_be32(RPC_PROG_UNAVAIL) #define rpc_prog_mismatch cpu_to_be32(RPC_PROG_MISMATCH) #define rpc_proc_unavail cpu_to_be32(RPC_PROC_UNAVAIL) #define rpc_garbage_args cpu_to_be32(RPC_GARBAGE_ARGS) #define rpc_system_err cpu_to_be32(RPC_SYSTEM_ERR) #define rpc_drop_reply cpu_to_be32(RPC_DROP_REPLY) #define rpc_mismatch cpu_to_be32(RPC_MISMATCH) #define rpc_auth_error cpu_to_be32(RPC_AUTH_ERROR) #define rpc_auth_ok cpu_to_be32(RPC_AUTH_OK) #define rpc_autherr_badcred cpu_to_be32(RPC_AUTH_BADCRED) #define rpc_autherr_rejectedcred cpu_to_be32(RPC_AUTH_REJECTEDCRED) #define rpc_autherr_badverf cpu_to_be32(RPC_AUTH_BADVERF) #define rpc_autherr_rejectedverf cpu_to_be32(RPC_AUTH_REJECTEDVERF) #define rpc_autherr_tooweak cpu_to_be32(RPC_AUTH_TOOWEAK) #define rpcsec_gsserr_credproblem cpu_to_be32(RPCSEC_GSS_CREDPROBLEM) #define rpcsec_gsserr_ctxproblem cpu_to_be32(RPCSEC_GSS_CTXPROBLEM) /* * Miscellaneous XDR helper functions */ __be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int len); __be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int len); __be32 *xdr_encode_string(__be32 *p, const char *s); __be32 *xdr_decode_string_inplace(__be32 *p, char **sp, unsigned int *lenp, unsigned int maxlen); __be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *); __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); void xdr_inline_pages(struct xdr_buf *, unsigned int, struct page **, unsigned int, unsigned int); void xdr_terminate_string(const struct xdr_buf *, const u32); size_t xdr_buf_pagecount(const struct xdr_buf *buf); int xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp); void xdr_free_bvec(struct xdr_buf *buf); unsigned int xdr_buf_to_bvec(struct bio_vec *bvec, unsigned int bvec_size, const struct xdr_buf *xdr); static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int len) { return xdr_encode_opaque(p, s, len); } /* * Decode 64bit quantities (NFSv3 support) */ static inline __be32 * xdr_encode_hyper(__be32 *p, __u64 val) { put_unaligned_be64(val, p); return p + 2; } static inline __be32 * xdr_decode_hyper(__be32 *p, __u64 *valp) { *valp = get_unaligned_be64(p); return p + 2; } static inline __be32 * xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len) { memcpy(ptr, p, len); return p + XDR_QUADLEN(len); } static inline void xdr_netobj_dup(struct xdr_netobj *dst, struct xdr_netobj *src, gfp_t gfp_mask) { dst->data = kmemdup(src->data, src->len, gfp_mask); dst->len = src->len; } /* * Adjust kvec to reflect end of xdr'ed data (RPC client XDR) */ static inline int xdr_adjust_iovec(struct kvec *iov, __be32 *p) { return iov->iov_len = ((u8 *) p - (u8 *) iov->iov_base); } /* * XDR buffer helper functions */ extern void xdr_buf_from_iov(const struct kvec *, struct xdr_buf *); extern int xdr_buf_subsegment(const struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); extern void xdr_buf_trim(struct xdr_buf *, unsigned int); extern int read_bytes_from_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int); extern int write_bytes_to_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int); extern int xdr_encode_word(const struct xdr_buf *, unsigned int, u32); extern int xdr_decode_word(const struct xdr_buf *, unsigned int, u32 *); struct xdr_array2_desc; typedef int (*xdr_xcode_elem_t)(struct xdr_array2_desc *desc, void *elem); struct xdr_array2_desc { unsigned int elem_size; unsigned int array_len; unsigned int array_maxlen; xdr_xcode_elem_t xcode; }; extern int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); extern int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); extern void _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len); /* * Provide some simple tools for XDR buffer overflow-checking etc. */ struct xdr_stream { __be32 *p; /* start of available buffer */ struct xdr_buf *buf; /* XDR buffer to read/write */ __be32 *end; /* end of available buffer space */ struct kvec *iov; /* pointer to the current kvec */ struct kvec scratch; /* Scratch buffer */ struct page **page_ptr; /* pointer to the current page */ void *page_kaddr; /* kmapped address of the current page */ unsigned int nwords; /* Remaining decode buffer length */ struct rpc_rqst *rqst; /* For debugging */ }; /* * These are the xdr_stream style generic XDR encode and decode functions. */ typedef void (*kxdreproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr, const void *obj); typedef int (*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr, void *obj); extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst); extern void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, struct page **pages, struct rpc_rqst *rqst); extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); extern int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes); extern void __xdr_commit_encode(struct xdr_stream *xdr); extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len); extern void xdr_truncate_decode(struct xdr_stream *xdr, size_t len); extern int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen); extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern unsigned int xdr_stream_pos(const struct xdr_stream *xdr); extern unsigned int xdr_page_pos(const struct xdr_stream *xdr); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst); extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, struct page **pages, unsigned int len); extern void xdr_finish_decode(struct xdr_stream *xdr); extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); extern void xdr_set_pagelen(struct xdr_stream *, unsigned int len); extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, unsigned int len); extern unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset, unsigned int target, unsigned int length); extern unsigned int xdr_stream_zero(struct xdr_stream *xdr, unsigned int offset, unsigned int length); /** * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. * @xdr: pointer to xdr_stream struct * @buf: pointer to an empty buffer * @buflen: size of 'buf' * * The scratch buffer is used when decoding from an array of pages. * If an xdr_inline_decode() call spans across page boundaries, then * we copy the data into the scratch buffer in order to allow linear * access. */ static inline void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) { xdr->scratch.iov_base = buf; xdr->scratch.iov_len = buflen; } /** * xdr_set_scratch_page - Attach a scratch buffer for decoding data * @xdr: pointer to xdr_stream struct * @page: an anonymous page * * See xdr_set_scratch_buffer(). */ static inline void xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page) { xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE); } /** * xdr_reset_scratch_buffer - Clear scratch buffer information * @xdr: pointer to xdr_stream struct * * See xdr_set_scratch_buffer(). */ static inline void xdr_reset_scratch_buffer(struct xdr_stream *xdr) { xdr_set_scratch_buffer(xdr, NULL, 0); } /** * xdr_commit_encode - Ensure all data is written to xdr->buf * @xdr: pointer to xdr_stream * * Handle encoding across page boundaries by giving the caller a * temporary location to write to, then later copying the data into * place. __xdr_commit_encode() does that copying. */ static inline void xdr_commit_encode(struct xdr_stream *xdr) { if (unlikely(xdr->scratch.iov_len)) __xdr_commit_encode(xdr); } /** * xdr_stream_remaining - Return the number of bytes remaining in the stream * @xdr: pointer to struct xdr_stream * * Return value: * Number of bytes remaining in @xdr before xdr->end */ static inline size_t xdr_stream_remaining(const struct xdr_stream *xdr) { return xdr->nwords << 2; } ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size); ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, size_t maxlen, gfp_t gfp_flags); ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size); ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, size_t maxlen, gfp_t gfp_flags); ssize_t xdr_stream_decode_opaque_auth(struct xdr_stream *xdr, u32 *flavor, void **body, unsigned int *body_len); ssize_t xdr_stream_encode_opaque_auth(struct xdr_stream *xdr, u32 flavor, void *body, unsigned int body_len); /** * xdr_align_size - Calculate padded size of an object * @n: Size of an object being XDR encoded (in bytes) * * Return value: * Size (in bytes) of the object including xdr padding */ static inline size_t xdr_align_size(size_t n) { const size_t mask = XDR_UNIT - 1; return (n + mask) & ~mask; } /** * xdr_pad_size - Calculate size of an object's pad * @n: Size of an object being XDR encoded (in bytes) * * This implementation avoids the need for conditional * branches or modulo division. * * Return value: * Size (in bytes) of the needed XDR pad */ static inline size_t xdr_pad_size(size_t n) { return xdr_align_size(n) - n; } /** * xdr_stream_encode_item_present - Encode a "present" list item * @xdr: pointer to xdr_stream * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr) { const size_t len = XDR_UNIT; __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; *p = xdr_one; return len; } /** * xdr_stream_encode_item_absent - Encode a "not present" list item * @xdr: pointer to xdr_stream * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr) { const size_t len = XDR_UNIT; __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; *p = xdr_zero; return len; } /** * xdr_encode_bool - Encode a boolean item * @p: address in a buffer into which to encode * @n: boolean value to encode * * Return value: * Address of item following the encoded boolean */ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) { *p++ = n ? xdr_one : xdr_zero; return p; } /** * xdr_stream_encode_bool - Encode a boolean item * @xdr: pointer to xdr_stream * @n: boolean value to encode * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline int xdr_stream_encode_bool(struct xdr_stream *xdr, __u32 n) { const size_t len = XDR_UNIT; __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; xdr_encode_bool(p, n); return len; } /** * xdr_stream_encode_u32 - Encode a 32-bit integer * @xdr: pointer to xdr_stream * @n: integer to encode * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_u32(struct xdr_stream *xdr, __u32 n) { const size_t len = sizeof(n); __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; *p = cpu_to_be32(n); return len; } /** * xdr_stream_encode_be32 - Encode a big-endian 32-bit integer * @xdr: pointer to xdr_stream * @n: integer to encode * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_be32(struct xdr_stream *xdr, __be32 n) { const size_t len = sizeof(n); __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; *p = n; return len; } /** * xdr_stream_encode_u64 - Encode a 64-bit integer * @xdr: pointer to xdr_stream * @n: 64-bit integer to encode * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_u64(struct xdr_stream *xdr, __u64 n) { const size_t len = sizeof(n); __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; xdr_encode_hyper(p, n); return len; } /** * xdr_stream_encode_opaque_inline - Encode opaque xdr data * @xdr: pointer to xdr_stream * @ptr: pointer to void pointer * @len: size of object * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t len) { size_t count = sizeof(__u32) + xdr_align_size(len); __be32 *p = xdr_reserve_space(xdr, count); if (unlikely(!p)) { *ptr = NULL; return -EMSGSIZE; } xdr_encode_opaque(p, NULL, len); *ptr = ++p; return count; } /** * xdr_stream_encode_opaque_fixed - Encode fixed length opaque xdr data * @xdr: pointer to xdr_stream * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_opaque_fixed(struct xdr_stream *xdr, const void *ptr, size_t len) { __be32 *p = xdr_reserve_space(xdr, len); if (unlikely(!p)) return -EMSGSIZE; xdr_encode_opaque_fixed(p, ptr, len); return xdr_align_size(len); } /** * xdr_stream_encode_opaque - Encode variable length opaque xdr data * @xdr: pointer to xdr_stream * @ptr: pointer to opaque data object * @len: size of object pointed to by @ptr * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_opaque(struct xdr_stream *xdr, const void *ptr, size_t len) { size_t count = sizeof(__u32) + xdr_align_size(len); __be32 *p = xdr_reserve_space(xdr, count); if (unlikely(!p)) return -EMSGSIZE; xdr_encode_opaque(p, ptr, len); return count; } /** * xdr_stream_encode_uint32_array - Encode variable length array of integers * @xdr: pointer to xdr_stream * @array: array of integers * @array_size: number of elements in @array * * Return values: * On success, returns length in bytes of XDR buffer consumed * %-EMSGSIZE on XDR buffer overflow */ static inline ssize_t xdr_stream_encode_uint32_array(struct xdr_stream *xdr, const __u32 *array, size_t array_size) { ssize_t ret = (array_size+1) * sizeof(__u32); __be32 *p = xdr_reserve_space(xdr, ret); if (unlikely(!p)) return -EMSGSIZE; *p++ = cpu_to_be32(array_size); for (; array_size > 0; p++, array++, array_size--) *p = cpu_to_be32p(array); return ret; } /** * xdr_item_is_absent - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * * Return values: * %true if the following XDR item is absent * %false if the following XDR item is present */ static inline bool xdr_item_is_absent(const __be32 *p) { return *p == xdr_zero; } /** * xdr_item_is_present - symbolically handle XDR discriminators * @p: pointer to undecoded discriminator * * Return values: * %true if the following XDR item is present * %false if the following XDR item is absent */ static inline bool xdr_item_is_present(const __be32 *p) { return *p != xdr_zero; } /** * xdr_stream_decode_bool - Decode a boolean * @xdr: pointer to xdr_stream * @ptr: pointer to a u32 in which to store the result * * Return values: * %0 on success * %-EBADMSG on XDR buffer overflow */ static inline ssize_t xdr_stream_decode_bool(struct xdr_stream *xdr, __u32 *ptr) { const size_t count = sizeof(*ptr); __be32 *p = xdr_inline_decode(xdr, count); if (unlikely(!p)) return -EBADMSG; *ptr = (*p != xdr_zero); return 0; } /** * xdr_stream_decode_u32 - Decode a 32-bit integer * @xdr: pointer to xdr_stream * @ptr: location to store integer * * Return values: * %0 on success * %-EBADMSG on XDR buffer overflow */ static inline ssize_t xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr) { const size_t count = sizeof(*ptr); __be32 *p = xdr_inline_decode(xdr, count); if (unlikely(!p)) return -EBADMSG; *ptr = be32_to_cpup(p); return 0; } /** * xdr_stream_decode_be32 - Decode a big-endian 32-bit integer * @xdr: pointer to xdr_stream * @ptr: location to store integer * * Return values: * %0 on success * %-EBADMSG on XDR buffer overflow */ static inline ssize_t xdr_stream_decode_be32(struct xdr_stream *xdr, __be32 *ptr) { const size_t count = sizeof(*ptr); __be32 *p = xdr_inline_decode(xdr, count); if (unlikely(!p)) return -EBADMSG; *ptr = *p; return 0; } /** * xdr_stream_decode_u64 - Decode a 64-bit integer * @xdr: pointer to xdr_stream * @ptr: location to store 64-bit integer * * Return values: * %0 on success * %-EBADMSG on XDR buffer overflow */ static inline ssize_t xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr) { const size_t count = sizeof(*ptr); __be32 *p = xdr_inline_decode(xdr, count); if (unlikely(!p)) return -EBADMSG; xdr_decode_hyper(p, ptr); return 0; } /** * xdr_stream_decode_opaque_fixed - Decode fixed length opaque xdr data * @xdr: pointer to xdr_stream * @ptr: location to store data * @len: size of buffer pointed to by @ptr * * Return values: * On success, returns size of object stored in @ptr * %-EBADMSG on XDR buffer overflow */ static inline ssize_t xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len) { __be32 *p = xdr_inline_decode(xdr, len); if (unlikely(!p)) return -EBADMSG; xdr_decode_opaque_fixed(p, ptr, len); return len; } /** * xdr_stream_decode_opaque_inline - Decode variable length opaque xdr data * @xdr: pointer to xdr_stream * @ptr: location to store pointer to opaque data * @maxlen: maximum acceptable object size * * Note: the pointer stored in @ptr cannot be assumed valid after the XDR * buffer has been destroyed, or even after calling xdr_inline_decode() * on @xdr. It is therefore expected that the object it points to should * be processed immediately. * * Return values: * On success, returns size of object stored in *@ptr * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the object would exceed @maxlen */ static inline ssize_t xdr_stream_decode_opaque_inline(struct xdr_stream *xdr, void **ptr, size_t maxlen) { __be32 *p; __u32 len; *ptr = NULL; if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0)) return -EBADMSG; if (len != 0) { p = xdr_inline_decode(xdr, len); if (unlikely(!p)) return -EBADMSG; if (unlikely(len > maxlen)) return -EMSGSIZE; *ptr = p; } return len; } /** * xdr_stream_decode_uint32_array - Decode variable length array of integers * @xdr: pointer to xdr_stream * @array: location to store the integer array or NULL * @array_size: number of elements to store * * Return values: * On success, returns number of elements stored in @array * %-EBADMSG on XDR buffer overflow * %-EMSGSIZE if the size of the array exceeds @array_size */ static inline ssize_t xdr_stream_decode_uint32_array(struct xdr_stream *xdr, __u32 *array, size_t array_size) { __be32 *p; __u32 len; ssize_t retval; if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0)) return -EBADMSG; if (U32_MAX >= SIZE_MAX / sizeof(*p) && len > SIZE_MAX / sizeof(*p)) return -EBADMSG; p = xdr_inline_decode(xdr, len * sizeof(*p)); if (unlikely(!p)) return -EBADMSG; if (array == NULL) return len; if (len <= array_size) { if (len < array_size) memset(array+len, 0, (array_size-len)*sizeof(*array)); array_size = len; retval = len; } else retval = -EMSGSIZE; for (; array_size > 0; p++, array++, array_size--) *array = be32_to_cpup(p); return retval; } #endif /* _SUNRPC_XDR_H_ */ |
9 9 9 9 9 9 9 9 4 4 4 9 9 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 | // SPDX-License-Identifier: GPL-2.0-or-later /* Request a key from userspace * * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/security/keys/request-key.rst */ #include <linux/export.h> #include <linux/sched.h> #include <linux/kmod.h> #include <linux/err.h> #include <linux/keyctl.h> #include <linux/slab.h> #include <net/net_namespace.h> #include "internal.h" #include <keys/request_key_auth-type.h> #define key_negative_timeout 60 /* default timeout on a negative key's existence */ static struct key *check_cached_key(struct keyring_search_context *ctx) { #ifdef CONFIG_KEYS_REQUEST_CACHE struct key *key = current->cached_requested_key; if (key && ctx->match_data.cmp(key, &ctx->match_data) && !(key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED)))) return key_get(key); #endif return NULL; } static void cache_requested_key(struct key *key) { #ifdef CONFIG_KEYS_REQUEST_CACHE struct task_struct *t = current; /* Do not cache key if it is a kernel thread */ if (!(t->flags & PF_KTHREAD)) { key_put(t->cached_requested_key); t->cached_requested_key = key_get(key); set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } #endif } /** * complete_request_key - Complete the construction of a key. * @authkey: The authorisation key. * @error: The success or failute of the construction. * * Complete the attempt to construct a key. The key will be negated * if an error is indicated. The authorisation key will be revoked * unconditionally. */ void complete_request_key(struct key *authkey, int error) { struct request_key_auth *rka = get_request_key_auth(authkey); struct key *key = rka->target_key; kenter("%d{%d},%d", authkey->serial, key->serial, error); if (error < 0) key_negate_and_link(key, key_negative_timeout, NULL, authkey); else key_revoke(authkey); } EXPORT_SYMBOL(complete_request_key); /* * Initialise a usermode helper that is going to have a specific session * keyring. * * This is called in context of freshly forked kthread before kernel_execve(), * so we can simply install the desired session_keyring at this point. */ static int umh_keys_init(struct subprocess_info *info, struct cred *cred) { struct key *keyring = info->data; return install_session_keyring_to_cred(cred, keyring); } /* * Clean up a usermode helper with session keyring. */ static void umh_keys_cleanup(struct subprocess_info *info) { struct key *keyring = info->data; key_put(keyring); } /* * Call a usermode helper with a specific session keyring. */ static int call_usermodehelper_keys(const char *path, char **argv, char **envp, struct key *session_keyring, int wait) { struct subprocess_info *info; info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL, umh_keys_init, umh_keys_cleanup, session_keyring); if (!info) return -ENOMEM; key_get(session_keyring); return call_usermodehelper_exec(info, wait); } /* * Request userspace finish the construction of a key * - execute "/sbin/request-key <op> <key> <uid> <gid> <keyring> <keyring> <keyring>" */ static int call_sbin_request_key(struct key *authkey, void *aux) { static char const request_key[] = "/sbin/request-key"; struct request_key_auth *rka = get_request_key_auth(authkey); const struct cred *cred = current_cred(); key_serial_t prkey, sskey; struct key *key = rka->target_key, *keyring, *session, *user_session; char *argv[9], *envp[3], uid_str[12], gid_str[12]; char key_str[12], keyring_str[3][12]; char desc[20]; int ret, i; kenter("{%d},{%d},%s", key->serial, authkey->serial, rka->op); ret = look_up_user_keyrings(NULL, &user_session); if (ret < 0) goto error_us; /* allocate a new session keyring */ sprintf(desc, "_req.%u", key->serial); cred = get_current_cred(); keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); put_cred(cred); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error_alloc; } /* attach the auth key to the session keyring */ ret = key_link(keyring, authkey); if (ret < 0) goto error_link; /* record the UID and GID */ sprintf(uid_str, "%d", from_kuid(&init_user_ns, cred->fsuid)); sprintf(gid_str, "%d", from_kgid(&init_user_ns, cred->fsgid)); /* we say which key is under construction */ sprintf(key_str, "%d", key->serial); /* we specify the process's default keyrings */ sprintf(keyring_str[0], "%d", cred->thread_keyring ? cred->thread_keyring->serial : 0); prkey = 0; if (cred->process_keyring) prkey = cred->process_keyring->serial; sprintf(keyring_str[1], "%d", prkey); session = cred->session_keyring; if (!session) session = user_session; sskey = session->serial; sprintf(keyring_str[2], "%d", sskey); /* set up a minimal environment */ i = 0; envp[i++] = "HOME=/"; envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; /* set up the argument list */ i = 0; argv[i++] = (char *)request_key; argv[i++] = (char *)rka->op; argv[i++] = key_str; argv[i++] = uid_str; argv[i++] = gid_str; argv[i++] = keyring_str[0]; argv[i++] = keyring_str[1]; argv[i++] = keyring_str[2]; argv[i] = NULL; /* do it */ ret = call_usermodehelper_keys(request_key, argv, envp, keyring, UMH_WAIT_PROC); kdebug("usermode -> 0x%x", ret); if (ret >= 0) { /* ret is the exit/wait code */ if (test_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags) || key_validate(key) < 0) ret = -ENOKEY; else /* ignore any errors from userspace if the key was * instantiated */ ret = 0; } error_link: key_put(keyring); error_alloc: key_put(user_session); error_us: complete_request_key(authkey, ret); kleave(" = %d", ret); return ret; } /* * Call out to userspace for key construction. * * Program failure is ignored in favour of key status. */ static int construct_key(struct key *key, const void *callout_info, size_t callout_len, void *aux, struct key *dest_keyring) { request_key_actor_t actor; struct key *authkey; int ret; kenter("%d,%p,%zu,%p", key->serial, callout_info, callout_len, aux); /* allocate an authorisation key */ authkey = request_key_auth_new(key, "create", callout_info, callout_len, dest_keyring); if (IS_ERR(authkey)) return PTR_ERR(authkey); /* Make the call */ actor = call_sbin_request_key; if (key->type->request_key) actor = key->type->request_key; ret = actor(authkey, aux); /* check that the actor called complete_request_key() prior to * returning an error */ WARN_ON(ret < 0 && !test_bit(KEY_FLAG_INVALIDATED, &authkey->flags)); key_put(authkey); kleave(" = %d", ret); return ret; } /* * Get the appropriate destination keyring for the request. * * The keyring selected is returned with an extra reference upon it which the * caller must release. */ static int construct_get_dest_keyring(struct key **_dest_keyring) { struct request_key_auth *rka; const struct cred *cred = current_cred(); struct key *dest_keyring = *_dest_keyring, *authkey; int ret; kenter("%p", dest_keyring); /* find the appropriate keyring */ if (dest_keyring) { /* the caller supplied one */ key_get(dest_keyring); } else { bool do_perm_check = true; /* use a default keyring; falling through the cases until we * find one that we actually have */ switch (cred->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: if (cred->request_key_auth) { authkey = cred->request_key_auth; down_read(&authkey->sem); rka = get_request_key_auth(authkey); if (!test_bit(KEY_FLAG_REVOKED, &authkey->flags)) dest_keyring = key_get(rka->dest_keyring); up_read(&authkey->sem); if (dest_keyring) { do_perm_check = false; break; } } fallthrough; case KEY_REQKEY_DEFL_THREAD_KEYRING: dest_keyring = key_get(cred->thread_keyring); if (dest_keyring) break; fallthrough; case KEY_REQKEY_DEFL_PROCESS_KEYRING: dest_keyring = key_get(cred->process_keyring); if (dest_keyring) break; fallthrough; case KEY_REQKEY_DEFL_SESSION_KEYRING: dest_keyring = key_get(cred->session_keyring); if (dest_keyring) break; fallthrough; case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: ret = look_up_user_keyrings(NULL, &dest_keyring); if (ret < 0) return ret; break; case KEY_REQKEY_DEFL_USER_KEYRING: ret = look_up_user_keyrings(&dest_keyring, NULL); if (ret < 0) return ret; break; case KEY_REQKEY_DEFL_GROUP_KEYRING: default: BUG(); } /* * Require Write permission on the keyring. This is essential * because the default keyring may be the session keyring, and * joining a keyring only requires Search permission. * * However, this check is skipped for the "requestor keyring" so * that /sbin/request-key can itself use request_key() to add * keys to the original requestor's destination keyring. */ if (dest_keyring && do_perm_check) { ret = key_permission(make_key_ref(dest_keyring, 1), KEY_NEED_WRITE); if (ret) { key_put(dest_keyring); return ret; } } } *_dest_keyring = dest_keyring; kleave(" [dk %d]", key_serial(dest_keyring)); return 0; } /* * Allocate a new key in under-construction state and attempt to link it in to * the requested keyring. * * May return a key that's already under construction instead if there was a * race between two thread calling request_key(). */ static int construct_alloc_key(struct keyring_search_context *ctx, struct key *dest_keyring, unsigned long flags, struct key_user *user, struct key **_key) { struct assoc_array_edit *edit = NULL; struct key *key; key_perm_t perm; key_ref_t key_ref; int ret; kenter("%s,%s,,,", ctx->index_key.type->name, ctx->index_key.description); *_key = NULL; mutex_lock(&user->cons_lock); perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; perm |= KEY_USR_VIEW; if (ctx->index_key.type->read) perm |= KEY_POS_READ; if (ctx->index_key.type == &key_type_keyring || ctx->index_key.type->update) perm |= KEY_POS_WRITE; key = key_alloc(ctx->index_key.type, ctx->index_key.description, ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred, perm, flags, NULL); if (IS_ERR(key)) goto alloc_failed; set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags); if (dest_keyring) { ret = __key_link_lock(dest_keyring, &key->index_key); if (ret < 0) goto link_lock_failed; } /* * Attach the key to the destination keyring under lock, but we do need * to do another check just in case someone beat us to it whilst we * waited for locks. * * The caller might specify a comparison function which looks for keys * that do not exactly match but are still equivalent from the caller's * perspective. The __key_link_begin() operation must be done only after * an actual key is determined. */ mutex_lock(&key_construction_mutex); rcu_read_lock(); key_ref = search_process_keyrings_rcu(ctx); rcu_read_unlock(); if (!IS_ERR(key_ref)) goto key_already_present; if (dest_keyring) { ret = __key_link_begin(dest_keyring, &key->index_key, &edit); if (ret < 0) goto link_alloc_failed; __key_link(dest_keyring, key, &edit); } mutex_unlock(&key_construction_mutex); if (dest_keyring) __key_link_end(dest_keyring, &key->index_key, edit); mutex_unlock(&user->cons_lock); *_key = key; kleave(" = 0 [%d]", key_serial(key)); return 0; /* the key is now present - we tell the caller that we found it by * returning -EINPROGRESS */ key_already_present: key_put(key); mutex_unlock(&key_construction_mutex); key = key_ref_to_ptr(key_ref); if (dest_keyring) { ret = __key_link_begin(dest_keyring, &key->index_key, &edit); if (ret < 0) goto link_alloc_failed_unlocked; ret = __key_link_check_live_key(dest_keyring, key); if (ret == 0) __key_link(dest_keyring, key, &edit); __key_link_end(dest_keyring, &key->index_key, edit); if (ret < 0) goto link_check_failed; } mutex_unlock(&user->cons_lock); *_key = key; kleave(" = -EINPROGRESS [%d]", key_serial(key)); return -EINPROGRESS; link_check_failed: mutex_unlock(&user->cons_lock); key_put(key); kleave(" = %d [linkcheck]", ret); return ret; link_alloc_failed: mutex_unlock(&key_construction_mutex); link_alloc_failed_unlocked: __key_link_end(dest_keyring, &key->index_key, edit); link_lock_failed: mutex_unlock(&user->cons_lock); key_put(key); kleave(" = %d [prelink]", ret); return ret; alloc_failed: mutex_unlock(&user->cons_lock); kleave(" = %ld", PTR_ERR(key)); return PTR_ERR(key); } /* * Commence key construction. */ static struct key *construct_key_and_link(struct keyring_search_context *ctx, const char *callout_info, size_t callout_len, void *aux, struct key *dest_keyring, unsigned long flags) { struct key_user *user; struct key *key; int ret; kenter(""); if (ctx->index_key.type == &key_type_keyring) return ERR_PTR(-EPERM); ret = construct_get_dest_keyring(&dest_keyring); if (ret) goto error; user = key_user_lookup(current_fsuid()); if (!user) { ret = -ENOMEM; goto error_put_dest_keyring; } ret = construct_alloc_key(ctx, dest_keyring, flags, user, &key); key_user_put(user); if (ret == 0) { ret = construct_key(key, callout_info, callout_len, aux, dest_keyring); if (ret < 0) { kdebug("cons failed"); goto construction_failed; } } else if (ret == -EINPROGRESS) { ret = 0; } else { goto error_put_dest_keyring; } key_put(dest_keyring); kleave(" = key %d", key_serial(key)); return key; construction_failed: key_negate_and_link(key, key_negative_timeout, NULL, NULL); key_put(key); error_put_dest_keyring: key_put(dest_keyring); error: kleave(" = %d", ret); return ERR_PTR(ret); } /** * request_key_and_link - Request a key and cache it in a keyring. * @type: The type of key we want. * @description: The searchable description of the key. * @domain_tag: The domain in which the key operates. * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. * @dest_keyring: Where to cache the key. * @flags: Flags to key_alloc(). * * A key matching the specified criteria (type, description, domain_tag) is * searched for in the process's keyrings and returned with its usage count * incremented if found. Otherwise, if callout_info is not NULL, a key will be * allocated and some service (probably in userspace) will be asked to * instantiate it. * * If successfully found or created, the key will be linked to the destination * keyring if one is provided. * * Returns a pointer to the key if successful; -EACCES, -ENOKEY, -EKEYREVOKED * or -EKEYEXPIRED if an inaccessible, negative, revoked or expired key was * found; -ENOKEY if no key was found and no @callout_info was given; -EDQUOT * if insufficient key quota was available to create a new key; or -ENOMEM if * insufficient memory was available. * * If the returned key was created, then it may still be under construction, * and wait_for_key_construction() should be used to wait for that to complete. */ struct key *request_key_and_link(struct key_type *type, const char *description, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, void *aux, struct key *dest_keyring, unsigned long flags) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.domain_tag = domain_tag, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_DO_STATE_CHECK | KEYRING_SEARCH_SKIP_EXPIRED | KEYRING_SEARCH_RECURSE), }; struct key *key; key_ref_t key_ref; int ret; kenter("%s,%s,%p,%zu,%p,%p,%lx", ctx.index_key.type->name, ctx.index_key.description, callout_info, callout_len, aux, dest_keyring, flags); if (type->match_preparse) { ret = type->match_preparse(&ctx.match_data); if (ret < 0) { key = ERR_PTR(ret); goto error; } } key = check_cached_key(&ctx); if (key) goto error_free; /* search all the process keyrings for a key */ rcu_read_lock(); key_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); if (!IS_ERR(key_ref)) { if (dest_keyring) { ret = key_task_permission(key_ref, current_cred(), KEY_NEED_LINK); if (ret < 0) { key_ref_put(key_ref); key = ERR_PTR(ret); goto error_free; } } key = key_ref_to_ptr(key_ref); if (dest_keyring) { ret = key_link(dest_keyring, key); if (ret < 0) { key_put(key); key = ERR_PTR(ret); goto error_free; } } /* Only cache the key on immediate success */ cache_requested_key(key); } else if (PTR_ERR(key_ref) != -EAGAIN) { key = ERR_CAST(key_ref); } else { /* the search failed, but the keyrings were searchable, so we * should consult userspace if we can */ key = ERR_PTR(-ENOKEY); if (!callout_info) goto error_free; key = construct_key_and_link(&ctx, callout_info, callout_len, aux, dest_keyring, flags); } error_free: if (type->match_free) type->match_free(&ctx.match_data); error: kleave(" = %p", key); return key; } /** * wait_for_key_construction - Wait for construction of a key to complete * @key: The key being waited for. * @intr: Whether to wait interruptibly. * * Wait for a key to finish being constructed. * * Returns 0 if successful; -ERESTARTSYS if the wait was interrupted; -ENOKEY * if the key was negated; or -EKEYREVOKED or -EKEYEXPIRED if the key was * revoked or expired. */ int wait_for_key_construction(struct key *key, bool intr) { int ret; ret = wait_on_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT, intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (ret) return -ERESTARTSYS; ret = key_read_state(key); if (ret < 0) return ret; return key_validate(key); } EXPORT_SYMBOL(wait_for_key_construction); /** * request_key_tag - Request a key and wait for construction * @type: Type of key. * @description: The searchable description of the key. * @domain_tag: The domain in which the key operates. * @callout_info: The data to pass to the instantiation upcall (or NULL). * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found, new keys are always allocated in the user's quota, * the callout_info must be a NUL-terminated string and no auxiliary data can * be passed. * * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. */ struct key *request_key_tag(struct key_type *type, const char *description, struct key_tag *domain_tag, const char *callout_info) { struct key *key; size_t callout_len = 0; int ret; if (callout_info) callout_len = strlen(callout_info); key = request_key_and_link(type, description, domain_tag, callout_info, callout_len, NULL, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key)) { ret = wait_for_key_construction(key, false); if (ret < 0) { key_put(key); return ERR_PTR(ret); } } return key; } EXPORT_SYMBOL(request_key_tag); /** * request_key_with_auxdata - Request a key with auxiliary data for the upcaller * @type: The type of key we want. * @description: The searchable description of the key. * @domain_tag: The domain in which the key operates. * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found and new keys are always allocated in the user's quota. * * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. */ struct key *request_key_with_auxdata(struct key_type *type, const char *description, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, void *aux) { struct key *key; int ret; key = request_key_and_link(type, description, domain_tag, callout_info, callout_len, aux, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key)) { ret = wait_for_key_construction(key, false); if (ret < 0) { key_put(key); return ERR_PTR(ret); } } return key; } EXPORT_SYMBOL(request_key_with_auxdata); /** * request_key_rcu - Request key from RCU-read-locked context * @type: The type of key we want. * @description: The name of the key we want. * @domain_tag: The domain in which the key operates. * * Request a key from a context that we may not sleep in (such as RCU-mode * pathwalk). Keys under construction are ignored. * * Return a pointer to the found key if successful, -ENOKEY if we couldn't find * a key or some other error if the key found was unsuitable or inaccessible. */ struct key *request_key_rcu(struct key_type *type, const char *description, struct key_tag *domain_tag) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.domain_tag = domain_tag, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_DO_STATE_CHECK | KEYRING_SEARCH_SKIP_EXPIRED), }; struct key *key; key_ref_t key_ref; kenter("%s,%s", type->name, description); key = check_cached_key(&ctx); if (key) return key; /* search all the process keyrings for a key */ key_ref = search_process_keyrings_rcu(&ctx); if (IS_ERR(key_ref)) { key = ERR_CAST(key_ref); if (PTR_ERR(key_ref) == -EAGAIN) key = ERR_PTR(-ENOKEY); } else { key = key_ref_to_ptr(key_ref); cache_requested_key(key); } kleave(" = %p", key); return key; } EXPORT_SYMBOL(request_key_rcu); |
2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Device core Trace Support * Copyright (C) 2021, Intel Corporation * * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM dev #if !defined(__DEV_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define __DEV_TRACE_H #include <linux/device.h> #include <linux/tracepoint.h> #include <linux/types.h> DECLARE_EVENT_CLASS(devres, TP_PROTO(struct device *dev, const char *op, void *node, const char *name, size_t size), TP_ARGS(dev, op, node, name, size), TP_STRUCT__entry( __string(devname, dev_name(dev)) __field(struct device *, dev) __field(const char *, op) __field(void *, node) __string(name, name) __field(size_t, size) ), TP_fast_assign( __assign_str(devname); __entry->op = op; __entry->node = node; __assign_str(name); __entry->size = size; ), TP_printk("%s %3s %p %s (%zu bytes)", __get_str(devname), __entry->op, __entry->node, __get_str(name), __entry->size) ); DEFINE_EVENT(devres, devres_log, TP_PROTO(struct device *dev, const char *op, void *node, const char *name, size_t size), TP_ARGS(dev, op, node, name, size) ); #endif /* __DEV_TRACE_H */ /* this part has to be here */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> |
1 12 26 4 24 21 12 13 1 12 7 20 20 17 6 6 3 3 3 3 4 1 1 1 1 100 17 10 2 6 1 9 4 26 85 111 111 89 89 2 1 1 7 5 3 25 5 3 2 1 5 1 1 1 3 3 2 1 4 9 9 2 19 2 8 3 1 11 13 11 3 25 25 1 1 18 18 5 5 2130 2130 1151 190 502 502 18 27 27 27 2 502 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io */ /* Devmaps primary use is as a backend map for XDP BPF helper call * bpf_redirect_map(). Because XDP is mostly concerned with performance we * spent some effort to ensure the datapath with redirect maps does not use * any locking. This is a quick note on the details. * * We have three possible paths to get into the devmap control plane bpf * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall * will invoke an update, delete, or lookup operation. To ensure updates and * deletes appear atomic from the datapath side xchg() is used to modify the * netdev_map array. Then because the datapath does a lookup into the netdev_map * array (read-only) from an RCU critical section we use call_rcu() to wait for * an rcu grace period before free'ing the old data structures. This ensures the * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. As noted above the xchg() operation also keep the * netdev_map consistent in this case. From the devmap side BPF programs * calling into these operations are the same as multiple user space threads * making system calls. * * Finally, any of the above may race with a netdev_unregister notifier. The * unregister notifier must search for net devices in the map structure that * contain a reference to the net device and remove them. This is a two step * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) * check to see if the ifindex is the same as the net_device being removed. * When removing the dev a cmpxchg() is used to ensure the correct dev is * removed, in the case of a concurrent update or delete operation it is * possible that the initially referenced dev is no longer in the map. As the * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. * * The devmap_hash type is a map type which interprets keys as ifindexes and * indexes these using a hashmap. This allows maps that use ifindex as key to be * densely packed instead of having holes in the lookup array for unused * ifindexes. The setup and packet enqueue/send code is shared between the two * types of devmap; only the lookup and insertion is different. */ #include <linux/bpf.h> #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; struct net_device *dev; struct net_device *dev_rx; struct bpf_prog *xdp_prog; unsigned int count; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; }; struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */ struct list_head list; /* these are only used for DEVMAP_HASH type maps */ struct hlist_head *dev_index_head; spinlock_t index_lock; unsigned int items; u32 n_buckets; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); static struct hlist_head *dev_map_create_hash(unsigned int entries, int numa_node) { int i; struct hlist_head *hash; hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); return hash; } static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, int idx) { return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; } static int dev_map_alloc_check(union bpf_attr *attr) { u32 valsize = attr->value_size; /* check sanity of attributes. 2 value sizes supported: * 4 bytes: ifindex * 8 bytes: ifindex + prog fd */ if (attr->max_entries == 0 || attr->key_size != 4 || (valsize != offsetofend(struct bpf_devmap_val, ifindex) && valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Hash table size must be power of 2; roundup_pow_of_two() * can overflow into UB on 32-bit arches */ if (attr->max_entries > 1UL << 31) return -EINVAL; } return 0; } static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ attr->map_flags |= BPF_F_RDONLY_PROG; bpf_map_init_from_attr(&dtab->map, attr); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Hash table size must be power of 2 */ dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) return -ENOMEM; spin_lock_init(&dtab->index_lock); } else { dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) return -ENOMEM; } return 0; } static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; int err; dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); err = dev_map_init_map(dtab, attr); if (err) { bpf_map_area_free(dtab); return ERR_PTR(err); } spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; } static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were * disconnected from events. The following synchronize_rcu() guarantees * both rcu read critical sections complete and waits for * preempt-disable regions (NAPI being the relevant context here) so we * are certain there will be no further reads against the netdev_map and * all flush operations are complete. Flush operations can only be done * from NAPI context for this reason. */ spin_lock(&dev_map_lock); list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map() * during NAPI callback and cleared after the XDP redirect. There is no * explicit RCU read section which protects bpf_redirect_info->map but * local_bh_disable() also marks the beginning an RCU section. This * makes the complete softirq callback RCU protected. Thus after * following synchronize_rcu() there no bpf_redirect_info->map == map * assignment. */ synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ rcu_barrier(); if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; struct hlist_head *head; struct hlist_node *next; head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } } bpf_map_area_free(dtab->dev_index_head); } else { for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; dev = rcu_dereference_raw(dtab->netdev_map[i]); if (!dev) continue; if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } bpf_map_area_free(dtab->netdev_map); } bpf_map_area_free(dtab); } static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = next_key; if (index >= dtab->map.max_entries) { *next = 0; return 0; } if (index == dtab->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); struct bpf_dtab_netdev *dev; hlist_for_each_entry_rcu(dev, head, index_hlist, lockdep_is_held(&dtab->index_lock)) if (dev->idx == key) return dev; return NULL; } static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 idx, *next = next_key; struct bpf_dtab_netdev *dev, *next_dev; struct hlist_head *head; int i = 0; if (!key) goto find_first; idx = *(u32 *)key; dev = __dev_map_hash_lookup_elem(map, idx); if (!dev) goto find_first; next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), struct bpf_dtab_netdev, index_hlist); if (next_dev) { *next = next_dev->idx; return 0; } i = idx & (dtab->n_buckets - 1); i++; find_first: for (; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), struct bpf_dtab_netdev, index_hlist); if (next_dev) { *next = next_dev->idx; return 0; } } return -ENOENT; } static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, struct net_device *tx_dev, struct net_device *rx_dev) { struct xdp_txq_info txq = { .dev = tx_dev }; struct xdp_rxq_info rxq = { .dev = rx_dev }; struct xdp_buff xdp; int i, nframes = 0; for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; u32 act; int err; xdp_convert_frame_to_buff(xdpf, &xdp); xdp.txq = &txq; xdp.rxq = &rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { case XDP_PASS: err = xdp_update_frame_from_buff(&xdp, xdpf); if (unlikely(err < 0)) xdp_return_frame_rx_napi(xdpf); else frames[nframes++] = xdpf; break; default: bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tx_dev, xdp_prog, act); fallthrough; case XDP_DROP: xdp_return_frame_rx_napi(xdpf); break; } } return nframes; /* sent frames count */ } static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; unsigned int cnt = bq->count; int sent = 0, err = 0; int to_send = cnt; int i; if (unlikely(!cnt)) return; for (i = 0; i < cnt; i++) { struct xdp_frame *xdpf = bq->q[i]; prefetch(xdpf); } if (bq->xdp_prog) { to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx); if (!to_send) goto out; } sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags); if (sent < 0) { /* If ndo_xdp_xmit fails with an errno, no frames have * been xmit'ed. */ err = sent; sent = 0; } /* If not all frames have been transmitted, it is our * responsibility to free them */ for (i = sent; unlikely(i < to_send); i++) xdp_return_frame_rx_napi(bq->q[i]); out: bq->count = 0; trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err); } /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the * driver before returning from its napi->poll() routine. See the comment above * xdp_do_flush() in filter.c. */ void __dev_flush(struct list_head *flush_list) { struct xdp_dev_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { bq_xmit_all(bq, XDP_XMIT_FLUSH); bq->dev_rx = NULL; bq->xdp_prog = NULL; __list_del_clearprev(&bq->flush_node); } } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; if (key >= map->max_entries) return NULL; obj = rcu_dereference_check(dtab->netdev_map[key], rcu_read_lock_bh_held()); return obj; } /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu * variable access, and map elements stick around. See comment above * xdp_do_flush() in filter.c. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed * from net_device drivers NAPI func end. * * Do the same with xdp_prog and flush_list since these fields * are only ever modified together. */ if (!bq->dev_rx) { struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list(); bq->dev_rx = dev_rx; bq->xdp_prog = xdp_prog; list_add(&bq->flush_node, flush_list); } bq->q[bq->count++] = xdpf; } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { int err; if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) return -EOPNOTSUPP; if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && xdp_frame_has_frags(xdpf))) return -EOPNOTSUPP; err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf)); if (unlikely(err)) return err; bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst) { struct xdp_txq_info txq = { .dev = dst->dev }; struct xdp_buff xdp; u32 act; if (!dst->xdp_prog) return XDP_PASS; __skb_pull(skb, skb->mac_len); xdp.txq = &txq; act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog); switch (act) { case XDP_PASS: __skb_push(skb, skb->mac_len); break; default: bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dst->dev, dst->xdp_prog, act); fallthrough; case XDP_DROP: kfree_skb(skb); break; } return act; } int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return __xdp_enqueue(dev, xdpf, dev_rx, NULL); } int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct net_device *dev = dst->dev; return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog); } static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) { if (!obj) return false; if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) return false; if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && xdp_frame_has_frags(xdpf))) return false; if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf))) return false; return true; } static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, struct net_device *dev_rx, struct xdp_frame *xdpf) { struct xdp_frame *nxdpf; nxdpf = xdpf_clone(xdpf); if (!nxdpf) return -ENOMEM; bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog); return 0; } static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex) { while (num_excluded--) { if (ifindex == excluded[num_excluded]) return true; } return false; } /* Get ifindex of each upper device. 'indexes' must be able to hold at * least MAX_NEST_DEV elements. * Returns the number of ifindexes added. */ static int get_upper_ifindexes(struct net_device *dev, int *indexes) { struct net_device *upper; struct list_head *iter; int n = 0; netdev_for_each_upper_dev_rcu(dev, upper, iter) { indexes[n++] = upper->ifindex; } return n; } int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { num_excluded = get_upper_ifindexes(dev_rx, excluded_devices); excluded_devices[num_excluded++] = dev_rx->ifindex; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); if (err) return err; last_dst = dst; } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, lockdep_is_held(&dtab->index_lock)) { if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); if (err) return err; last_dst = dst; } } } /* consume the last copy of the frame */ if (last_dst) bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog); else xdp_return_frame_rx_napi(xdpf); /* dtab is empty */ return 0; } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog) { int err; err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; /* Redirect has already succeeded semantically at this point, so we just * return 0 even if packet is dropped. Helper below takes care of * freeing skb. */ if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS) return 0; skb->dev = dst->dev; generic_xdp_tx(skb, xdp_prog); return 0; } static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct sk_buff *nskb; int err; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; err = dev_map_generic_redirect(dst, nskb, xdp_prog); if (unlikely(err)) { consume_skb(nskb); return err; } return 0; } int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, const struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; struct hlist_node *next; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { num_excluded = get_upper_ifindexes(dev, excluded_devices); excluded_devices[num_excluded++] = dev->ifindex; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); if (!dst) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_redirect_clone(last_dst, skb, xdp_prog); if (err) return err; last_dst = dst; } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dst, next, head, index_hlist) { if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_redirect_clone(last_dst, skb, xdp_prog); if (err) return err; last_dst = dst; } } } /* consume the first skb and return */ if (last_dst) return dev_map_generic_redirect(last_dst, skb, xdp_prog); /* dtab is empty */ consume_skb(skb); return 0; } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); return obj ? &obj->val : NULL; } static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, *(u32 *)key); return obj ? &obj->val : NULL; } static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } static long dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; u32 k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL)); if (old_dev) { call_rcu(&old_dev->rcu, __dev_map_entry_free); atomic_dec((atomic_t *)&dtab->items); } return 0; } static long dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; u32 k = *(u32 *)key; unsigned long flags; int ret = -ENOENT; spin_lock_irqsave(&dtab->index_lock, flags); old_dev = __dev_map_hash_lookup_elem(map, k); if (old_dev) { dtab->items--; hlist_del_init_rcu(&old_dev->index_hlist); call_rcu(&old_dev->rcu, __dev_map_entry_free); ret = 0; } spin_unlock_irqrestore(&dtab->index_lock, flags); return ret; } static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, struct bpf_devmap_val *val, unsigned int idx) { struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), GFP_NOWAIT | __GFP_NOWARN, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); dev->dev = dev_get_by_index(net, val->ifindex); if (!dev->dev) goto err_out; if (val->bpf_prog.fd > 0) { prog = bpf_prog_get_type_dev(val->bpf_prog.fd, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) goto err_put_dev; if (prog->expected_attach_type != BPF_XDP_DEVMAP || !bpf_prog_map_compatible(&dtab->map, prog)) goto err_put_prog; } dev->idx = idx; if (prog) { dev->xdp_prog = prog; dev->val.bpf_prog.id = prog->aux->id; } else { dev->xdp_prog = NULL; dev->val.bpf_prog.id = 0; } dev->val.ifindex = val->ifindex; return dev; err_put_prog: bpf_prog_put(prog); err_put_dev: dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); } static long __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; if (unlikely(i >= dtab->map.max_entries)) return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; /* already verified value_size <= sizeof val */ memcpy(&val, value, map->value_size); if (!val.ifindex) { dev = NULL; /* can not specify fd if ifindex is 0 */ if (val.bpf_prog.fd > 0) return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } /* Use call_rcu() here to ensure rcu critical sections have completed * Remembering the driver side flush operation will happen before the * net device is removed. */ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev))); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); else atomic_inc((atomic_t *)&dtab->items); return 0; } static long dev_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __dev_map_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; /* already verified value_size <= sizeof val */ memcpy(&val, value, map->value_size); if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); old_dev = __dev_map_hash_lookup_elem(map, idx); if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; dev = __dev_map_alloc_node(net, dtab, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; } if (old_dev) { hlist_del_rcu(&old_dev->index_hlist); } else { if (dtab->items >= dtab->map.max_entries) { spin_unlock_irqrestore(&dtab->index_lock, flags); call_rcu(&dev->rcu, __dev_map_entry_free); return -E2BIG; } dtab->items++; } hlist_add_head_rcu(&dev->index_hlist, dev_map_index_hash(dtab, idx)); spin_unlock_irqrestore(&dtab->index_lock, flags); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; out_err: spin_unlock_irqrestore(&dtab->index_lock, flags); return err; } static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __dev_map_hash_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_hash_lookup_elem); } static u64 dev_map_mem_usage(const struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u64 usage = sizeof(struct bpf_dtab); if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) usage += (u64)dtab->n_buckets * sizeof(struct hlist_head); else usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *); usage += atomic_read((atomic_t *)&dtab->items) * (u64)sizeof(struct bpf_dtab_netdev); return usage; } BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = dev_map_alloc_check, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_get_next_key, .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_map_redirect, }; const struct bpf_map_ops dev_map_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = dev_map_alloc_check, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_hash_get_next_key, .map_lookup_elem = dev_map_hash_lookup_elem, .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, struct net_device *netdev) { unsigned long flags; u32 i; spin_lock_irqsave(&dtab->index_lock, flags); for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; struct hlist_head *head; struct hlist_node *next; head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dev, next, head, index_hlist) { if (netdev != dev->dev) continue; dtab->items--; hlist_del_rcu(&dev->index_hlist); call_rcu(&dev->rcu, __dev_map_entry_free); } } spin_unlock_irqrestore(&dtab->index_lock, flags); } static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct bpf_dtab *dtab; int i, cpu; switch (event) { case NETDEV_REGISTER: if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq) break; /* will be freed in free_netdev() */ netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); if (!netdev->xdp_bulkq) return NOTIFY_BAD; for_each_possible_cpu(cpu) per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because * dev_map_list is an RCU list AND to ensure a delete * operation does not free a netdev_map entry while we * are comparing it against the netdev being unregistered. */ rcu_read_lock(); list_for_each_entry_rcu(dtab, &dev_map_list, list) { if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dev_map_hash_remove_netdev(dtab, netdev); continue; } for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; dev = rcu_dereference(dtab->netdev_map[i]); if (!dev || netdev != dev->dev) continue; odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL)); if (dev == odev) { call_rcu(&dev->rcu, __dev_map_entry_free); atomic_dec((atomic_t *)&dtab->items); } } } rcu_read_unlock(); break; default: break; } return NOTIFY_OK; } static struct notifier_block dev_map_notifier = { .notifier_call = dev_map_notification, }; static int __init dev_map_init(void) { /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); return 0; } subsys_initcall(dev_map_init); |
9 7 1 4 5 2 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * sm3_base.h - core logic for SM3 implementations * * Copyright (C) 2017 ARM Limited or its affiliates. * Written by Gilad Ben-Yossef <gilad@benyossef.com> */ #ifndef _CRYPTO_SM3_BASE_H #define _CRYPTO_SM3_BASE_H #include <crypto/internal/hash.h> #include <crypto/sm3.h> #include <linux/crypto.h> #include <linux/module.h> #include <linux/string.h> #include <linux/unaligned.h> typedef void (sm3_block_fn)(struct sm3_state *sst, u8 const *src, int blocks); static inline int sm3_base_init(struct shash_desc *desc) { struct sm3_state *sctx = shash_desc_ctx(desc); sctx->state[0] = SM3_IVA; sctx->state[1] = SM3_IVB; sctx->state[2] = SM3_IVC; sctx->state[3] = SM3_IVD; sctx->state[4] = SM3_IVE; sctx->state[5] = SM3_IVF; sctx->state[6] = SM3_IVG; sctx->state[7] = SM3_IVH; sctx->count = 0; return 0; } static inline int sm3_base_do_update(struct shash_desc *desc, const u8 *data, unsigned int len, sm3_block_fn *block_fn) { struct sm3_state *sctx = shash_desc_ctx(desc); unsigned int partial = sctx->count % SM3_BLOCK_SIZE; sctx->count += len; if (unlikely((partial + len) >= SM3_BLOCK_SIZE)) { int blocks; if (partial) { int p = SM3_BLOCK_SIZE - partial; memcpy(sctx->buffer + partial, data, p); data += p; len -= p; block_fn(sctx, sctx->buffer, 1); } blocks = len / SM3_BLOCK_SIZE; len %= SM3_BLOCK_SIZE; if (blocks) { block_fn(sctx, data, blocks); data += blocks * SM3_BLOCK_SIZE; } partial = 0; } if (len) memcpy(sctx->buffer + partial, data, len); return 0; } static inline int sm3_base_do_finalize(struct shash_desc *desc, sm3_block_fn *block_fn) { const int bit_offset = SM3_BLOCK_SIZE - sizeof(__be64); struct sm3_state *sctx = shash_desc_ctx(desc); __be64 *bits = (__be64 *)(sctx->buffer + bit_offset); unsigned int partial = sctx->count % SM3_BLOCK_SIZE; sctx->buffer[partial++] = 0x80; if (partial > bit_offset) { memset(sctx->buffer + partial, 0x0, SM3_BLOCK_SIZE - partial); partial = 0; block_fn(sctx, sctx->buffer, 1); } memset(sctx->buffer + partial, 0x0, bit_offset - partial); *bits = cpu_to_be64(sctx->count << 3); block_fn(sctx, sctx->buffer, 1); return 0; } static inline int sm3_base_finish(struct shash_desc *desc, u8 *out) { struct sm3_state *sctx = shash_desc_ctx(desc); __be32 *digest = (__be32 *)out; int i; for (i = 0; i < SM3_DIGEST_SIZE / sizeof(__be32); i++) put_unaligned_be32(sctx->state[i], digest++); memzero_explicit(sctx, sizeof(*sctx)); return 0; } #endif /* _CRYPTO_SM3_BASE_H */ |
8749 1119 8762 362 364 246 144 249 247 381 362 382 381 382 185 184 2 2 158 157 117 158 158 117 117 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 | // SPDX-License-Identifier: GPL-2.0 /* * Fast batching percpu counters. */ #include <linux/percpu_counter.h> #include <linux/mutex.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/debugobjects.h> #ifdef CONFIG_HOTPLUG_CPU static LIST_HEAD(percpu_counters); static DEFINE_SPINLOCK(percpu_counters_lock); #endif #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER static const struct debug_obj_descr percpu_counter_debug_descr; static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state) { struct percpu_counter *fbc = addr; switch (state) { case ODEBUG_STATE_ACTIVE: percpu_counter_destroy(fbc); debug_object_free(fbc, &percpu_counter_debug_descr); return true; default: return false; } } static const struct debug_obj_descr percpu_counter_debug_descr = { .name = "percpu_counter", .fixup_free = percpu_counter_fixup_free, }; static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { debug_object_init(fbc, &percpu_counter_debug_descr); debug_object_activate(fbc, &percpu_counter_debug_descr); } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { debug_object_deactivate(fbc, &percpu_counter_debug_descr); debug_object_free(fbc, &percpu_counter_debug_descr); } #else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { } #endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_set); /* * Add to a counter while respecting batch size. * * There are 2 implementations, both dealing with the following problem: * * The decision slow path/fast path and the actual update must be atomic. * Otherwise a call in process context could check the current values and * decide that the fast path can be used. If now an interrupt occurs before * the this_cpu_add(), and the interrupt updates this_cpu(*fbc->counters), * then the this_cpu_add() that is executed after the interrupt has completed * can produce values larger than "batch" or even overflows. */ #ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * Safety against interrupts is achieved in 2 ways: * 1. the fast path uses local cmpxchg (note: no lock prefix) * 2. the slow path operates with interrupts disabled */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; unsigned long flags; count = this_cpu_read(*fbc->counters); do { if (unlikely(abs(count + amount) >= batch)) { raw_spin_lock_irqsave(&fbc->lock, flags); /* * Note: by now we might have migrated to another CPU * or the value might have changed. */ count = __this_cpu_read(*fbc->counters); fbc->count += count + amount; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); return; } } while (!this_cpu_try_cmpxchg(*fbc->counters, &count, count + amount)); } #else /* * local_irq_save() is used to make the function irq safe: * - The slow path would be ok as protected by an irq-safe spinlock. * - this_cpu_add would be ok as it is irq-safe by definition. */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; unsigned long flags; local_irq_save(flags); count = __this_cpu_read(*fbc->counters) + amount; if (abs(count) >= batch) { raw_spin_lock(&fbc->lock); fbc->count += count; __this_cpu_sub(*fbc->counters, count - amount); raw_spin_unlock(&fbc->lock); } else { this_cpu_add(*fbc->counters, amount); } local_irq_restore(flags); } #endif EXPORT_SYMBOL(percpu_counter_add_batch); /* * For percpu_counter with a big batch, the devication of its count could * be big, and there is requirement to reduce the deviation, like when the * counter's batch could be runtime decreased to get a better accuracy, * which can be achieved by running this sync function on each CPU. */ void percpu_counter_sync(struct percpu_counter *fbc) { unsigned long flags; s64 count; raw_spin_lock_irqsave(&fbc->lock, flags); count = __this_cpu_read(*fbc->counters); fbc->count += count; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_sync); /* * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive(). * * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums * from CPUs that are in the process of being taken offline. Dying cpus have * been removed from the online mask, but may not have had the hotplug dead * notifier called to fold the percpu count back into the global counter sum. * By including dying CPUs in the iteration mask, we avoid this race condition * so __percpu_counter_sum() just does the right thing when CPUs are being taken * offline. */ s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp, u32 nr_counters, struct lock_class_key *key) { unsigned long flags __maybe_unused; size_t counter_size; s32 __percpu *counters; u32 i; counter_size = ALIGN(sizeof(*counters), __alignof__(*counters)); counters = __alloc_percpu_gfp(nr_counters * counter_size, __alignof__(*counters), gfp); if (!counters) { fbc[0].counters = NULL; return -ENOMEM; } for (i = 0; i < nr_counters; i++) { raw_spin_lock_init(&fbc[i].lock); lockdep_set_class(&fbc[i].lock, key); #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc[i].list); #endif fbc[i].count = amount; fbc[i].counters = (void __percpu *)counters + i * counter_size; debug_percpu_counter_activate(&fbc[i]); } #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); for (i = 0; i < nr_counters; i++) list_add(&fbc[i].list, &percpu_counters); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif return 0; } EXPORT_SYMBOL(__percpu_counter_init_many); void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters) { unsigned long flags __maybe_unused; u32 i; if (WARN_ON_ONCE(!fbc)) return; if (!fbc[0].counters) return; for (i = 0; i < nr_counters; i++) debug_percpu_counter_deactivate(&fbc[i]); #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); for (i = 0; i < nr_counters; i++) list_del(&fbc[i].list); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif free_percpu(fbc[0].counters); for (i = 0; i < nr_counters; i++) fbc[i].counters = NULL; } EXPORT_SYMBOL(percpu_counter_destroy_many); int percpu_counter_batch __read_mostly = 32; EXPORT_SYMBOL(percpu_counter_batch); static int compute_batch_value(unsigned int cpu) { int nr = num_online_cpus(); percpu_counter_batch = max(32, nr*2); return 0; } static int percpu_counter_cpu_dead(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU struct percpu_counter *fbc; compute_batch_value(cpu); spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; raw_spin_lock(&fbc->lock); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; raw_spin_unlock(&fbc->lock); } spin_unlock_irq(&percpu_counters_lock); #endif return 0; } /* * Compare counter against given value. * Return 1 if greater, 0 if equal and -1 if less */ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { s64 count; count = percpu_counter_read(fbc); /* Check to see if rough count will be sufficient for comparison */ if (abs(count - rhs) > (batch * num_online_cpus())) { if (count > rhs) return 1; else return -1; } /* Need to use precise count */ count = percpu_counter_sum(fbc); if (count > rhs) return 1; else if (count < rhs) return -1; else return 0; } EXPORT_SYMBOL(__percpu_counter_compare); /* * Compare counter, and add amount if total is: less than or equal to limit if * amount is positive, or greater than or equal to limit if amount is negative. * Return true if amount is added, or false if total would be beyond the limit. * * Negative limit is allowed, but unusual. * When negative amounts (subs) are given to percpu_counter_limited_add(), * the limit would most naturally be 0 - but other limits are also allowed. * * Overflow beyond S64_MAX is not allowed for: counter, limit and amount * are all assumed to be sane (far from S64_MIN and S64_MAX). */ bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount, s32 batch) { s64 count; s64 unknown; unsigned long flags; bool good = false; if (amount == 0) return true; local_irq_save(flags); unknown = batch * num_online_cpus(); count = __this_cpu_read(*fbc->counters); /* Skip taking the lock when safe */ if (abs(count + amount) <= batch && ((amount > 0 && fbc->count + unknown <= limit) || (amount < 0 && fbc->count - unknown >= limit))) { this_cpu_add(*fbc->counters, amount); local_irq_restore(flags); return true; } raw_spin_lock(&fbc->lock); count = fbc->count + amount; /* Skip percpu_counter_sum() when safe */ if (amount > 0) { if (count - unknown > limit) goto out; if (count + unknown <= limit) good = true; } else { if (count + unknown < limit) goto out; if (count - unknown >= limit) good = true; } if (!good) { s32 *pcount; int cpu; for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { pcount = per_cpu_ptr(fbc->counters, cpu); count += *pcount; } if (amount > 0) { if (count > limit) goto out; } else { if (count < limit) goto out; } good = true; } count = __this_cpu_read(*fbc->counters); fbc->count += count + amount; __this_cpu_sub(*fbc->counters, count); out: raw_spin_unlock(&fbc->lock); local_irq_restore(flags); return good; } static int __init percpu_counter_startup(void) { int ret; ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online", compute_batch_value, NULL); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD, "lib/percpu_cnt:dead", NULL, percpu_counter_cpu_dead); WARN_ON(ret < 0); return 0; } module_init(percpu_counter_startup); |
26 26 17 17 17 17 26 10 26 8 20 17 17 24 24 19 19 19 4 43 19 32 12 43 4 4 37 41 41 41 41 7 6 41 21 26 26 43 43 26 26 14 14 16 17 1 1 2 13 12 2 14 14 14 17 17 24 24 24 24 24 1 2 2 3 4 1 2 2 15 1 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 | // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_sfb.c Stochastic Fair Blue * * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr> * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> * * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: * A New Class of Active Queue Management Algorithms. * U. Michigan CSE-TR-387-99, April 1999. * * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/siphash.h> #include <net/ip.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/inet_ecn.h> /* * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) * This implementation uses L = 8 and N = 16 * This permits us to split one 32bit hash (provided per packet by rxhash or * external classifier) into 8 subhashes of 4 bits. */ #define SFB_BUCKET_SHIFT 4 #define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */ #define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1) #define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */ /* SFB algo uses a virtual queue, named "bin" */ struct sfb_bucket { u16 qlen; /* length of virtual queue */ u16 p_mark; /* marking probability */ }; /* We use a double buffering right before hash change * (Section 4.4 of SFB reference : moving hash functions) */ struct sfb_bins { siphash_key_t perturbation; /* siphash key */ struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS]; }; struct sfb_sched_data { struct Qdisc *qdisc; struct tcf_proto __rcu *filter_list; struct tcf_block *block; unsigned long rehash_interval; unsigned long warmup_time; /* double buffering warmup time in jiffies */ u32 max; u32 bin_size; /* maximum queue length per bin */ u32 increment; /* d1 */ u32 decrement; /* d2 */ u32 limit; /* HARD maximal queue length */ u32 penalty_rate; u32 penalty_burst; u32 tokens_avail; unsigned long rehash_time; unsigned long token_time; u8 slot; /* current active bins (0 or 1) */ bool double_buffering; struct sfb_bins bins[2]; struct { u32 earlydrop; u32 penaltydrop; u32 bucketdrop; u32 queuedrop; u32 childdrop; /* drops in child qdisc */ u32 marked; /* ECN mark */ } stats; }; /* * Each queued skb might be hashed on one or two bins * We store in skb_cb the two hash values. * (A zero value means double buffering was not used) */ struct sfb_skb_cb { u32 hashes[2]; }; static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) { qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; } /* * If using 'internal' SFB flow classifier, hash comes from skb rxhash * If using external classifier, hash comes from the classid. */ static u32 sfb_hash(const struct sk_buff *skb, u32 slot) { return sfb_skb_cb(skb)->hashes[slot]; } /* Probabilities are coded as Q0.16 fixed-point values, * with 0xFFFF representing 65535/65536 (almost 1.0) * Addition and subtraction are saturating in [0, 65535] */ static u32 prob_plus(u32 p1, u32 p2) { u32 res = p1 + p2; return min_t(u32, res, SFB_MAX_PROB); } static u32 prob_minus(u32 p1, u32 p2) { return p1 > p2 ? p1 - p2 : 0; } static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) { int i; struct sfb_bucket *b = &q->bins[slot].bins[0][0]; for (i = 0; i < SFB_LEVELS; i++) { u32 hash = sfbhash & SFB_BUCKET_MASK; sfbhash >>= SFB_BUCKET_SHIFT; if (b[hash].qlen < 0xFFFF) b[hash].qlen++; b += SFB_NUMBUCKETS; /* next level */ } } static void increment_qlen(const struct sfb_skb_cb *cb, struct sfb_sched_data *q) { u32 sfbhash; sfbhash = cb->hashes[0]; if (sfbhash) increment_one_qlen(sfbhash, 0, q); sfbhash = cb->hashes[1]; if (sfbhash) increment_one_qlen(sfbhash, 1, q); } static void decrement_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) { int i; struct sfb_bucket *b = &q->bins[slot].bins[0][0]; for (i = 0; i < SFB_LEVELS; i++) { u32 hash = sfbhash & SFB_BUCKET_MASK; sfbhash >>= SFB_BUCKET_SHIFT; if (b[hash].qlen > 0) b[hash].qlen--; b += SFB_NUMBUCKETS; /* next level */ } } static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) { u32 sfbhash; sfbhash = sfb_hash(skb, 0); if (sfbhash) decrement_one_qlen(sfbhash, 0, q); sfbhash = sfb_hash(skb, 1); if (sfbhash) decrement_one_qlen(sfbhash, 1, q); } static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q) { b->p_mark = prob_minus(b->p_mark, q->decrement); } static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q) { b->p_mark = prob_plus(b->p_mark, q->increment); } static void sfb_zero_all_buckets(struct sfb_sched_data *q) { memset(&q->bins, 0, sizeof(q->bins)); } /* * compute max qlen, max p_mark, and avg p_mark */ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q) { int i; u32 qlen = 0, prob = 0, totalpm = 0; const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0]; for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) { if (qlen < b->qlen) qlen = b->qlen; totalpm += b->p_mark; if (prob < b->p_mark) prob = b->p_mark; b++; } *prob_r = prob; *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS); return qlen; } static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) { get_random_bytes(&q->bins[slot].perturbation, sizeof(q->bins[slot].perturbation)); } static void sfb_swap_slot(struct sfb_sched_data *q) { sfb_init_perturbation(q->slot, q); q->slot ^= 1; q->double_buffering = false; } /* Non elastic flows are allowed to use part of the bandwidth, expressed * in "penalty_rate" packets per second, with "penalty_burst" burst */ static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q) { if (q->penalty_rate == 0 || q->penalty_burst == 0) return true; if (q->tokens_avail < 1) { unsigned long age = min(10UL * HZ, jiffies - q->token_time); q->tokens_avail = (age * q->penalty_rate) / HZ; if (q->tokens_avail > q->penalty_burst) q->tokens_avail = q->penalty_burst; q->token_time = jiffies; if (q->tokens_avail < 1) return true; } q->tokens_avail--; return false; } static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, int *qerr, u32 *salt) { struct tcf_result res; int result; result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return false; } #endif *salt = TC_H_MIN(res.classid); return true; } return false; } static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct sfb_sched_data *q = qdisc_priv(sch); unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; struct sfb_skb_cb cb; int i; u32 p_min = ~0; u32 minqlen = ~0; u32 r, sfbhash; u32 slot = q->slot; int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (unlikely(sch->q.qlen >= q->limit)) { qdisc_qstats_overlimit(sch); q->stats.queuedrop++; goto drop; } if (q->rehash_interval > 0) { unsigned long limit = q->rehash_time + q->rehash_interval; if (unlikely(time_after(jiffies, limit))) { sfb_swap_slot(q); q->rehash_time = jiffies; } else if (unlikely(!q->double_buffering && q->warmup_time > 0 && time_after(jiffies, limit - q->warmup_time))) { q->double_buffering = true; } } fl = rcu_dereference_bh(q->filter_list); if (fl) { u32 salt; /* If using external classifiers, get result and record it. */ if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; sfbhash = siphash_1u32(salt, &q->bins[slot].perturbation); } else { sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation); } if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; for (i = 0; i < SFB_LEVELS; i++) { u32 hash = sfbhash & SFB_BUCKET_MASK; struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; sfbhash >>= SFB_BUCKET_SHIFT; if (b->qlen == 0) decrement_prob(b, q); else if (b->qlen >= q->bin_size) increment_prob(b, q); if (minqlen > b->qlen) minqlen = b->qlen; if (p_min > b->p_mark) p_min = b->p_mark; } slot ^= 1; sfb_skb_cb(skb)->hashes[slot] = 0; if (unlikely(minqlen >= q->max)) { qdisc_qstats_overlimit(sch); q->stats.bucketdrop++; goto drop; } if (unlikely(p_min >= SFB_MAX_PROB)) { /* Inelastic flow */ if (q->double_buffering) { sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; for (i = 0; i < SFB_LEVELS; i++) { u32 hash = sfbhash & SFB_BUCKET_MASK; struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; sfbhash >>= SFB_BUCKET_SHIFT; if (b->qlen == 0) decrement_prob(b, q); else if (b->qlen >= q->bin_size) increment_prob(b, q); } } if (sfb_rate_limit(skb, q)) { qdisc_qstats_overlimit(sch); q->stats.penaltydrop++; goto drop; } goto enqueue; } r = get_random_u16() & SFB_MAX_PROB; reason = SKB_DROP_REASON_QDISC_CONGESTED; if (unlikely(r < p_min)) { if (unlikely(p_min > SFB_MAX_PROB / 2)) { /* If we're marking that many packets, then either * this flow is unresponsive, or we're badly congested. * In either case, we want to start dropping packets. */ if (r < (p_min - SFB_MAX_PROB / 2) * 2) { q->stats.earlydrop++; goto drop; } } if (INET_ECN_set_ce(skb)) { q->stats.marked++; } else { q->stats.earlydrop++; goto drop; } } enqueue: memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { sch->qstats.backlog += len; sch->q.qlen++; increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { q->stats.childdrop++; qdisc_qstats_drop(sch); } return ret; drop: qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } static struct sk_buff *sfb_dequeue(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; struct sk_buff *skb; skb = child->dequeue(q->qdisc); if (skb) { qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; decrement_qlen(skb, q); } return skb; } static struct sk_buff *sfb_peek(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; return child->ops->peek(child); } /* No sfb_drop -- impossible since the child doesn't return the dropped skb. */ static void sfb_reset(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); if (likely(q->qdisc)) qdisc_reset(q->qdisc); q->slot = 0; q->double_buffering = false; sfb_zero_all_buckets(q); sfb_init_perturbation(0, q); } static void sfb_destroy(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); qdisc_put(q->qdisc); } static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = { [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) }, }; static const struct tc_sfb_qopt sfb_default_ops = { .rehash_interval = 600 * MSEC_PER_SEC, .warmup_time = 60 * MSEC_PER_SEC, .limit = 0, .max = 25, .bin_size = 20, .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */ .decrement = (SFB_MAX_PROB + 3000) / 6000, .penalty_rate = 10, .penalty_burst = 20, }; static int sfb_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child, *old; struct nlattr *tb[TCA_SFB_MAX + 1]; const struct tc_sfb_qopt *ctl = &sfb_default_ops; u32 limit; int err; if (opt) { err = nla_parse_nested_deprecated(tb, TCA_SFB_MAX, opt, sfb_policy, NULL); if (err < 0) return -EINVAL; if (tb[TCA_SFB_PARMS] == NULL) return -EINVAL; ctl = nla_data(tb[TCA_SFB_PARMS]); } limit = ctl->limit; if (limit == 0) limit = qdisc_dev(sch)->tx_queue_len; child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit, extack); if (IS_ERR(child)) return PTR_ERR(child); if (child != &noop_qdisc) qdisc_hash_add(child, true); sch_tree_lock(sch); qdisc_purge_queue(q->qdisc); old = q->qdisc; q->qdisc = child; q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); q->warmup_time = msecs_to_jiffies(ctl->warmup_time); q->rehash_time = jiffies; q->limit = limit; q->increment = ctl->increment; q->decrement = ctl->decrement; q->max = ctl->max; q->bin_size = ctl->bin_size; q->penalty_rate = ctl->penalty_rate; q->penalty_burst = ctl->penalty_burst; q->tokens_avail = ctl->penalty_burst; q->token_time = jiffies; q->slot = 0; q->double_buffering = false; sfb_zero_all_buckets(q); sfb_init_perturbation(0, q); sfb_init_perturbation(1, q); sch_tree_unlock(sch); qdisc_put(old); return 0; } static int sfb_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); int err; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; q->qdisc = &noop_qdisc; return sfb_change(sch, opt, extack); } static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) { struct sfb_sched_data *q = qdisc_priv(sch); struct nlattr *opts; struct tc_sfb_qopt opt = { .rehash_interval = jiffies_to_msecs(q->rehash_interval), .warmup_time = jiffies_to_msecs(q->warmup_time), .limit = q->limit, .max = q->max, .bin_size = q->bin_size, .increment = q->increment, .decrement = q->decrement, .penalty_rate = q->penalty_rate, .penalty_burst = q->penalty_burst, }; sch->qstats.backlog = q->qdisc->qstats.backlog; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt)) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: nla_nest_cancel(skb, opts); return -EMSGSIZE; } static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct sfb_sched_data *q = qdisc_priv(sch); struct tc_sfb_xstats st = { .earlydrop = q->stats.earlydrop, .penaltydrop = q->stats.penaltydrop, .bucketdrop = q->stats.bucketdrop, .queuedrop = q->stats.queuedrop, .childdrop = q->stats.childdrop, .marked = q->stats.marked, }; st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q); return gnet_stats_copy_app(d, &st, sizeof(st)); } static int sfb_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { return -ENOSYS; } static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->qdisc); return 0; } static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg) { struct sfb_sched_data *q = qdisc_priv(sch); return q->qdisc; } static unsigned long sfb_find(struct Qdisc *sch, u32 classid) { return 1; } static void sfb_unbind(struct Qdisc *sch, unsigned long arg) { } static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg, struct netlink_ext_ack *extack) { return -ENOSYS; } static int sfb_delete(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { return -ENOSYS; } static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { tc_qdisc_stats_dump(sch, 1, walker); } } static struct tcf_block *sfb_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return 0; } static const struct Qdisc_class_ops sfb_class_ops = { .graft = sfb_graft, .leaf = sfb_leaf, .find = sfb_find, .change = sfb_change_class, .delete = sfb_delete, .walk = sfb_walk, .tcf_block = sfb_tcf_block, .bind_tcf = sfb_bind, .unbind_tcf = sfb_unbind, .dump = sfb_dump_class, }; static struct Qdisc_ops sfb_qdisc_ops __read_mostly = { .id = "sfb", .priv_size = sizeof(struct sfb_sched_data), .cl_ops = &sfb_class_ops, .enqueue = sfb_enqueue, .dequeue = sfb_dequeue, .peek = sfb_peek, .init = sfb_init, .reset = sfb_reset, .destroy = sfb_destroy, .change = sfb_change, .dump = sfb_dump, .dump_stats = sfb_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("sfb"); static int __init sfb_module_init(void) { return register_qdisc(&sfb_qdisc_ops); } static void __exit sfb_module_exit(void) { unregister_qdisc(&sfb_qdisc_ops); } module_init(sfb_module_init) module_exit(sfb_module_exit) MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline"); MODULE_AUTHOR("Juliusz Chroboczek"); MODULE_AUTHOR("Eric Dumazet"); MODULE_LICENSE("GPL"); |
4 1 42 23 42 42 42 56 2 7 58 38 82 38 9 5 1 2 128 9 2 84 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_CLS_H #define __NET_PKT_CLS_H #include <linux/pkt_cls.h> #include <linux/workqueue.h> #include <net/sch_generic.h> #include <net/act_api.h> #include <net/net_namespace.h> /* TC action not accessible from user space */ #define TC_ACT_CONSUMED (TC_ACT_VALUE_MAX + 1) /* Basic packet classifier frontend definitions. */ struct tcf_walker { int stop; int skip; int count; bool nonempty; unsigned long cookie; int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *); }; int register_tcf_proto_ops(struct tcf_proto_ops *ops); void unregister_tcf_proto_ops(struct tcf_proto_ops *ops); #define NET_CLS_ALIAS_PREFIX "net-cls-" #define MODULE_ALIAS_NET_CLS(kind) MODULE_ALIAS(NET_CLS_ALIAS_PREFIX kind) struct tcf_block_ext_info { enum flow_block_binder_type binder_type; tcf_chain_head_change_t *chain_head_change; void *chain_head_change_priv; u32 block_index; }; struct tcf_qevent { struct tcf_block *block; struct tcf_block_ext_info info; struct tcf_proto __rcu *filter_chain; }; struct tcf_block_cb; bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index); void tcf_chain_put_by_act(struct tcf_chain *chain); struct tcf_chain *tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain); struct tcf_proto *tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack); int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack); void tcf_block_put(struct tcf_block *block); void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei); int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, int police, struct tcf_proto *tp, u32 handle, bool used_action_miss); static inline bool tcf_block_shared(struct tcf_block *block) { return block->index; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return block && block->index; } #ifdef CONFIG_NET_CLS_ACT DECLARE_STATIC_KEY_FALSE(tcf_sw_enabled_key); static inline bool tcf_block_bypass_sw(struct tcf_block *block) { return block && !atomic_read(&block->useswcnt); } #endif static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { WARN_ON(tcf_block_shared(block)); return block->q; } int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); static inline bool tc_cls_stats_dump(struct tcf_proto *tp, struct tcf_walker *arg, void *filter) { if (arg->count >= arg->skip && arg->fn(tp, filter, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } #else static inline bool tcf_block_shared(struct tcf_block *block) { return false; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return false; } static inline int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack) { return 0; } static inline int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_block_put(struct tcf_block *block) { } static inline void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { } static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; } static inline int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { return TC_ACT_UNSPEC; } #endif static inline unsigned long __cls_set_class(unsigned long *clp, unsigned long cl) { return xchg(clp, cl); } static inline void __tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { unsigned long cl; cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); cl = __cls_set_class(&r->class, cl); if (cl) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { struct Qdisc *q = tp->chain->block->q; /* Check q as it is not set for shared blocks. In that case, * setting class is not supported. */ if (!q) return; sch_tree_lock(q); __tcf_bind_filter(q, r, base); sch_tree_unlock(q); } static inline void __tcf_unbind_filter(struct Qdisc *q, struct tcf_result *r) { unsigned long cl; if ((cl = __cls_set_class(&r->class, 0)) != 0) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { struct Qdisc *q = tp->chain->block->q; if (!q) return; __tcf_unbind_filter(q, r); } static inline void tc_cls_bind_class(u32 classid, unsigned long cl, void *q, struct tcf_result *res, unsigned long base) { if (res->classid == classid) { if (cl) __tcf_bind_filter(q, res, base); else __tcf_unbind_filter(q, res); } } struct tcf_exts { #ifdef CONFIG_NET_CLS_ACT __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ int nr_actions; struct tc_action **actions; struct net *net; netns_tracker ns_tracker; struct tcf_exts_miss_cookie_node *miss_cookie_node; #endif /* Map to export classifier specific extension TLV types to the * generic extensions API. Unsupported extensions must be set to 0. */ int action; int police; }; static inline int tcf_exts_init(struct tcf_exts *exts, struct net *net, int action, int police) { #ifdef CONFIG_NET_CLS return tcf_exts_init_ex(exts, net, action, police, NULL, 0, false); #else return -EOPNOTSUPP; #endif } /* Return false if the netns is being destroyed in cleanup_net(). Callers * need to do cleanup synchronously in this case, otherwise may race with * tc_action_net_exit(). Return true for other cases. */ static inline bool tcf_exts_get_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT exts->net = maybe_get_net(exts->net); if (exts->net) netns_tracker_alloc(exts->net, &exts->ns_tracker, GFP_KERNEL); return exts->net != NULL; #else return true; #endif } static inline void tcf_exts_put_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT if (exts->net) put_net_track(exts->net, &exts->ns_tracker); #endif } #ifdef CONFIG_NET_CLS_ACT #define tcf_exts_for_each_action(i, a, exts) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = (exts)->actions[i]); i++) #else #define tcf_exts_for_each_action(i, a, exts) \ for (; 0; (void)(i), (void)(a), (void)(exts)) #endif #define tcf_act_for_each_action(i, a, actions) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = actions[i]); i++) static inline bool tc_act_in_hw(struct tc_action *act) { return !!act->in_hw_count; } static inline void tcf_exts_hw_stats_update(const struct tcf_exts *exts, struct flow_stats *stats, bool use_act_stats) { #ifdef CONFIG_NET_CLS_ACT int i; for (i = 0; i < exts->nr_actions; i++) { struct tc_action *a = exts->actions[i]; if (use_act_stats || tc_act_in_hw(a)) { if (!tcf_action_update_hw_stats(a)) continue; } preempt_disable(); tcf_action_stats_update(a, stats->bytes, stats->pkts, stats->drops, stats->lastused, true); preempt_enable(); a->used_hw_stats = stats->used_hw_stats; a->used_hw_stats_valid = stats->used_hw_stats_valid; } #endif } /** * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle * * Returns: true if at least one action is present. */ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT return exts->nr_actions; #else return false; #endif } /** * tcf_exts_exec - execute tc filter extensions * @skb: socket buffer * @exts: tc filter extensions handle * @res: desired result * * Executes all configured extensions. Returns TC_ACT_OK on a normal execution, * a negative number if the filter must be considered unmatched or * a positive action code (TC_ACT_*) which must be returned to the * underlying layer. */ static inline int tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions, exts->nr_actions, res); #endif return TC_ACT_OK; } static inline int tcf_exts_exec_ex(struct sk_buff *skb, struct tcf_exts *exts, int act_index, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions + act_index, exts->nr_actions - act_index, res); #else return TC_ACT_OK; #endif } int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, struct netlink_ext_ack *extack); int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); /** * struct tcf_pkt_info - packet information * * @ptr: start of the pkt data * @nexthdr: offset of the next header */ struct tcf_pkt_info { unsigned char * ptr; int nexthdr; }; #ifdef CONFIG_NET_EMATCH struct tcf_ematch_ops; /** * struct tcf_ematch - extended match (ematch) * * @matchid: identifier to allow userspace to reidentify a match * @flags: flags specifying attributes and the relation to other matches * @ops: the operations lookup table of the corresponding ematch module * @datalen: length of the ematch specific configuration data * @data: ematch specific data * @net: the network namespace */ struct tcf_ematch { struct tcf_ematch_ops * ops; unsigned long data; unsigned int datalen; u16 matchid; u16 flags; struct net *net; }; static inline int tcf_em_is_container(struct tcf_ematch *em) { return !em->ops; } static inline int tcf_em_is_simple(struct tcf_ematch *em) { return em->flags & TCF_EM_SIMPLE; } static inline int tcf_em_is_inverted(struct tcf_ematch *em) { return em->flags & TCF_EM_INVERT; } static inline int tcf_em_last_match(struct tcf_ematch *em) { return (em->flags & TCF_EM_REL_MASK) == TCF_EM_REL_END; } static inline int tcf_em_early_end(struct tcf_ematch *em, int result) { if (tcf_em_last_match(em)) return 1; if (result == 0 && em->flags & TCF_EM_REL_AND) return 1; if (result != 0 && em->flags & TCF_EM_REL_OR) return 1; return 0; } /** * struct tcf_ematch_tree - ematch tree handle * * @hdr: ematch tree header supplied by userspace * @matches: array of ematches */ struct tcf_ematch_tree { struct tcf_ematch_tree_hdr hdr; struct tcf_ematch * matches; }; /** * struct tcf_ematch_ops - ematch module operations * * @kind: identifier (kind) of this ematch module * @datalen: length of expected configuration data (optional) * @change: called during validation (optional) * @match: called during ematch tree evaluation, must return 1/0 * @destroy: called during destroyage (optional) * @dump: called during dumping process (optional) * @owner: owner, must be set to THIS_MODULE * @link: link to previous/next ematch module (internal use) */ struct tcf_ematch_ops { int kind; int datalen; int (*change)(struct net *net, void *, int, struct tcf_ematch *); int (*match)(struct sk_buff *, struct tcf_ematch *, struct tcf_pkt_info *); void (*destroy)(struct tcf_ematch *); int (*dump)(struct sk_buff *, struct tcf_ematch *); struct module *owner; struct list_head link; }; int tcf_em_register(struct tcf_ematch_ops *); void tcf_em_unregister(struct tcf_ematch_ops *); int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *, struct tcf_ematch_tree *); void tcf_em_tree_destroy(struct tcf_ematch_tree *); int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int); int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, struct tcf_pkt_info *); /** * tcf_em_tree_match - evaluate an ematch tree * * @skb: socket buffer of the packet in question * @tree: ematch tree to be used for evaluation * @info: packet information examined by classifier * * This function matches @skb against the ematch tree in @tree by going * through all ematches respecting their logic relations returning * as soon as the result is obvious. * * Returns: 1 if the ematch tree as-one matches, no ematches are configured * or ematch is not enabled in the kernel, otherwise 0 is returned. */ static inline int tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, struct tcf_pkt_info *info) { if (tree->hdr.nmatches) return __tcf_em_tree_match(skb, tree, info); else return 1; } #define MODULE_ALIAS_TCF_EMATCH(kind) MODULE_ALIAS("ematch-kind-" __stringify(kind)) #else /* CONFIG_NET_EMATCH */ struct tcf_ematch_tree { }; #define tcf_em_tree_validate(tp, tb, t) ((void)(t), 0) #define tcf_em_tree_destroy(t) do { (void)(t); } while(0) #define tcf_em_tree_dump(skb, t, tlv) (0) #define tcf_em_tree_match(skb, t, info) ((void)(info), 1) #endif /* CONFIG_NET_EMATCH */ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) { switch (layer) { case TCF_LAYER_LINK: return skb_mac_header(skb); case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: return skb_transport_header(skb); } return NULL; } static inline int tcf_valid_offset(const struct sk_buff *skb, const unsigned char *ptr, const int len) { return likely((ptr + len) <= skb_tail_pointer(skb) && ptr >= skb->head && (ptr <= (ptr + len))); } static inline int tcf_change_indev(struct net *net, struct nlattr *indev_tlv, struct netlink_ext_ack *extack) { char indev[IFNAMSIZ]; struct net_device *dev; if (nla_strscpy(indev, indev_tlv, IFNAMSIZ) < 0) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Interface name too long"); return -EINVAL; } dev = __dev_get_by_name(net, indev); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Network device not found"); return -ENODEV; } return dev->ifindex; } static inline bool tcf_match_indev(struct sk_buff *skb, int ifindex) { if (!ifindex) return true; if (!skb->skb_iif) return false; return ifindex == skb->skb_iif; } int tc_setup_offload_action(struct flow_action *flow_action, const struct tcf_exts *exts, struct netlink_ext_ack *extack); void tc_cleanup_offload_action(struct flow_action *flow_action); int tc_setup_action(struct flow_action *flow_action, struct tc_action *actions[], u32 miss_cookie_base, struct netlink_ext_ack *extack); int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop, bool rtnl_held); int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *old_flags, unsigned int *old_in_hw_count, u32 *new_flags, unsigned int *new_in_hw_count, bool rtnl_held); int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, enum tc_setup_type type, void *type_data, void *cb_priv, u32 *flags, unsigned int *in_hw_count); unsigned int tcf_exts_num_actions(struct tcf_exts *exts); #ifdef CONFIG_NET_CLS_ACT int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch); int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret); int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe); #else static inline int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch) { } static inline int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline struct sk_buff * tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret) { return skb; } static inline int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe) { return 0; } #endif struct tc_cls_u32_knode { struct tcf_exts *exts; struct tcf_result *res; struct tc_u32_sel *sel; u32 handle; u32 val; u32 mask; u32 link_handle; u8 fshift; }; struct tc_cls_u32_hnode { u32 handle; u32 prio; unsigned int divisor; }; enum tc_clsu32_command { TC_CLSU32_NEW_KNODE, TC_CLSU32_REPLACE_KNODE, TC_CLSU32_DELETE_KNODE, TC_CLSU32_NEW_HNODE, TC_CLSU32_REPLACE_HNODE, TC_CLSU32_DELETE_HNODE, }; struct tc_cls_u32_offload { struct flow_cls_common_offload common; /* knode values */ enum tc_clsu32_command command; union { struct tc_cls_u32_knode knode; struct tc_cls_u32_hnode hnode; }; }; static inline bool tc_can_offload(const struct net_device *dev) { return dev->features & NETIF_F_HW_TC; } static inline bool tc_can_offload_extack(const struct net_device *dev, struct netlink_ext_ack *extack) { bool can = tc_can_offload(dev); if (!can) NL_SET_ERR_MSG(extack, "TC offload is disabled on net device"); return can; } static inline bool tc_cls_can_offload_and_chain0(const struct net_device *dev, struct flow_cls_common_offload *common) { if (!tc_can_offload_extack(dev, common->extack)) return false; if (common->chain_index) { NL_SET_ERR_MSG(common->extack, "Driver supports only offload of chain 0"); return false; } return true; } static inline bool tc_skip_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; } static inline bool tc_skip_sw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false; } /* SKIP_HW and SKIP_SW are mutually exclusive flags. */ static inline bool tc_flags_valid(u32 flags) { if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW | TCA_CLS_FLAGS_VERBOSE)) return false; flags &= TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW; if (!(flags ^ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW))) return false; return true; } static inline bool tc_in_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_IN_HW) ? true : false; } static inline void tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, const struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio >> 16; cls_common->skip_sw = tc_skip_sw(flags); if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } static inline void tcf_proto_update_usesw(struct tcf_proto *tp, u32 flags) { if (tp->usesw) return; if (tc_skip_sw(flags) && tc_in_hw(flags)) return; tp->usesw = true; } #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static inline struct tc_skb_ext *tc_skb_ext_alloc(struct sk_buff *skb) { struct tc_skb_ext *tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); if (tc_skb_ext) memset(tc_skb_ext, 0, sizeof(*tc_skb_ext)); return tc_skb_ext; } #endif enum tc_matchall_command { TC_CLSMATCHALL_REPLACE, TC_CLSMATCHALL_DESTROY, TC_CLSMATCHALL_STATS, }; struct tc_cls_matchall_offload { struct flow_cls_common_offload common; enum tc_matchall_command command; struct flow_rule *rule; struct flow_stats stats; bool use_act_stats; unsigned long cookie; }; enum tc_clsbpf_command { TC_CLSBPF_OFFLOAD, TC_CLSBPF_STATS, }; struct tc_cls_bpf_offload { struct flow_cls_common_offload common; enum tc_clsbpf_command command; struct tcf_exts *exts; struct bpf_prog *prog; struct bpf_prog *oldprog; const char *name; bool exts_integrated; }; /* This structure holds cookie structure that is passed from user * to the kernel for actions and classifiers */ struct tc_cookie { u8 *data; u32 len; struct rcu_head rcu; }; struct tc_qopt_offload_stats { struct gnet_stats_basic_sync *bstats; struct gnet_stats_queue *qstats; }; enum tc_mq_command { TC_MQ_CREATE, TC_MQ_DESTROY, TC_MQ_STATS, TC_MQ_GRAFT, }; struct tc_mq_opt_offload_graft_params { unsigned long queue; u32 child_handle; }; struct tc_mq_qopt_offload { enum tc_mq_command command; u32 handle; union { struct tc_qopt_offload_stats stats; struct tc_mq_opt_offload_graft_params graft_params; }; }; enum tc_htb_command { /* Root */ TC_HTB_CREATE, /* Initialize HTB offload. */ TC_HTB_DESTROY, /* Destroy HTB offload. */ /* Classes */ /* Allocate qid and create leaf. */ TC_HTB_LEAF_ALLOC_QUEUE, /* Convert leaf to inner, preserve and return qid, create new leaf. */ TC_HTB_LEAF_TO_INNER, /* Delete leaf, while siblings remain. */ TC_HTB_LEAF_DEL, /* Delete leaf, convert parent to leaf, preserving qid. */ TC_HTB_LEAF_DEL_LAST, /* TC_HTB_LEAF_DEL_LAST, but delete driver data on hardware errors. */ TC_HTB_LEAF_DEL_LAST_FORCE, /* Modify parameters of a node. */ TC_HTB_NODE_MODIFY, /* Class qdisc */ TC_HTB_LEAF_QUERY_QUEUE, /* Query qid by classid. */ }; struct tc_htb_qopt_offload { struct netlink_ext_ack *extack; enum tc_htb_command command; u32 parent_classid; u16 classid; u16 qid; u32 quantum; u64 rate; u64 ceil; u8 prio; }; #define TC_HTB_CLASSID_ROOT U32_MAX enum tc_red_command { TC_RED_REPLACE, TC_RED_DESTROY, TC_RED_STATS, TC_RED_XSTATS, TC_RED_GRAFT, }; struct tc_red_qopt_offload_params { u32 min; u32 max; u32 probability; u32 limit; bool is_ecn; bool is_harddrop; bool is_nodrop; struct gnet_stats_queue *qstats; }; struct tc_red_qopt_offload { enum tc_red_command command; u32 handle; u32 parent; union { struct tc_red_qopt_offload_params set; struct tc_qopt_offload_stats stats; struct red_stats *xstats; u32 child_handle; }; }; enum tc_gred_command { TC_GRED_REPLACE, TC_GRED_DESTROY, TC_GRED_STATS, }; struct tc_gred_vq_qopt_offload_params { bool present; u32 limit; u32 prio; u32 min; u32 max; bool is_ecn; bool is_harddrop; u32 probability; /* Only need backlog, see struct tc_prio_qopt_offload_params */ u32 *backlog; }; struct tc_gred_qopt_offload_params { bool grio_on; bool wred_on; unsigned int dp_cnt; unsigned int dp_def; struct gnet_stats_queue *qstats; struct tc_gred_vq_qopt_offload_params tab[MAX_DPs]; }; struct tc_gred_qopt_offload_stats { struct gnet_stats_basic_sync bstats[MAX_DPs]; struct gnet_stats_queue qstats[MAX_DPs]; struct red_stats *xstats[MAX_DPs]; }; struct tc_gred_qopt_offload { enum tc_gred_command command; u32 handle; u32 parent; union { struct tc_gred_qopt_offload_params set; struct tc_gred_qopt_offload_stats stats; }; }; enum tc_prio_command { TC_PRIO_REPLACE, TC_PRIO_DESTROY, TC_PRIO_STATS, TC_PRIO_GRAFT, }; struct tc_prio_qopt_offload_params { int bands; u8 priomap[TC_PRIO_MAX + 1]; /* At the point of un-offloading the Qdisc, the reported backlog and * qlen need to be reduced by the portion that is in HW. */ struct gnet_stats_queue *qstats; }; struct tc_prio_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_prio_qopt_offload { enum tc_prio_command command; u32 handle; u32 parent; union { struct tc_prio_qopt_offload_params replace_params; struct tc_qopt_offload_stats stats; struct tc_prio_qopt_offload_graft_params graft_params; }; }; enum tc_root_command { TC_ROOT_GRAFT, }; struct tc_root_qopt_offload { enum tc_root_command command; u32 handle; bool ingress; }; enum tc_ets_command { TC_ETS_REPLACE, TC_ETS_DESTROY, TC_ETS_STATS, TC_ETS_GRAFT, }; struct tc_ets_qopt_offload_replace_params { unsigned int bands; u8 priomap[TC_PRIO_MAX + 1]; unsigned int quanta[TCQ_ETS_MAX_BANDS]; /* 0 for strict bands. */ unsigned int weights[TCQ_ETS_MAX_BANDS]; struct gnet_stats_queue *qstats; }; struct tc_ets_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_ets_qopt_offload { enum tc_ets_command command; u32 handle; u32 parent; union { struct tc_ets_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; struct tc_ets_qopt_offload_graft_params graft_params; }; }; enum tc_tbf_command { TC_TBF_REPLACE, TC_TBF_DESTROY, TC_TBF_STATS, TC_TBF_GRAFT, }; struct tc_tbf_qopt_offload_replace_params { struct psched_ratecfg rate; u32 max_size; struct gnet_stats_queue *qstats; }; struct tc_tbf_qopt_offload { enum tc_tbf_command command; u32 handle; u32 parent; union { struct tc_tbf_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; u32 child_handle; }; }; enum tc_fifo_command { TC_FIFO_REPLACE, TC_FIFO_DESTROY, TC_FIFO_STATS, }; struct tc_fifo_qopt_offload { enum tc_fifo_command command; u32 handle; u32 parent; union { struct tc_qopt_offload_stats stats; }; }; #ifdef CONFIG_NET_CLS_ACT DECLARE_STATIC_KEY_FALSE(tc_skb_ext_tc); void tc_skb_ext_tc_enable(void); void tc_skb_ext_tc_disable(void); #define tc_skb_ext_tc_enabled() static_branch_unlikely(&tc_skb_ext_tc) #else /* CONFIG_NET_CLS_ACT */ static inline void tc_skb_ext_tc_enable(void) { } static inline void tc_skb_ext_tc_disable(void) { } #define tc_skb_ext_tc_enabled() false #endif #endif |
2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 | // SPDX-License-Identifier: GPL-2.0-only /* * HID raw devices, giving access to raw HID events. * * In comparison to hiddev, this device does not process the * hid events at all (no parsing, no lookups). This lets applications * to work on raw hid events as they want to, and avoids a need to * use a transport-specific userspace libhid/libusb libraries. * * Copyright (c) 2007-2014 Jiri Kosina */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/fs.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/cdev.h> #include <linux/poll.h> #include <linux/device.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/hid.h> #include <linux/mutex.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/hidraw.h> static int hidraw_major; static struct cdev hidraw_cdev; static const struct class hidraw_class = { .name = "hidraw", }; static struct hidraw *hidraw_table[HIDRAW_MAX_DEVICES]; static DECLARE_RWSEM(minors_rwsem); static inline bool hidraw_is_revoked(struct hidraw_list *list) { return list->revoked; } static ssize_t hidraw_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct hidraw_list *list = file->private_data; int ret = 0, len; DECLARE_WAITQUEUE(wait, current); if (hidraw_is_revoked(list)) return -ENODEV; mutex_lock(&list->read_mutex); while (ret == 0) { if (list->head == list->tail) { add_wait_queue(&list->hidraw->wait, &wait); set_current_state(TASK_INTERRUPTIBLE); while (list->head == list->tail) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } if (!list->hidraw->exist) { ret = -EIO; break; } if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } /* allow O_NONBLOCK to work well from other threads */ mutex_unlock(&list->read_mutex); schedule(); mutex_lock(&list->read_mutex); set_current_state(TASK_INTERRUPTIBLE); } set_current_state(TASK_RUNNING); remove_wait_queue(&list->hidraw->wait, &wait); } if (ret) goto out; len = list->buffer[list->tail].len > count ? count : list->buffer[list->tail].len; if (list->buffer[list->tail].value) { if (copy_to_user(buffer, list->buffer[list->tail].value, len)) { ret = -EFAULT; goto out; } ret = len; } kfree(list->buffer[list->tail].value); list->buffer[list->tail].value = NULL; list->tail = (list->tail + 1) & (HIDRAW_BUFFER_SIZE - 1); } out: mutex_unlock(&list->read_mutex); return ret; } /* * The first byte of the report buffer is expected to be a report number. */ static ssize_t hidraw_send_report(struct file *file, const char __user *buffer, size_t count, unsigned char report_type) { unsigned int minor = iminor(file_inode(file)); struct hid_device *dev; __u8 *buf; int ret = 0; lockdep_assert_held(&minors_rwsem); if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { ret = -ENODEV; goto out; } dev = hidraw_table[minor]->hid; if (count > HID_MAX_BUFFER_SIZE) { hid_warn(dev, "pid %d passed too large report\n", task_pid_nr(current)); ret = -EINVAL; goto out; } if (count < 2) { hid_warn(dev, "pid %d passed too short report\n", task_pid_nr(current)); ret = -EINVAL; goto out; } buf = memdup_user(buffer, count); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out; } if ((report_type == HID_OUTPUT_REPORT) && !(dev->quirks & HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP)) { ret = __hid_hw_output_report(dev, buf, count, (u64)(long)file, false); /* * compatibility with old implementation of USB-HID and I2C-HID: * if the device does not support receiving output reports, * on an interrupt endpoint, fallback to SET_REPORT HID command. */ if (ret != -ENOSYS) goto out_free; } ret = __hid_hw_raw_request(dev, buf[0], buf, count, report_type, HID_REQ_SET_REPORT, (u64)(long)file, false); out_free: kfree(buf); out: return ret; } static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct hidraw_list *list = file->private_data; ssize_t ret; down_read(&minors_rwsem); if (hidraw_is_revoked(list)) ret = -ENODEV; else ret = hidraw_send_report(file, buffer, count, HID_OUTPUT_REPORT); up_read(&minors_rwsem); return ret; } /* * This function performs a Get_Report transfer over the control endpoint * per section 7.2.1 of the HID specification, version 1.1. The first byte * of buffer is the report number to request, or 0x0 if the device does not * use numbered reports. The report_type parameter can be HID_FEATURE_REPORT * or HID_INPUT_REPORT. */ static ssize_t hidraw_get_report(struct file *file, char __user *buffer, size_t count, unsigned char report_type) { unsigned int minor = iminor(file_inode(file)); struct hid_device *dev; __u8 *buf; int ret = 0, len; unsigned char report_number; lockdep_assert_held(&minors_rwsem); if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { ret = -ENODEV; goto out; } dev = hidraw_table[minor]->hid; if (!dev->ll_driver->raw_request) { ret = -ENODEV; goto out; } if (count > HID_MAX_BUFFER_SIZE) { hid_warn(dev, "pid %d passed too large report\n", task_pid_nr(current)); ret = -EINVAL; goto out; } if (count < 2) { hid_warn(dev, "pid %d passed too short report\n", task_pid_nr(current)); ret = -EINVAL; goto out; } buf = kmalloc(count, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto out; } /* * Read the first byte from the user. This is the report number, * which is passed to hid_hw_raw_request(). */ if (copy_from_user(&report_number, buffer, 1)) { ret = -EFAULT; goto out_free; } ret = __hid_hw_raw_request(dev, report_number, buf, count, report_type, HID_REQ_GET_REPORT, (u64)(long)file, false); if (ret < 0) goto out_free; len = (ret < count) ? ret : count; if (copy_to_user(buffer, buf, len)) { ret = -EFAULT; goto out_free; } ret = len; out_free: kfree(buf); out: return ret; } static __poll_t hidraw_poll(struct file *file, poll_table *wait) { struct hidraw_list *list = file->private_data; __poll_t mask = EPOLLOUT | EPOLLWRNORM; /* hidraw is always writable */ poll_wait(file, &list->hidraw->wait, wait); if (list->head != list->tail) mask |= EPOLLIN | EPOLLRDNORM; if (!list->hidraw->exist || hidraw_is_revoked(list)) mask |= EPOLLERR | EPOLLHUP; return mask; } static int hidraw_open(struct inode *inode, struct file *file) { unsigned int minor = iminor(inode); struct hidraw *dev; struct hidraw_list *list; unsigned long flags; int err = 0; if (!(list = kzalloc(sizeof(struct hidraw_list), GFP_KERNEL))) { err = -ENOMEM; goto out; } /* * Technically not writing to the hidraw_table but a write lock is * required to protect the device refcount. This is symmetrical to * hidraw_release(). */ down_write(&minors_rwsem); if (!hidraw_table[minor] || !hidraw_table[minor]->exist) { err = -ENODEV; goto out_unlock; } dev = hidraw_table[minor]; if (!dev->open++) { err = hid_hw_power(dev->hid, PM_HINT_FULLON); if (err < 0) { dev->open--; goto out_unlock; } err = hid_hw_open(dev->hid); if (err < 0) { hid_hw_power(dev->hid, PM_HINT_NORMAL); dev->open--; goto out_unlock; } } list->hidraw = hidraw_table[minor]; mutex_init(&list->read_mutex); spin_lock_irqsave(&hidraw_table[minor]->list_lock, flags); list_add_tail(&list->node, &hidraw_table[minor]->list); spin_unlock_irqrestore(&hidraw_table[minor]->list_lock, flags); file->private_data = list; out_unlock: up_write(&minors_rwsem); out: if (err < 0) kfree(list); return err; } static int hidraw_fasync(int fd, struct file *file, int on) { struct hidraw_list *list = file->private_data; if (hidraw_is_revoked(list)) return -ENODEV; return fasync_helper(fd, file, on, &list->fasync); } static void drop_ref(struct hidraw *hidraw, int exists_bit) { if (exists_bit) { hidraw->exist = 0; if (hidraw->open) { hid_hw_close(hidraw->hid); wake_up_interruptible(&hidraw->wait); } device_destroy(&hidraw_class, MKDEV(hidraw_major, hidraw->minor)); } else { --hidraw->open; } if (!hidraw->open) { if (!hidraw->exist) { hidraw_table[hidraw->minor] = NULL; kfree(hidraw); } else { /* close device for last reader */ hid_hw_close(hidraw->hid); hid_hw_power(hidraw->hid, PM_HINT_NORMAL); } } } static int hidraw_release(struct inode * inode, struct file * file) { unsigned int minor = iminor(inode); struct hidraw_list *list = file->private_data; unsigned long flags; down_write(&minors_rwsem); spin_lock_irqsave(&hidraw_table[minor]->list_lock, flags); while (list->tail != list->head) { kfree(list->buffer[list->tail].value); list->buffer[list->tail].value = NULL; list->tail = (list->tail + 1) & (HIDRAW_BUFFER_SIZE - 1); } list_del(&list->node); spin_unlock_irqrestore(&hidraw_table[minor]->list_lock, flags); kfree(list); drop_ref(hidraw_table[minor], 0); up_write(&minors_rwsem); return 0; } static int hidraw_revoke(struct hidraw_list *list) { list->revoked = true; return 0; } static long hidraw_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); unsigned int minor = iminor(inode); long ret = 0; struct hidraw *dev; struct hidraw_list *list = file->private_data; void __user *user_arg = (void __user*) arg; down_read(&minors_rwsem); dev = hidraw_table[minor]; if (!dev || !dev->exist || hidraw_is_revoked(list)) { ret = -ENODEV; goto out; } switch (cmd) { case HIDIOCGRDESCSIZE: if (put_user(dev->hid->rsize, (int __user *)arg)) ret = -EFAULT; break; case HIDIOCGRDESC: { __u32 len; if (get_user(len, (int __user *)arg)) ret = -EFAULT; else if (len > HID_MAX_DESCRIPTOR_SIZE - 1) ret = -EINVAL; else if (copy_to_user(user_arg + offsetof( struct hidraw_report_descriptor, value[0]), dev->hid->rdesc, min(dev->hid->rsize, len))) ret = -EFAULT; break; } case HIDIOCGRAWINFO: { struct hidraw_devinfo dinfo; dinfo.bustype = dev->hid->bus; dinfo.vendor = dev->hid->vendor; dinfo.product = dev->hid->product; if (copy_to_user(user_arg, &dinfo, sizeof(dinfo))) ret = -EFAULT; break; } case HIDIOCREVOKE: { if (user_arg) ret = -EINVAL; else ret = hidraw_revoke(list); break; } default: { struct hid_device *hid = dev->hid; if (_IOC_TYPE(cmd) != 'H') { ret = -EINVAL; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSFEATURE(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_send_report(file, user_arg, len, HID_FEATURE_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGFEATURE(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_get_report(file, user_arg, len, HID_FEATURE_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSINPUT(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_send_report(file, user_arg, len, HID_INPUT_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGINPUT(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_get_report(file, user_arg, len, HID_INPUT_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSOUTPUT(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_send_report(file, user_arg, len, HID_OUTPUT_REPORT); break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGOUTPUT(0))) { int len = _IOC_SIZE(cmd); ret = hidraw_get_report(file, user_arg, len, HID_OUTPUT_REPORT); break; } /* Begin Read-only ioctls. */ if (_IOC_DIR(cmd) != _IOC_READ) { ret = -EINVAL; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWNAME(0))) { int len = strlen(hid->name) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); ret = copy_to_user(user_arg, hid->name, len) ? -EFAULT : len; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWPHYS(0))) { int len = strlen(hid->phys) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); ret = copy_to_user(user_arg, hid->phys, len) ? -EFAULT : len; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWUNIQ(0))) { int len = strlen(hid->uniq) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); ret = copy_to_user(user_arg, hid->uniq, len) ? -EFAULT : len; break; } } ret = -ENOTTY; } out: up_read(&minors_rwsem); return ret; } static const struct file_operations hidraw_ops = { .owner = THIS_MODULE, .read = hidraw_read, .write = hidraw_write, .poll = hidraw_poll, .open = hidraw_open, .release = hidraw_release, .unlocked_ioctl = hidraw_ioctl, .fasync = hidraw_fasync, .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; int hidraw_report_event(struct hid_device *hid, u8 *data, int len) { struct hidraw *dev = hid->hidraw; struct hidraw_list *list; int ret = 0; unsigned long flags; spin_lock_irqsave(&dev->list_lock, flags); list_for_each_entry(list, &dev->list, node) { int new_head = (list->head + 1) & (HIDRAW_BUFFER_SIZE - 1); if (hidraw_is_revoked(list) || new_head == list->tail) continue; if (!(list->buffer[list->head].value = kmemdup(data, len, GFP_ATOMIC))) { ret = -ENOMEM; break; } list->buffer[list->head].len = len; list->head = new_head; kill_fasync(&list->fasync, SIGIO, POLL_IN); } spin_unlock_irqrestore(&dev->list_lock, flags); wake_up_interruptible(&dev->wait); return ret; } EXPORT_SYMBOL_GPL(hidraw_report_event); int hidraw_connect(struct hid_device *hid) { int minor, result; struct hidraw *dev; /* we accept any HID device, all applications */ dev = kzalloc(sizeof(struct hidraw), GFP_KERNEL); if (!dev) return -ENOMEM; result = -EINVAL; down_write(&minors_rwsem); for (minor = 0; minor < HIDRAW_MAX_DEVICES; minor++) { if (hidraw_table[minor]) continue; hidraw_table[minor] = dev; result = 0; break; } if (result) { up_write(&minors_rwsem); kfree(dev); goto out; } dev->dev = device_create(&hidraw_class, &hid->dev, MKDEV(hidraw_major, minor), NULL, "%s%d", "hidraw", minor); if (IS_ERR(dev->dev)) { hidraw_table[minor] = NULL; up_write(&minors_rwsem); result = PTR_ERR(dev->dev); kfree(dev); goto out; } init_waitqueue_head(&dev->wait); spin_lock_init(&dev->list_lock); INIT_LIST_HEAD(&dev->list); dev->hid = hid; dev->minor = minor; dev->exist = 1; hid->hidraw = dev; up_write(&minors_rwsem); out: return result; } EXPORT_SYMBOL_GPL(hidraw_connect); void hidraw_disconnect(struct hid_device *hid) { struct hidraw *hidraw = hid->hidraw; down_write(&minors_rwsem); drop_ref(hidraw, 1); up_write(&minors_rwsem); } EXPORT_SYMBOL_GPL(hidraw_disconnect); int __init hidraw_init(void) { int result; dev_t dev_id; result = alloc_chrdev_region(&dev_id, HIDRAW_FIRST_MINOR, HIDRAW_MAX_DEVICES, "hidraw"); if (result < 0) { pr_warn("can't get major number\n"); goto out; } hidraw_major = MAJOR(dev_id); result = class_register(&hidraw_class); if (result) goto error_cdev; cdev_init(&hidraw_cdev, &hidraw_ops); result = cdev_add(&hidraw_cdev, dev_id, HIDRAW_MAX_DEVICES); if (result < 0) goto error_class; pr_info("raw HID events driver (C) Jiri Kosina\n"); out: return result; error_class: class_unregister(&hidraw_class); error_cdev: unregister_chrdev_region(dev_id, HIDRAW_MAX_DEVICES); goto out; } void hidraw_exit(void) { dev_t dev_id = MKDEV(hidraw_major, 0); cdev_del(&hidraw_cdev); class_unregister(&hidraw_class); unregister_chrdev_region(dev_id, HIDRAW_MAX_DEVICES); } |
4 4 5 5 1 5 5 5 1 1 1 1 1 4 3 4 4 1 4 1 3 4 3 3 2 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 | // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct fec_req_info { struct ethnl_req_info base; }; struct fec_reply_data { struct ethnl_reply_data base; __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes); u32 active_fec; u8 fec_auto; struct fec_stat_grp { u64 stats[1 + ETHTOOL_MAX_LANES]; u8 cnt; } corr, uncorr, corr_bits; }; #define FEC_REPDATA(__reply_base) \ container_of(__reply_base, struct fec_reply_data, base) #define ETHTOOL_FEC_MASK ((ETHTOOL_FEC_LLRS << 1) - 1) const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1] = { [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), }; static void ethtool_fec_to_link_modes(u32 fec, unsigned long *link_modes, u8 *fec_auto) { if (fec_auto) *fec_auto = !!(fec & ETHTOOL_FEC_AUTO); if (fec & ETHTOOL_FEC_OFF) __set_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes); if (fec & ETHTOOL_FEC_RS) __set_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes); if (fec & ETHTOOL_FEC_BASER) __set_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes); if (fec & ETHTOOL_FEC_LLRS) __set_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes); } static int ethtool_link_modes_to_fecparam(struct ethtool_fecparam *fec, unsigned long *link_modes, u8 fec_auto) { memset(fec, 0, sizeof(*fec)); if (fec_auto) fec->fec |= ETHTOOL_FEC_AUTO; if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes)) fec->fec |= ETHTOOL_FEC_OFF; if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes)) fec->fec |= ETHTOOL_FEC_RS; if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes)) fec->fec |= ETHTOOL_FEC_BASER; if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes)) fec->fec |= ETHTOOL_FEC_LLRS; if (!bitmap_empty(link_modes, __ETHTOOL_LINK_MODE_MASK_NBITS)) return -EINVAL; return 0; } static void fec_stats_recalc(struct fec_stat_grp *grp, struct ethtool_fec_stat *stats) { int i; if (stats->lanes[0] == ETHTOOL_STAT_NOT_SET) { grp->stats[0] = stats->total; grp->cnt = stats->total != ETHTOOL_STAT_NOT_SET; return; } grp->cnt = 1; grp->stats[0] = 0; for (i = 0; i < ETHTOOL_MAX_LANES; i++) { if (stats->lanes[i] == ETHTOOL_STAT_NOT_SET) break; grp->stats[0] += stats->lanes[i]; grp->stats[grp->cnt++] = stats->lanes[i]; } } static int fec_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { __ETHTOOL_DECLARE_LINK_MODE_MASK(active_fec_modes) = {}; struct fec_reply_data *data = FEC_REPDATA(reply_base); struct net_device *dev = reply_base->dev; struct ethtool_fecparam fec = {}; int ret; if (!dev->ethtool_ops->get_fecparam) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = dev->ethtool_ops->get_fecparam(dev, &fec); if (ret) goto out_complete; if (req_base->flags & ETHTOOL_FLAG_STATS && dev->ethtool_ops->get_fec_stats) { struct ethtool_fec_stats stats; ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8); dev->ethtool_ops->get_fec_stats(dev, &stats); fec_stats_recalc(&data->corr, &stats.corrected_blocks); fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks); fec_stats_recalc(&data->corr_bits, &stats.corrected_bits); } WARN_ON_ONCE(fec.reserved); ethtool_fec_to_link_modes(fec.fec, data->fec_link_modes, &data->fec_auto); ethtool_fec_to_link_modes(fec.active_fec, active_fec_modes, NULL); data->active_fec = find_first_bit(active_fec_modes, __ETHTOOL_LINK_MODE_MASK_NBITS); /* Don't report attr if no FEC mode set. Note that * ethtool_fecparam_to_link_modes() ignores NONE and AUTO. */ if (data->active_fec == __ETHTOOL_LINK_MODE_MASK_NBITS) data->active_fec = 0; out_complete: ethnl_ops_complete(dev); return ret; } static int fec_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct fec_reply_data *data = FEC_REPDATA(reply_base); int len = 0; int ret; ret = ethnl_bitset_size(data->fec_link_modes, NULL, __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, compact); if (ret < 0) return ret; len += ret; len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ if (req_base->flags & ETHTOOL_FLAG_STATS) len += 3 * nla_total_size_64bit(sizeof(u64) * (1 + ETHTOOL_MAX_LANES)); return len; } static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) { struct nlattr *nest; nest = nla_nest_start(skb, ETHTOOL_A_FEC_STATS); if (!nest) return -EMSGSIZE; if (nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORRECTED, sizeof(u64) * data->corr.cnt, data->corr.stats, ETHTOOL_A_FEC_STAT_PAD) || nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_UNCORR, sizeof(u64) * data->uncorr.cnt, data->uncorr.stats, ETHTOOL_A_FEC_STAT_PAD) || nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORR_BITS, sizeof(u64) * data->corr_bits.cnt, data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int fec_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct fec_reply_data *data = FEC_REPDATA(reply_base); int ret; ret = ethnl_put_bitset(skb, ETHTOOL_A_FEC_MODES, data->fec_link_modes, NULL, __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names, compact); if (ret < 0) return ret; if (nla_put_u8(skb, ETHTOOL_A_FEC_AUTO, data->fec_auto) || (data->active_fec && nla_put_u32(skb, ETHTOOL_A_FEC_ACTIVE, data->active_fec))) return -EMSGSIZE; if (req_base->flags & ETHTOOL_FLAG_STATS && fec_put_stats(skb, data)) return -EMSGSIZE; return 0; } /* FEC_SET */ const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1] = { [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_FEC_MODES] = { .type = NLA_NESTED }, [ETHTOOL_A_FEC_AUTO] = NLA_POLICY_MAX(NLA_U8, 1), }; static int ethnl_set_fec_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_fecparam && ops->set_fecparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_fec(struct ethnl_req_info *req_info, struct genl_info *info) { __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes) = {}; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; struct ethtool_fecparam fec = {}; bool mod = false; u8 fec_auto; int ret; ret = dev->ethtool_ops->get_fecparam(dev, &fec); if (ret < 0) return ret; ethtool_fec_to_link_modes(fec.fec, fec_link_modes, &fec_auto); ret = ethnl_update_bitset(fec_link_modes, __ETHTOOL_LINK_MODE_MASK_NBITS, tb[ETHTOOL_A_FEC_MODES], link_mode_names, info->extack, &mod); if (ret < 0) return ret; ethnl_update_u8(&fec_auto, tb[ETHTOOL_A_FEC_AUTO], &mod); if (!mod) return 0; ret = ethtool_link_modes_to_fecparam(&fec, fec_link_modes, fec_auto); if (ret) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], "invalid FEC modes requested"); return ret; } if (!fec.fec) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], "no FEC modes set"); return -EINVAL; } ret = dev->ethtool_ops->set_fecparam(dev, &fec); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_fec_request_ops = { .request_cmd = ETHTOOL_MSG_FEC_GET, .reply_cmd = ETHTOOL_MSG_FEC_GET_REPLY, .hdr_attr = ETHTOOL_A_FEC_HEADER, .req_info_size = sizeof(struct fec_req_info), .reply_data_size = sizeof(struct fec_reply_data), .prepare_data = fec_prepare_data, .reply_size = fec_reply_size, .fill_reply = fec_fill_reply, .set_validate = ethnl_set_fec_validate, .set = ethnl_set_fec, .set_ntf_cmd = ETHTOOL_MSG_FEC_NTF, }; |
31 1509 1516 123 130 130 91 91 129 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_ #define _NET_BATMAN_ADV_HARD_INTERFACE_H_ #include "main.h" #include <linux/compiler.h> #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/stddef.h> #include <linux/types.h> /** * enum batadv_hard_if_state - State of a hard interface */ enum batadv_hard_if_state { /** * @BATADV_IF_NOT_IN_USE: interface is not used as slave interface of a * batman-adv soft interface */ BATADV_IF_NOT_IN_USE, /** * @BATADV_IF_TO_BE_REMOVED: interface will be removed from soft * interface */ BATADV_IF_TO_BE_REMOVED, /** @BATADV_IF_INACTIVE: interface is deactivated */ BATADV_IF_INACTIVE, /** @BATADV_IF_ACTIVE: interface is used */ BATADV_IF_ACTIVE, /** @BATADV_IF_TO_BE_ACTIVATED: interface is getting activated */ BATADV_IF_TO_BE_ACTIVATED, }; /** * enum batadv_hard_if_bcast - broadcast avoidance options */ enum batadv_hard_if_bcast { /** @BATADV_HARDIF_BCAST_OK: Do broadcast on according hard interface */ BATADV_HARDIF_BCAST_OK = 0, /** * @BATADV_HARDIF_BCAST_NORECIPIENT: Broadcast not needed, there is no * recipient */ BATADV_HARDIF_BCAST_NORECIPIENT, /** * @BATADV_HARDIF_BCAST_DUPFWD: There is just the neighbor we got it * from */ BATADV_HARDIF_BCAST_DUPFWD, /** @BATADV_HARDIF_BCAST_DUPORIG: There is just the originator */ BATADV_HARDIF_BCAST_DUPORIG, }; extern struct notifier_block batadv_hard_if_notifier; struct net_device *batadv_get_real_netdev(struct net_device *net_device); bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface); bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface); struct batadv_hard_iface* batadv_hardif_get_by_netdev(const struct net_device *net_dev); int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, struct net_device *soft_iface); void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface); int batadv_hardif_min_mtu(struct net_device *soft_iface); void batadv_update_min_mtu(struct net_device *soft_iface); void batadv_hardif_release(struct kref *ref); int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing, u8 *orig_addr, u8 *orig_neigh); /** * batadv_hardif_put() - decrement the hard interface refcounter and possibly * release it * @hard_iface: the hard interface to free */ static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface) { if (!hard_iface) return; kref_put(&hard_iface->refcount, batadv_hardif_release); } /** * batadv_primary_if_get_selected() - Get reference to primary interface * @bat_priv: the bat priv with all the soft interface information * * Return: primary interface (with increased refcnt), otherwise NULL */ static inline struct batadv_hard_iface * batadv_primary_if_get_selected(struct batadv_priv *bat_priv) { struct batadv_hard_iface *hard_iface; rcu_read_lock(); hard_iface = rcu_dereference(bat_priv->primary_if); if (!hard_iface) goto out; if (!kref_get_unless_zero(&hard_iface->refcount)) hard_iface = NULL; out: rcu_read_unlock(); return hard_iface; } #endif /* _NET_BATMAN_ADV_HARD_INTERFACE_H_ */ |
713 93 93 288 427 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 | #undef TRACE_SYSTEM #define TRACE_SYSTEM neigh #if !defined(_TRACE_NEIGH_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NEIGH_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #include <net/neighbour.h> #define neigh_state_str(state) \ __print_symbolic(state, \ { NUD_INCOMPLETE, "incomplete" }, \ { NUD_REACHABLE, "reachable" }, \ { NUD_STALE, "stale" }, \ { NUD_DELAY, "delay" }, \ { NUD_PROBE, "probe" }, \ { NUD_FAILED, "failed" }, \ { NUD_NOARP, "noarp" }, \ { NUD_PERMANENT, "permanent"}) TRACE_EVENT(neigh_create, TP_PROTO(struct neigh_table *tbl, struct net_device *dev, const void *pkey, const struct neighbour *n, bool exempt_from_gc), TP_ARGS(tbl, dev, pkey, n, exempt_from_gc), TP_STRUCT__entry( __field(u32, family) __string(dev, dev ? dev->name : "NULL") __field(int, entries) __field(u8, created) __field(u8, gc_exempt) __array(u8, primary_key4, 4) __array(u8, primary_key6, 16) ), TP_fast_assign( __be32 *p32; __entry->family = tbl->family; __assign_str(dev); __entry->entries = atomic_read(&tbl->gc_entries); __entry->created = n != NULL; __entry->gc_exempt = exempt_from_gc; p32 = (__be32 *)__entry->primary_key4; if (tbl->family == AF_INET) *p32 = *(__be32 *)pkey; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (tbl->family == AF_INET6) { struct in6_addr *pin6; pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)pkey; } #endif ), TP_printk("family %d dev %s entries %d primary_key4 %pI4 primary_key6 %pI6c created %d gc_exempt %d", __entry->family, __get_str(dev), __entry->entries, __entry->primary_key4, __entry->primary_key6, __entry->created, __entry->gc_exempt) ); TRACE_EVENT(neigh_update, TP_PROTO(struct neighbour *n, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid), TP_ARGS(n, lladdr, new, flags, nlmsg_pid), TP_STRUCT__entry( __field(u32, family) __string(dev, (n->dev ? n->dev->name : "NULL")) __array(u8, lladdr, MAX_ADDR_LEN) __field(u8, lladdr_len) __field(u8, flags) __field(u8, nud_state) __field(u8, type) __field(u8, dead) __field(int, refcnt) __array(__u8, primary_key4, 4) __array(__u8, primary_key6, 16) __field(unsigned long, confirmed) __field(unsigned long, updated) __field(unsigned long, used) __array(u8, new_lladdr, MAX_ADDR_LEN) __field(u8, new_state) __field(u32, update_flags) __field(u32, pid) ), TP_fast_assign( int lladdr_len = (n->dev ? n->dev->addr_len : MAX_ADDR_LEN); struct in6_addr *pin6; __be32 *p32; __entry->family = n->tbl->family; __assign_str(dev); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; __entry->nud_state = n->nud_state; __entry->type = n->type; __entry->dead = n->dead; __entry->refcnt = refcount_read(&n->refcnt); pin6 = (struct in6_addr *)__entry->primary_key6; p32 = (__be32 *)__entry->primary_key4; if (n->tbl->family == AF_INET) *p32 = *(__be32 *)n->primary_key; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (n->tbl->family == AF_INET6) { pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)n->primary_key; } else #endif { ipv6_addr_set_v4mapped(*p32, pin6); } __entry->confirmed = n->confirmed; __entry->updated = n->updated; __entry->used = n->used; if (lladdr) memcpy(__entry->new_lladdr, lladdr, lladdr_len); __entry->new_state = new; __entry->update_flags = flags; __entry->pid = nlmsg_pid; ), TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x " "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c " "confirmed %lu updated %lu used %lu new_lladdr %s " "new_state %s update_flags %02x pid %d", __entry->family, __get_str(dev), __print_hex_str(__entry->lladdr, __entry->lladdr_len), __entry->flags, neigh_state_str(__entry->nud_state), __entry->type, __entry->dead, __entry->refcnt, __entry->primary_key4, __entry->primary_key6, __entry->confirmed, __entry->updated, __entry->used, __print_hex_str(__entry->new_lladdr, __entry->lladdr_len), neigh_state_str(__entry->new_state), __entry->update_flags, __entry->pid) ); DECLARE_EVENT_CLASS(neigh__update, TP_PROTO(struct neighbour *n, int err), TP_ARGS(n, err), TP_STRUCT__entry( __field(u32, family) __string(dev, (n->dev ? n->dev->name : "NULL")) __array(u8, lladdr, MAX_ADDR_LEN) __field(u8, lladdr_len) __field(u8, flags) __field(u8, nud_state) __field(u8, type) __field(u8, dead) __field(int, refcnt) __array(__u8, primary_key4, 4) __array(__u8, primary_key6, 16) __field(unsigned long, confirmed) __field(unsigned long, updated) __field(unsigned long, used) __field(u32, err) ), TP_fast_assign( int lladdr_len = (n->dev ? n->dev->addr_len : MAX_ADDR_LEN); struct in6_addr *pin6; __be32 *p32; __entry->family = n->tbl->family; __assign_str(dev); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; __entry->nud_state = n->nud_state; __entry->type = n->type; __entry->dead = n->dead; __entry->refcnt = refcount_read(&n->refcnt); pin6 = (struct in6_addr *)__entry->primary_key6; p32 = (__be32 *)__entry->primary_key4; if (n->tbl->family == AF_INET) *p32 = *(__be32 *)n->primary_key; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (n->tbl->family == AF_INET6) { pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)n->primary_key; } else #endif { ipv6_addr_set_v4mapped(*p32, pin6); } __entry->confirmed = n->confirmed; __entry->updated = n->updated; __entry->used = n->used; __entry->err = err; ), TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x " "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c " "confirmed %lu updated %lu used %lu err %d", __entry->family, __get_str(dev), __print_hex_str(__entry->lladdr, __entry->lladdr_len), __entry->flags, neigh_state_str(__entry->nud_state), __entry->type, __entry->dead, __entry->refcnt, __entry->primary_key4, __entry->primary_key6, __entry->confirmed, __entry->updated, __entry->used, __entry->err) ); DEFINE_EVENT(neigh__update, neigh_update_done, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_timer_handler, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_event_send_done, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_event_send_dead, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_cleanup_and_release, TP_PROTO(struct neighbour *neigh, int rc), TP_ARGS(neigh, rc) ); #endif /* _TRACE_NEIGH_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
10 2 8 8 8 8 11 6 3 10 1 7 1 1 1 1 1 11 5 6 3 5 3 3 1 1 8 3 7 7 7 2 5 4 1 1 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 | // SPDX-License-Identifier: GPL-2.0-only /* * File: af_phonet.c * * Phonet protocols family * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Rémi Denis-Courmont */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/unaligned.h> #include <net/sock.h> #include <linux/if_phonet.h> #include <linux/phonet.h> #include <net/phonet/phonet.h> #include <net/phonet/pn_dev.h> /* Transport protocol registration */ static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; static const struct phonet_protocol *phonet_proto_get(unsigned int protocol) { const struct phonet_protocol *pp; if (protocol >= PHONET_NPROTO) return NULL; rcu_read_lock(); pp = rcu_dereference(proto_tab[protocol]); if (pp && !try_module_get(pp->prot->owner)) pp = NULL; rcu_read_unlock(); return pp; } static inline void phonet_proto_put(const struct phonet_protocol *pp) { module_put(pp->prot->owner); } /* protocol family functions */ static int pn_socket_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct pn_sock *pn; const struct phonet_protocol *pnp; int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (protocol == 0) { /* Default protocol selection */ switch (sock->type) { case SOCK_DGRAM: protocol = PN_PROTO_PHONET; break; case SOCK_SEQPACKET: protocol = PN_PROTO_PIPE; break; default: return -EPROTONOSUPPORT; } } pnp = phonet_proto_get(protocol); if (pnp == NULL && request_module("net-pf-%d-proto-%d", PF_PHONET, protocol) == 0) pnp = phonet_proto_get(protocol); if (pnp == NULL) return -EPROTONOSUPPORT; if (sock->type != pnp->sock_type) { err = -EPROTONOSUPPORT; goto out; } sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot, kern); if (sk == NULL) { err = -ENOMEM; goto out; } sock_init_data(sock, sk); sock->state = SS_UNCONNECTED; sock->ops = pnp->ops; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; sk->sk_protocol = protocol; pn = pn_sk(sk); pn->sobject = 0; pn->dobject = 0; pn->resource = 0; sk->sk_prot->init(sk); err = 0; out: phonet_proto_put(pnp); return err; } static const struct net_proto_family phonet_proto_family = { .family = PF_PHONET, .create = pn_socket_create, .owner = THIS_MODULE, }; /* Phonet device header operations */ static int pn_header_create(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { u8 *media = skb_push(skb, 1); if (type != ETH_P_PHONET) return -1; if (!saddr) saddr = dev->dev_addr; *media = *(const u8 *)saddr; return 1; } static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr) { const u8 *media = skb_mac_header(skb); *haddr = *media; return 1; } const struct header_ops phonet_header_ops = { .create = pn_header_create, .parse = pn_header_parse, }; EXPORT_SYMBOL(phonet_header_ops); /* * Prepends an ISI header and sends a datagram. */ static int pn_send(struct sk_buff *skb, struct net_device *dev, u16 dst, u16 src, u8 res) { struct phonethdr *ph; int err; if (skb->len + 2 > 0xffff /* Phonet length field limit */ || skb->len + sizeof(struct phonethdr) > dev->mtu) { err = -EMSGSIZE; goto drop; } /* Broadcast sending is not implemented */ if (pn_addr(dst) == PNADDR_BROADCAST) { err = -EOPNOTSUPP; goto drop; } skb_reset_transport_header(skb); WARN_ON(skb_headroom(skb) & 1); /* HW assumes word alignment */ skb_push(skb, sizeof(struct phonethdr)); skb_reset_network_header(skb); ph = pn_hdr(skb); ph->pn_rdev = pn_dev(dst); ph->pn_sdev = pn_dev(src); ph->pn_res = res; ph->pn_length = __cpu_to_be16(skb->len + 2 - sizeof(*ph)); ph->pn_robj = pn_obj(dst); ph->pn_sobj = pn_obj(src); skb->protocol = htons(ETH_P_PHONET); skb->priority = 0; skb->dev = dev; if (skb->pkt_type == PACKET_LOOPBACK) { skb_reset_mac_header(skb); skb_orphan(skb); err = netif_rx(skb) ? -ENOBUFS : 0; } else { err = dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len); if (err < 0) { err = -EHOSTUNREACH; goto drop; } err = dev_queue_xmit(skb); if (unlikely(err > 0)) err = net_xmit_errno(err); } return err; drop: kfree_skb(skb); return err; } static int pn_raw_send(const void *data, int len, struct net_device *dev, u16 dst, u16 src, u8 res) { struct sk_buff *skb = alloc_skb(MAX_PHONET_HEADER + len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; if (phonet_address_lookup(dev_net(dev), pn_addr(dst)) == 0) skb->pkt_type = PACKET_LOOPBACK; skb_reserve(skb, MAX_PHONET_HEADER); __skb_put(skb, len); skb_copy_to_linear_data(skb, data, len); return pn_send(skb, dev, dst, src, res); } /* * Create a Phonet header for the skb and send it out. Returns * non-zero error code if failed. The skb is freed then. */ int pn_skb_send(struct sock *sk, struct sk_buff *skb, const struct sockaddr_pn *target) { struct net *net = sock_net(sk); struct net_device *dev; struct pn_sock *pn = pn_sk(sk); int err; u16 src, dst; u8 daddr, saddr, res; src = pn->sobject; if (target != NULL) { dst = pn_sockaddr_get_object(target); res = pn_sockaddr_get_resource(target); } else { dst = pn->dobject; res = pn->resource; } daddr = pn_addr(dst); err = -EHOSTUNREACH; if (sk->sk_bound_dev_if) dev = dev_get_by_index(net, sk->sk_bound_dev_if); else if (phonet_address_lookup(net, daddr) == 0) { dev = phonet_device_get(net); skb->pkt_type = PACKET_LOOPBACK; } else if (dst == 0) { /* Resource routing (small race until phonet_rcv()) */ struct sock *sk = pn_find_sock_by_res(net, res); if (sk) { sock_put(sk); dev = phonet_device_get(net); skb->pkt_type = PACKET_LOOPBACK; } else dev = phonet_route_output(net, daddr); } else dev = phonet_route_output(net, daddr); if (!dev || !(dev->flags & IFF_UP)) goto drop; saddr = phonet_address_get(dev, daddr); if (saddr == PN_NO_ADDR) goto drop; if (!pn_addr(src)) src = pn_object(saddr, pn_obj(src)); err = pn_send(skb, dev, dst, src, res); dev_put(dev); return err; drop: kfree_skb(skb); dev_put(dev); return err; } EXPORT_SYMBOL(pn_skb_send); /* Do not send an error message in response to an error message */ static inline int can_respond(struct sk_buff *skb) { const struct phonethdr *ph; const struct phonetmsg *pm; u8 submsg_id; if (!pskb_may_pull(skb, 3)) return 0; ph = pn_hdr(skb); if (ph->pn_res == PN_PREFIX && !pskb_may_pull(skb, 5)) return 0; if (ph->pn_res == PN_COMMGR) /* indications */ return 0; ph = pn_hdr(skb); /* re-acquires the pointer */ pm = pn_msg(skb); if (pm->pn_msg_id != PN_COMMON_MESSAGE) return 1; submsg_id = (ph->pn_res == PN_PREFIX) ? pm->pn_e_submsg_id : pm->pn_submsg_id; if (submsg_id != PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP && pm->pn_e_submsg_id != PN_COMM_SERVICE_NOT_IDENTIFIED_RESP) return 1; return 0; } static int send_obj_unreachable(struct sk_buff *rskb) { const struct phonethdr *oph = pn_hdr(rskb); const struct phonetmsg *opm = pn_msg(rskb); struct phonetmsg resp; memset(&resp, 0, sizeof(resp)); resp.pn_trans_id = opm->pn_trans_id; resp.pn_msg_id = PN_COMMON_MESSAGE; if (oph->pn_res == PN_PREFIX) { resp.pn_e_res_id = opm->pn_e_res_id; resp.pn_e_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP; resp.pn_e_orig_msg_id = opm->pn_msg_id; resp.pn_e_status = 0; } else { resp.pn_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP; resp.pn_orig_msg_id = opm->pn_msg_id; resp.pn_status = 0; } return pn_raw_send(&resp, sizeof(resp), rskb->dev, pn_object(oph->pn_sdev, oph->pn_sobj), pn_object(oph->pn_rdev, oph->pn_robj), oph->pn_res); } static int send_reset_indications(struct sk_buff *rskb) { struct phonethdr *oph = pn_hdr(rskb); static const u8 data[4] = { 0x00 /* trans ID */, 0x10 /* subscribe msg */, 0x00 /* subscription count */, 0x00 /* dummy */ }; return pn_raw_send(data, sizeof(data), rskb->dev, pn_object(oph->pn_sdev, 0x00), pn_object(oph->pn_rdev, oph->pn_robj), PN_COMMGR); } /* packet type functions */ /* * Stuff received packets to associated sockets. * On error, returns non-zero and releases the skb. */ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pkttype, struct net_device *orig_dev) { struct net *net = dev_net(dev); struct phonethdr *ph; struct sockaddr_pn sa; u16 len; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; /* check we have at least a full Phonet header */ if (!pskb_pull(skb, sizeof(struct phonethdr))) goto out; /* check that the advertised length is correct */ ph = pn_hdr(skb); len = get_unaligned_be16(&ph->pn_length); if (len < 2) goto out; len -= 2; if ((len > skb->len) || pskb_trim(skb, len)) goto out; skb_reset_transport_header(skb); pn_skb_get_dst_sockaddr(skb, &sa); /* check if this is broadcasted */ if (pn_sockaddr_get_addr(&sa) == PNADDR_BROADCAST) { pn_deliver_sock_broadcast(net, skb); goto out; } /* resource routing */ if (pn_sockaddr_get_object(&sa) == 0) { struct sock *sk = pn_find_sock_by_res(net, sa.spn_resource); if (sk) return sk_receive_skb(sk, skb, 0); } /* check if we are the destination */ if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) { /* Phonet packet input */ struct sock *sk = pn_find_sock_by_sa(net, &sa); if (sk) return sk_receive_skb(sk, skb, 0); if (can_respond(skb)) { send_obj_unreachable(skb); send_reset_indications(skb); } } else if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) goto out; /* Race between address deletion and loopback */ else { /* Phonet packet routing */ struct net_device *out_dev; out_dev = phonet_route_output(net, pn_sockaddr_get_addr(&sa)); if (!out_dev) { net_dbg_ratelimited("No Phonet route to %02X\n", pn_sockaddr_get_addr(&sa)); goto out; } __skb_push(skb, sizeof(struct phonethdr)); skb->dev = out_dev; if (out_dev == dev) { net_dbg_ratelimited("Phonet loop to %02X on %s\n", pn_sockaddr_get_addr(&sa), dev->name); goto out_dev; } /* Some drivers (e.g. TUN) do not allocate HW header space */ if (skb_cow_head(skb, out_dev->hard_header_len)) goto out_dev; if (dev_hard_header(skb, out_dev, ETH_P_PHONET, NULL, NULL, skb->len) < 0) goto out_dev; dev_queue_xmit(skb); dev_put(out_dev); return NET_RX_SUCCESS; out_dev: dev_put(out_dev); } out: kfree_skb(skb); return NET_RX_DROP; } static struct packet_type phonet_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_PHONET), .func = phonet_rcv, }; static DEFINE_MUTEX(proto_tab_lock); int __init_or_module phonet_proto_register(unsigned int protocol, const struct phonet_protocol *pp) { int err = 0; if (protocol >= PHONET_NPROTO) return -EINVAL; err = proto_register(pp->prot, 1); if (err) return err; mutex_lock(&proto_tab_lock); if (proto_tab[protocol]) err = -EBUSY; else rcu_assign_pointer(proto_tab[protocol], pp); mutex_unlock(&proto_tab_lock); return err; } EXPORT_SYMBOL(phonet_proto_register); void phonet_proto_unregister(unsigned int protocol, const struct phonet_protocol *pp) { mutex_lock(&proto_tab_lock); BUG_ON(proto_tab[protocol] != pp); RCU_INIT_POINTER(proto_tab[protocol], NULL); mutex_unlock(&proto_tab_lock); synchronize_rcu(); proto_unregister(pp->prot); } EXPORT_SYMBOL(phonet_proto_unregister); /* Module registration */ static int __init phonet_init(void) { int err; err = phonet_device_init(); if (err) return err; pn_sock_init(); err = sock_register(&phonet_proto_family); if (err) { printk(KERN_ALERT "phonet protocol family initialization failed\n"); goto err_sock; } dev_add_pack(&phonet_packet_type); phonet_sysctl_init(); err = isi_register(); if (err) goto err; return 0; err: phonet_sysctl_exit(); sock_unregister(PF_PHONET); dev_remove_pack(&phonet_packet_type); err_sock: phonet_device_exit(); return err; } static void __exit phonet_exit(void) { isi_unregister(); phonet_sysctl_exit(); sock_unregister(PF_PHONET); dev_remove_pack(&phonet_packet_type); phonet_device_exit(); } module_init(phonet_init); module_exit(phonet_exit); MODULE_DESCRIPTION("Phonet protocol stack for Linux"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PHONET); |
7 8 81 45 60 70 9 12 33 4 236 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _NET_ETHTOOL_NETLINK_H #define _NET_ETHTOOL_NETLINK_H #include <linux/ethtool_netlink.h> #include <linux/netdevice.h> #include <net/genetlink.h> #include <net/sock.h> struct ethnl_req_info; int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *nest, struct net *net, struct netlink_ext_ack *extack, bool require_dev); int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, u16 attrtype); struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd); void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd); int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); /** * ethnl_strz_size() - calculate attribute length for fixed size string * @s: ETH_GSTRING_LEN sized string (may not be null terminated) * * Return: total length of an attribute with null terminated string from @s */ static inline int ethnl_strz_size(const char *s) { return nla_total_size(strnlen(s, ETH_GSTRING_LEN) + 1); } /** * ethnl_put_strz() - put string attribute with fixed size string * @skb: skb with the message * @attrtype: attribute type * @s: ETH_GSTRING_LEN sized string (may not be null terminated) * * Puts an attribute with null terminated string from @s into the message. * * Return: 0 on success, negative error code on failure */ static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype, const char *s) { unsigned int len = strnlen(s, ETH_GSTRING_LEN); struct nlattr *attr; attr = nla_reserve(skb, attrtype, len + 1); if (!attr) return -EMSGSIZE; memcpy(nla_data(attr), s, len); ((char *)nla_data(attr))[len] = '\0'; return 0; } /** * ethnl_update_u32() - update u32 value from NLA_U32 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Copy the u32 value from NLA_U32 netlink attribute @attr into variable * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod * is set to true if this function changed the value of *dst, otherwise it * is left as is. */ static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr, bool *mod) { u32 val; if (!attr) return; val = nla_get_u32(attr); if (*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_u8() - update u8 value from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Copy the u8 value from NLA_U8 netlink attribute @attr into variable * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod * is set to true if this function changed the value of *dst, otherwise it * is left as is. */ static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = nla_get_u8(attr); if (*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_bool32() - update u32 used as bool from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the u8 value from NLA_U8 netlink attribute @attr to set u32 variable * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is * null. Bool pointed to by @mod is set to true if this function changed the * logical value of *dst, otherwise it is left as is. */ static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = !!nla_get_u8(attr); if (!!*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_bool() - updateb bool used as bool from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the bool value from NLA_U8 netlink attribute @attr to set bool variable * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is * null. Bool pointed to by @mod is set to true if this function changed the * logical value of *dst, otherwise it is left as is. */ static inline void ethnl_update_bool(bool *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = !!nla_get_u8(attr); if (!!*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_binary() - update binary data from NLA_BINARY attribute * @dst: value to update * @len: destination buffer length * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the u8 value from NLA_U8 netlink attribute @attr to rewrite data block * of length @len at @dst by attribute payload; do nothing if @attr is null. * Bool pointed to by @mod is set to true if this function changed the logical * value of *dst, otherwise it is left as is. */ static inline void ethnl_update_binary(void *dst, unsigned int len, const struct nlattr *attr, bool *mod) { if (!attr) return; if (nla_len(attr) < len) len = nla_len(attr); if (!memcmp(dst, nla_data(attr), len)) return; memcpy(dst, nla_data(attr), len); *mod = true; } /** * ethnl_update_bitfield32() - update u32 value from NLA_BITFIELD32 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Update bits in u32 value which are set in attribute's mask to values from * attribute's value. Do nothing if @attr is null or the value wouldn't change; * otherwise, set bool pointed to by @mod to true. */ static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr, bool *mod) { struct nla_bitfield32 change; u32 newval; if (!attr) return; change = nla_get_bitfield32(attr); newval = (*dst & ~change.selector) | (change.value & change.selector); if (*dst == newval) return; *dst = newval; *mod = true; } /** * ethnl_reply_header_size() - total size of reply header * * This is an upper estimate so that we do not need to hold RTNL lock longer * than necessary (to prevent rename between size estimate and composing the * message). Accounts only for device ifindex and name as those are the only * attributes ethnl_fill_reply_header() puts into the reply header. */ static inline unsigned int ethnl_reply_header_size(void) { return nla_total_size(nla_total_size(sizeof(u32)) + nla_total_size(IFNAMSIZ)); } /* GET request handling */ /* Unified processing of GET requests uses two data structures: request info * and reply data. Request info holds information parsed from client request * and its stays constant through all request processing. Reply data holds data * retrieved from ethtool_ops callbacks or other internal sources which is used * to compose the reply. When processing a dump request, request info is filled * only once (when the request message is parsed) but reply data is filled for * each reply message. * * Both structures consist of part common for all request types (struct * ethnl_req_info and struct ethnl_reply_data defined below) and optional * parts specific for each request type. Common part always starts at offset 0. */ /** * struct ethnl_req_info - base type of request information for GET requests * @dev: network device the request is for (may be null) * @dev_tracker: refcount tracker for @dev reference * @flags: request flags common for all request types * @phy_index: phy_device index connected to @dev this request is for. Can be * 0 if the request doesn't target a phy, or if the @dev's attached * phy is targeted. * * This is a common base for request specific structures holding data from * parsed userspace request. These always embed struct ethnl_req_info at * zero offset. */ struct ethnl_req_info { struct net_device *dev; netdevice_tracker dev_tracker; u32 flags; u32 phy_index; }; static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info) { netdev_put(req_info->dev, &req_info->dev_tracker); } /** * ethnl_req_get_phydev() - Gets the phy_device targeted by this request, * if any. Must be called under rntl_lock(). * @req_info: The ethnl request to get the phy from. * @header: The netlink header, used for error reporting. * @extack: The netlink extended ACK, for error reporting. * * The caller must hold RTNL, until it's done interacting with the returned * phy_device. * * Return: A phy_device pointer corresponding either to the passed phy_index * if one is provided. If not, the phy_device attached to the * net_device targeted by this request is returned. If there's no * targeted net_device, or no phy_device is attached, NULL is * returned. If the provided phy_index is invalid, an error pointer * is returned. */ struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, const struct nlattr *header, struct netlink_ext_ack *extack); /** * struct ethnl_reply_data - base type of reply data for GET requests * @dev: device for current reply message; in single shot requests it is * equal to ðnl_req_info.dev; in dumps it's different for each * reply message * * This is a common base for request specific structures holding data for * kernel reply message. These always embed struct ethnl_reply_data at zero * offset. */ struct ethnl_reply_data { struct net_device *dev; }; int ethnl_ops_begin(struct net_device *dev); void ethnl_ops_complete(struct net_device *dev); enum ethnl_sock_type { ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH, }; struct ethnl_sock_priv { struct net_device *dev; u32 portid; enum ethnl_sock_type type; }; int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, enum ethnl_sock_type type); /** * struct ethnl_request_ops - unified handling of GET and SET requests * @request_cmd: command id for request (GET) * @reply_cmd: command id for reply (GET_REPLY) * @hdr_attr: attribute type for request header * @req_info_size: size of request info * @reply_data_size: size of reply data * @allow_nodev_do: allow non-dump request with no device identification * @set_ntf_cmd: notification to generate on changes (SET) * @parse_request: * Parse request except common header (struct ethnl_req_info). Common * header is already filled on entry, the rest up to @repdata_offset * is zero initialized. This callback should only modify type specific * request info by parsed attributes from request message. * @prepare_data: * Retrieve and prepare data needed to compose a reply message. Calls to * ethtool_ops handlers are limited to this callback. Common reply data * (struct ethnl_reply_data) is filled on entry, type specific part after * it is zero initialized. This callback should only modify the type * specific part of reply data. Device identification from struct * ethnl_reply_data is to be used as for dump requests, it iterates * through network devices while dev member of struct ethnl_req_info * points to the device from client request. * @reply_size: * Estimate reply message size. Returned value must be sufficient for * message payload without common reply header. The callback may returned * estimate higher than actual message size if exact calculation would * not be worth the saved memory space. * @fill_reply: * Fill reply message payload (except for common header) from reply data. * The callback must not generate more payload than previously called * ->reply_size() estimated. * @cleanup_data: * Optional cleanup called when reply data is no longer needed. Can be * used e.g. to free any additional data structures outside the main * structure which were allocated by ->prepare_data(). When processing * dump requests, ->cleanup() is called for each message. * @set_validate: * Check if set operation is supported for a given device, and perform * extra input checks. Expected return values: * - 0 if the operation is a noop for the device (rare) * - 1 if operation should proceed to calling @set * - negative errno on errors * Called without any locks, just a reference on the netdev. * @set: * Execute the set operation. The implementation should return * - 0 if no configuration has changed * - 1 if configuration changed and notification should be generated * - negative errno on errors * * Description of variable parts of GET request handling when using the * unified infrastructure. When used, a pointer to an instance of this * structure is to be added to ðnl_default_requests array and generic * handlers ethnl_default_doit(), ethnl_default_dumpit(), * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops; * ethnl_default_notify() can be used in @ethnl_notify_handlers to send * notifications of the corresponding type. */ struct ethnl_request_ops { u8 request_cmd; u8 reply_cmd; u16 hdr_attr; unsigned int req_info_size; unsigned int reply_data_size; bool allow_nodev_do; u8 set_ntf_cmd; int (*parse_request)(struct ethnl_req_info *req_info, struct nlattr **tb, struct netlink_ext_ack *extack); int (*prepare_data)(const struct ethnl_req_info *req_info, struct ethnl_reply_data *reply_data, const struct genl_info *info); int (*reply_size)(const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); int (*fill_reply)(struct sk_buff *skb, const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); void (*cleanup_data)(struct ethnl_reply_data *reply_data); int (*set_validate)(struct ethnl_req_info *req_info, struct genl_info *info); int (*set)(struct ethnl_req_info *req_info, struct genl_info *info); }; /* request handlers */ extern const struct ethnl_request_ops ethnl_strset_request_ops; extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; extern const struct ethnl_request_ops ethnl_linkstate_request_ops; extern const struct ethnl_request_ops ethnl_debug_request_ops; extern const struct ethnl_request_ops ethnl_wol_request_ops; extern const struct ethnl_request_ops ethnl_features_request_ops; extern const struct ethnl_request_ops ethnl_privflags_request_ops; extern const struct ethnl_request_ops ethnl_rings_request_ops; extern const struct ethnl_request_ops ethnl_channels_request_ops; extern const struct ethnl_request_ops ethnl_coalesce_request_ops; extern const struct ethnl_request_ops ethnl_pause_request_ops; extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; extern const struct ethnl_request_ops ethnl_stats_request_ops; extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct ethnl_request_ops ethnl_module_request_ops; extern const struct ethnl_request_ops ethnl_pse_request_ops; extern const struct ethnl_request_ops ethnl_rss_request_ops; extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops; extern const struct ethnl_request_ops ethnl_plca_status_request_ops; extern const struct ethnl_request_ops ethnl_mm_request_ops; extern const struct ethnl_request_ops ethnl_phy_request_ops; extern const struct ethnl_request_ops ethnl_tsconfig_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_phy[ETHTOOL_A_HEADER_PHY_INDEX + 1]; extern const struct nla_policy ethnl_header_policy_phy_stats[ETHTOOL_A_HEADER_PHY_INDEX + 1]; extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1]; extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1]; extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1]; extern const struct nla_policy ethnl_linkmodes_get_policy[ETHTOOL_A_LINKMODES_HEADER + 1]; extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_LANES + 1]; extern const struct nla_policy ethnl_linkstate_get_policy[ETHTOOL_A_LINKSTATE_HEADER + 1]; extern const struct nla_policy ethnl_debug_get_policy[ETHTOOL_A_DEBUG_HEADER + 1]; extern const struct nla_policy ethnl_debug_set_policy[ETHTOOL_A_DEBUG_MSGMASK + 1]; extern const struct nla_policy ethnl_wol_get_policy[ETHTOOL_A_WOL_HEADER + 1]; extern const struct nla_policy ethnl_wol_set_policy[ETHTOOL_A_WOL_SOPASS + 1]; extern const struct nla_policy ethnl_features_get_policy[ETHTOOL_A_FEATURES_HEADER + 1]; extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANTED + 1]; extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_HDS_THRESH_MAX + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1]; extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_STATS_SRC + 1]; extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1]; extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1]; extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1]; extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1]; extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1]; extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1]; extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1]; extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1]; extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1]; extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1]; extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]; extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1]; extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1]; extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1]; extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1]; extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1]; extern const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1]; extern const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1]; int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info); int ethnl_rss_dump_start(struct netlink_callback *cb); int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_phy_start(struct netlink_callback *cb); int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_phy_done(struct netlink_callback *cb); int ethnl_tsinfo_start(struct netlink_callback *cb); int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_tsinfo_done(struct netlink_callback *cb); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN]; extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN]; extern const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN]; #endif /* _NET_ETHTOOL_NETLINK_H */ |
1212 930 238 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_BL_H #define _LINUX_RCULIST_BL_H /* * RCU-protected bl list version. See include/linux/list_bl.h. */ #include <linux/list_bl.h> #include <linux/rcupdate.h> static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); rcu_assign_pointer(h->first, (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); } /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_bl_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry(). */ static inline void hlist_bl_del_rcu(struct hlist_bl_node *n) { __hlist_bl_del(n); n->pprev = LIST_POISON2; } /** * hlist_bl_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_bl, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first; /* don't need hlist_bl_first_rcu because we're under lock */ first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; /* need _rcu because we can have concurrent lock free readers */ hlist_bl_set_first_rcu(h, n); } /** * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_bl_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_bl_node within the struct. * */ #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ for (pos = hlist_bl_first_rcu(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(pos->next)) #endif |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BOOTMEM_INFO_H #define __LINUX_BOOTMEM_INFO_H #include <linux/mm.h> #include <linux/kmemleak.h> /* * Types for free bootmem stored in the low bits of page->private. */ enum bootmem_type { MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 1, SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, MIX_SECTION_INFO, NODE_INFO, MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, }; #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE void __init register_page_bootmem_info_node(struct pglist_data *pgdat); void get_page_bootmem(unsigned long info, struct page *page, enum bootmem_type type); void put_page_bootmem(struct page *page); static inline enum bootmem_type bootmem_type(const struct page *page) { return (unsigned long)page->private & 0xf; } static inline unsigned long bootmem_info(const struct page *page) { return (unsigned long)page->private >> 4; } /* * Any memory allocated via the memblock allocator and not via the * buddy will be marked reserved already in the memmap. For those * pages, we can call this function to free it to buddy allocator. */ static inline void free_bootmem_page(struct page *page) { enum bootmem_type type = bootmem_type(page); /* * The reserve_bootmem_region sets the reserved flag on bootmem * pages. */ VM_BUG_ON_PAGE(page_ref_count(page) != 2, page); if (type == SECTION_INFO || type == MIX_SECTION_INFO) put_page_bootmem(page); else VM_BUG_ON_PAGE(1, page); } #else static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } static inline void put_page_bootmem(struct page *page) { } static inline enum bootmem_type bootmem_type(const struct page *page) { return SECTION_INFO; } static inline unsigned long bootmem_info(const struct page *page) { return 0; } static inline void get_page_bootmem(unsigned long info, struct page *page, enum bootmem_type type) { } static inline void free_bootmem_page(struct page *page) { kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE); free_reserved_page(page); } #endif #endif /* __LINUX_BOOTMEM_INFO_H */ |
3 1 5 4 1 1 5 1 1 1 1 4 5 5 5 5 5 2 2 2 3 2 1 1 4 4 2 2 2 2 21 21 19 19 225 225 2 2 2 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> struct nft_rbtree { struct rb_root root; rwlock_t lock; seqcount_rwlock_t count; unsigned long last_gc; }; struct nft_rbtree_elem { struct nft_elem_priv priv; struct rb_node node; struct nft_set_ext ext; }; static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) { return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); } static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) { return !nft_rbtree_interval_end(rbe); } static int nft_rbtree_cmp(const struct nft_set *set, const struct nft_rbtree_elem *e1, const struct nft_rbtree_elem *e2) { return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext), set->klen); } static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) { return nft_set_elem_expired(&rbe->ext); } static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext, unsigned int seq) { struct nft_rbtree *priv = nft_set_priv(set); const struct nft_rbtree_elem *rbe, *interval = NULL; u8 genmask = nft_genmask_cur(net); const struct rb_node *parent; int d; parent = rcu_dereference_raw(priv->root.rb_node); while (parent != NULL) { if (read_seqcount_retry(&priv->count, seq)) return false; rbe = rb_entry(parent, struct nft_rbtree_elem, node); d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); if (d < 0) { parent = rcu_dereference_raw(parent->rb_left); if (interval && !nft_rbtree_cmp(set, rbe, interval) && nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; interval = rbe; } else if (d > 0) parent = rcu_dereference_raw(parent->rb_right); else { if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = rcu_dereference_raw(parent->rb_left); continue; } if (nft_rbtree_elem_expired(rbe)) return false; if (nft_rbtree_interval_end(rbe)) { if (nft_set_is_anonymous(set)) return false; parent = rcu_dereference_raw(parent->rb_left); interval = NULL; continue; } *ext = &rbe->ext; return true; } } if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; } return false; } INDIRECT_CALLABLE_SCOPE bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = read_seqcount_begin(&priv->count); bool ret; ret = __nft_rbtree_lookup(net, set, key, ext, seq); if (ret || !read_seqcount_retry(&priv->count, seq)) return ret; read_lock_bh(&priv->lock); seq = read_seqcount_begin(&priv->count); ret = __nft_rbtree_lookup(net, set, key, ext, seq); read_unlock_bh(&priv->lock); return ret; } static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, const u32 *key, struct nft_rbtree_elem **elem, unsigned int seq, unsigned int flags, u8 genmask) { struct nft_rbtree_elem *rbe, *interval = NULL; struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent; const void *this; int d; parent = rcu_dereference_raw(priv->root.rb_node); while (parent != NULL) { if (read_seqcount_retry(&priv->count, seq)) return false; rbe = rb_entry(parent, struct nft_rbtree_elem, node); this = nft_set_ext_key(&rbe->ext); d = memcmp(this, key, set->klen); if (d < 0) { parent = rcu_dereference_raw(parent->rb_left); if (!(flags & NFT_SET_ELEM_INTERVAL_END)) interval = rbe; } else if (d > 0) { parent = rcu_dereference_raw(parent->rb_right); if (flags & NFT_SET_ELEM_INTERVAL_END) interval = rbe; } else { if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = rcu_dereference_raw(parent->rb_left); continue; } if (nft_set_elem_expired(&rbe->ext)) return false; if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) || (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == (flags & NFT_SET_ELEM_INTERVAL_END)) { *elem = rbe; return true; } if (nft_rbtree_interval_end(rbe)) interval = NULL; parent = rcu_dereference_raw(parent->rb_left); } } if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && !nft_set_elem_expired(&interval->ext) && ((!nft_rbtree_interval_end(interval) && !(flags & NFT_SET_ELEM_INTERVAL_END)) || (nft_rbtree_interval_end(interval) && (flags & NFT_SET_ELEM_INTERVAL_END)))) { *elem = interval; return true; } return false; } static struct nft_elem_priv * nft_rbtree_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = read_seqcount_begin(&priv->count); struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT); const u32 *key = (const u32 *)&elem->key.val; u8 genmask = nft_genmask_cur(net); bool ret; ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); if (ret || !read_seqcount_retry(&priv->count, seq)) return &rbe->priv; read_lock_bh(&priv->lock); seq = read_seqcount_begin(&priv->count); ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); read_unlock_bh(&priv->lock); if (!ret) return ERR_PTR(-ENOENT); return &rbe->priv; } static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set, struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) { lockdep_assert_held_write(&priv->lock); nft_setelem_data_deactivate(net, set, &rbe->priv); rb_erase(&rbe->node, &priv->root); } static const struct nft_rbtree_elem * nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); struct net *net = read_pnet(&set->net); struct nft_rbtree_elem *rbe_prev; struct nft_trans_gc *gc; gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); if (!gc) return ERR_PTR(-ENOMEM); /* search for end interval coming before this element. * end intervals don't carry a timeout extension, they * are coupled with the interval start element. */ while (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe_prev) && nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY)) break; prev = rb_prev(prev); } rbe_prev = NULL; if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev); /* There is always room in this trans gc for this element, * memory allocation never actually happens, hence, the warning * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, * this is synchronous gc which never fails. */ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) return ERR_PTR(-ENOMEM); nft_trans_gc_elem_add(gc, rbe_prev); } nft_rbtree_gc_elem_remove(net, set, priv, rbe); gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) return ERR_PTR(-ENOMEM); nft_trans_gc_elem_add(gc, rbe); nft_trans_gc_queue_sync_done(gc); return rbe_prev; } static bool nft_rbtree_update_first(const struct nft_set *set, struct nft_rbtree_elem *rbe, struct rb_node *first) { struct nft_rbtree_elem *first_elem; first_elem = rb_entry(first, struct nft_rbtree_elem, node); /* this element is closest to where the new element is to be inserted: * update the first element for the node list path. */ if (nft_rbtree_cmp(set, rbe, first_elem) < 0) return true; return false; } static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, struct nft_elem_priv **elem_priv) { struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); u64 tstamp = nft_net_tstamp(net); int d; /* Descend the tree to search for an existing element greater than the * key value to insert that is greater than the new element. This is the * first element to walk the ordered elements to find possible overlap. */ parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { parent = *p; rbe = rb_entry(parent, struct nft_rbtree_elem, node); d = nft_rbtree_cmp(set, rbe, new); if (d < 0) { p = &parent->rb_left; } else if (d > 0) { if (!first || nft_rbtree_update_first(set, rbe, first)) first = &rbe->node; p = &parent->rb_right; } else { if (nft_rbtree_interval_end(rbe)) p = &parent->rb_left; else p = &parent->rb_right; } } if (!first) first = rb_first(&priv->root); /* Detect overlap by going through the list of valid tree nodes. * Values stored in the tree are in reversed order, starting from * highest to lowest value. */ for (node = first; node != NULL; node = next) { next = rb_next(node); rbe = rb_entry(node, struct nft_rbtree_elem, node); if (!nft_set_elem_active(&rbe->ext, genmask)) continue; /* perform garbage collection to avoid bogus overlap reports * but skip new elements in this transaction. */ if (__nft_set_elem_expired(&rbe->ext, tstamp) && nft_set_elem_active(&rbe->ext, cur_genmask)) { const struct nft_rbtree_elem *removed_end; removed_end = nft_rbtree_gc_elem(set, priv, rbe); if (IS_ERR(removed_end)) return PTR_ERR(removed_end); if (removed_end == rbe_le || removed_end == rbe_ge) return -EAGAIN; continue; } d = nft_rbtree_cmp(set, rbe, new); if (d == 0) { /* Matching end element: no need to look for an * overlapping greater or equal element. */ if (nft_rbtree_interval_end(rbe)) { rbe_le = rbe; break; } /* first element that is greater or equal to key value. */ if (!rbe_ge) { rbe_ge = rbe; continue; } /* this is a closer more or equal element, update it. */ if (nft_rbtree_cmp(set, rbe_ge, new) != 0) { rbe_ge = rbe; continue; } /* element is equal to key value, make sure flags are * the same, an existing more or equal start element * must not be replaced by more or equal end element. */ if ((nft_rbtree_interval_start(new) && nft_rbtree_interval_start(rbe_ge)) || (nft_rbtree_interval_end(new) && nft_rbtree_interval_end(rbe_ge))) { rbe_ge = rbe; continue; } } else if (d > 0) { /* annotate element greater than the new element. */ rbe_ge = rbe; continue; } else if (d < 0) { /* annotate element less than the new element. */ rbe_le = rbe; break; } } /* - new start element matching existing start element: full overlap * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. */ if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { *elem_priv = &rbe_ge->priv; return -EEXIST; } /* - new end element matching existing end element: full overlap * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. */ if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { *elem_priv = &rbe_le->priv; return -EEXIST; } /* - new start element with existing closest, less or equal key value * being a start element: partial overlap, reported as -ENOTEMPTY. * Anonymous sets allow for two consecutive start element since they * are constant, skip them to avoid bogus overlap reports. */ if (!nft_set_is_anonymous(set) && rbe_le && nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) return -ENOTEMPTY; /* - new end element with existing closest, less or equal key value * being a end element: partial overlap, reported as -ENOTEMPTY. */ if (rbe_le && nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; /* - new end element with existing closest, greater or equal key value * being an end element: partial overlap, reported as -ENOTEMPTY */ if (rbe_ge && nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; /* Accepted element: pick insertion point depending on key value */ parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { parent = *p; rbe = rb_entry(parent, struct nft_rbtree_elem, node); d = nft_rbtree_cmp(set, rbe, new); if (d < 0) p = &parent->rb_left; else if (d > 0) p = &parent->rb_right; else if (nft_rbtree_interval_end(rbe)) p = &parent->rb_left; else p = &parent->rb_right; } rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; } static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **elem_priv) { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv); struct nft_rbtree *priv = nft_set_priv(set); int err; do { if (fatal_signal_pending(current)) return -EINTR; cond_resched(); write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); err = __nft_rbtree_insert(net, set, rbe, elem_priv); write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); } while (err == -EAGAIN); return err; } static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) { write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); rb_erase(&rbe->node, &priv->root); write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); } static void nft_rbtree_remove(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); struct nft_rbtree *priv = nft_set_priv(set); nft_rbtree_erase(priv, rbe); } static void nft_rbtree_activate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); nft_clear(net, &rbe->ext); } static void nft_rbtree_flush(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv); nft_set_elem_change_active(net, set, &rbe->ext); } static struct nft_elem_priv * nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv); const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; u8 genmask = nft_genmask_next(net); u64 tstamp = nft_net_tstamp(net); int d; while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val, set->klen); if (d < 0) parent = parent->rb_left; else if (d > 0) parent = parent->rb_right; else { if (nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(this)) { parent = parent->rb_left; continue; } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(this)) { parent = parent->rb_right; continue; } else if (__nft_set_elem_expired(&rbe->ext, tstamp)) { break; } else if (!nft_set_elem_active(&rbe->ext, genmask)) { parent = parent->rb_left; continue; } nft_rbtree_flush(net, set, &rbe->priv); return &rbe->priv; } } return NULL; } static void nft_rbtree_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *node; read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); if (iter->count < iter->skip) goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); if (iter->err < 0) { read_unlock_bh(&priv->lock); return; } cont: iter->count++; } read_unlock_bh(&priv->lock); } static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) { nft_setelem_data_deactivate(net, set, &rbe->priv); nft_rbtree_erase(priv, rbe); } static void nft_rbtree_gc(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *rbe_end = NULL; struct net *net = read_pnet(&set->net); u64 tstamp = nft_net_tstamp(net); struct rb_node *node, *next; struct nft_trans_gc *gc; set = nft_set_container_of(priv); net = read_pnet(&set->net); gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (!gc) return; for (node = rb_first(&priv->root); node ; node = next) { next = rb_next(node); rbe = rb_entry(node, struct nft_rbtree_elem, node); /* elements are reversed in the rbtree for historical reasons, * from highest to lowest value, that is why end element is * always visited before the start element. */ if (nft_rbtree_interval_end(rbe)) { rbe_end = rbe; continue; } if (!__nft_set_elem_expired(&rbe->ext, tstamp)) continue; gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) goto try_later; /* end element needs to be removed first, it has * no timeout extension. */ if (rbe_end) { nft_rbtree_gc_remove(net, set, priv, rbe_end); nft_trans_gc_elem_add(gc, rbe_end); rbe_end = NULL; } gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) goto try_later; nft_rbtree_gc_remove(net, set, priv, rbe); nft_trans_gc_elem_add(gc, rbe); } try_later: if (gc) { gc = nft_trans_gc_catchall_sync(gc); nft_trans_gc_queue_sync_done(gc); priv->last_gc = jiffies; } } static u64 nft_rbtree_privsize(const struct nlattr * const nla[], const struct nft_set_desc *desc) { return sizeof(struct nft_rbtree); } static int nft_rbtree_init(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]) { struct nft_rbtree *priv = nft_set_priv(set); BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0); rwlock_init(&priv->lock); seqcount_rwlock_init(&priv->count, &priv->lock); priv->root = RB_ROOT; return 0; } static void nft_rbtree_destroy(const struct nft_ctx *ctx, const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *node; while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); nf_tables_set_elem_destroy(ctx, set, &rbe->priv); } } static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { if (desc->field_count > 1) return false; if (desc->size) est->size = sizeof(struct nft_rbtree) + desc->size * sizeof(struct nft_rbtree_elem); else est->size = ~0; est->lookup = NFT_SET_CLASS_O_LOG_N; est->space = NFT_SET_CLASS_O_N; return true; } static void nft_rbtree_commit(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) nft_rbtree_gc(set); } static void nft_rbtree_gc_init(const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); priv->last_gc = jiffies; } /* rbtree stores ranges as singleton elements, each range is composed of two * elements ... */ static u32 nft_rbtree_ksize(u32 size) { return size * 2; } /* ... hide this detail to userspace. */ static u32 nft_rbtree_usize(u32 size) { if (!size) return 0; return size / 2; } static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *node; const void *key; node = rb_last(&priv->root); if (!node) return 0; rbe = rb_entry(node, struct nft_rbtree_elem, node); if (!nft_rbtree_interval_end(rbe)) return 0; key = nft_set_ext_key(&rbe->ext); if (memchr(key, 1, set->klen)) return 0; /* this is the all-zero no-match element. */ return 1; } const struct nft_set_type nft_set_rbtree_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { .privsize = nft_rbtree_privsize, .elemsize = offsetof(struct nft_rbtree_elem, ext), .estimate = nft_rbtree_estimate, .init = nft_rbtree_init, .destroy = nft_rbtree_destroy, .insert = nft_rbtree_insert, .remove = nft_rbtree_remove, .deactivate = nft_rbtree_deactivate, .flush = nft_rbtree_flush, .activate = nft_rbtree_activate, .commit = nft_rbtree_commit, .gc_init = nft_rbtree_gc_init, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, .get = nft_rbtree_get, .ksize = nft_rbtree_ksize, .usize = nft_rbtree_usize, .adjust_maxsize = nft_rbtree_adjust_maxsize, }, }; |
46 45 46 46 15 16 5 2 6 3 41 41 41 1 1 1 1 2 2 1 1 1 1 1 1 1 6 41 40 41 1 1 4 4 4 16 16 16 16 16 16 2 2 41 2 1 1 41 41 41 1 41 186 187 41 41 41 41 41 17 24 40 39 40 42 16 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 61 1 62 61 34 34 32 1 1 36 40 47 47 47 48 48 58 70 70 22 57 56 57 54 2 67 60 61 40 61 61 61 60 1 1 1 1 1 1 1 1 1 1 1 1 1 1 33 33 265 292 33 285 1 282 283 33 283 34 34 275 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 7 7 6 2 2 2 2 1 1 1 14 6 2 4 6 5 4 1 3 23 24 3 6 15 132 54 118 50 50 50 52 52 52 6 4 2 51 51 51 51 2 2 1 1 49 202 77 5 129 41 41 2 4 1 1 1 1 1 5 1 2 2 2 2 2 1 2 46 46 2 2 2 46 42 42 16 42 42 9 8 1 61 1 4 4 1 1 1 1 1 8 8 8 8 2 2 2 2 8 10 2 8 8 11 5 8 8 3 1 2 5 1 4 5 2 3 7 4 3 17 17 116 116 42 13 13 13 115 116 42 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Bridge multicast support. * * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au> */ #include <linux/err.h> #include <linux/export.h> #include <linux/if_ether.h> #include <linux/igmp.h> #include <linux/in.h> #include <linux/jhash.h> #include <linux/kernel.h> #include <linux/log2.h> #include <linux/netdevice.h> #include <linux/netfilter_bridge.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/inetdevice.h> #include <linux/mroute.h> #include <net/ip.h> #include <net/switchdev.h> #if IS_ENABLED(CONFIG_IPV6) #include <linux/icmpv6.h> #include <net/ipv6.h> #include <net/mld.h> #include <net/ip6_checksum.h> #include <net/addrconf.h> #endif #include <trace/events/bridge.h> #include "br_private.h" #include "br_private_mcast_eht.h" static const struct rhashtable_params br_mdb_rht_params = { .head_offset = offsetof(struct net_bridge_mdb_entry, rhnode), .key_offset = offsetof(struct net_bridge_mdb_entry, addr), .key_len = sizeof(struct br_ip), .automatic_shrinking = true, }; static const struct rhashtable_params br_sg_port_rht_params = { .head_offset = offsetof(struct net_bridge_port_group, rhnode), .key_offset = offsetof(struct net_bridge_port_group, key), .key_len = sizeof(struct net_bridge_port_group_sg_key), .automatic_shrinking = true, }; static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query); static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx); static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src); static void br_multicast_port_group_rexmit(struct timer_list *t); static void br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted); static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx); #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src); #endif static struct net_bridge_port_group * __br_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, bool igmpv2_mldv1, bool blocked); static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg); static void __br_multicast_stop(struct net_bridge_mcast *brmctx); static int br_mc_disabled_update(struct net_device *dev, bool value, struct netlink_ext_ack *extack); static struct net_bridge_port_group * br_sg_port_find(struct net_bridge *br, struct net_bridge_port_group_sg_key *sg_p) { lockdep_assert_held_once(&br->multicast_lock); return rhashtable_lookup_fast(&br->sg_port_tbl, sg_p, br_sg_port_rht_params); } static struct net_bridge_mdb_entry *br_mdb_ip_get_rcu(struct net_bridge *br, struct br_ip *dst) { return rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params); } struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst) { struct net_bridge_mdb_entry *ent; lockdep_assert_held_once(&br->multicast_lock); rcu_read_lock(); ent = rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params); rcu_read_unlock(); return ent; } static struct net_bridge_mdb_entry *br_mdb_ip4_get(struct net_bridge *br, __be32 dst, __u16 vid) { struct br_ip br_dst; memset(&br_dst, 0, sizeof(br_dst)); br_dst.dst.ip4 = dst; br_dst.proto = htons(ETH_P_IP); br_dst.vid = vid; return br_mdb_ip_get(br, &br_dst); } #if IS_ENABLED(CONFIG_IPV6) static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br, const struct in6_addr *dst, __u16 vid) { struct br_ip br_dst; memset(&br_dst, 0, sizeof(br_dst)); br_dst.dst.ip6 = *dst; br_dst.proto = htons(ETH_P_IPV6); br_dst.vid = vid; return br_mdb_ip_get(br, &br_dst); } #endif struct net_bridge_mdb_entry * br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid) { struct net_bridge *br = brmctx->br; struct br_ip ip; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || br_multicast_ctx_vlan_global_disabled(brmctx)) return NULL; if (BR_INPUT_SKB_CB(skb)->igmp) return NULL; memset(&ip, 0, sizeof(ip)); ip.proto = skb->protocol; ip.vid = vid; switch (skb->protocol) { case htons(ETH_P_IP): ip.dst.ip4 = ip_hdr(skb)->daddr; if (brmctx->multicast_igmp_version == 3) { struct net_bridge_mdb_entry *mdb; ip.src.ip4 = ip_hdr(skb)->saddr; mdb = br_mdb_ip_get_rcu(br, &ip); if (mdb) return mdb; ip.src.ip4 = 0; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ip.dst.ip6 = ipv6_hdr(skb)->daddr; if (brmctx->multicast_mld_version == 2) { struct net_bridge_mdb_entry *mdb; ip.src.ip6 = ipv6_hdr(skb)->saddr; mdb = br_mdb_ip_get_rcu(br, &ip); if (mdb) return mdb; memset(&ip.src.ip6, 0, sizeof(ip.src.ip6)); } break; #endif default: ip.proto = 0; ether_addr_copy(ip.dst.mac_addr, eth_hdr(skb)->h_dest); } return br_mdb_ip_get_rcu(br, &ip); } /* IMPORTANT: this function must be used only when the contexts cannot be * passed down (e.g. timer) and must be used for read-only purposes because * the vlan snooping option can change, so it can return any context * (non-vlan or vlan). Its initial intended purpose is to read timer values * from the *current* context based on the option. At worst that could lead * to inconsistent timers when the contexts are changed, i.e. src timer * which needs to re-arm with a specific delay taken from the old context */ static struct net_bridge_mcast_port * br_multicast_pg_to_port_ctx(const struct net_bridge_port_group *pg) { struct net_bridge_mcast_port *pmctx = &pg->key.port->multicast_ctx; struct net_bridge_vlan *vlan; lockdep_assert_held_once(&pg->key.port->br->multicast_lock); /* if vlan snooping is disabled use the port's multicast context */ if (!pg->key.addr.vid || !br_opt_get(pg->key.port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) goto out; /* locking is tricky here, due to different rules for multicast and * vlans we need to take rcu to find the vlan and make sure it has * the BR_VLFLAG_MCAST_ENABLED flag set, it can only change under * multicast_lock which must be already held here, so the vlan's pmctx * can safely be used on return */ rcu_read_lock(); vlan = br_vlan_find(nbp_vlan_group_rcu(pg->key.port), pg->key.addr.vid); if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) pmctx = &vlan->port_mcast_ctx; else pmctx = NULL; rcu_read_unlock(); out: return pmctx; } static struct net_bridge_mcast_port * br_multicast_port_vid_to_port_ctx(struct net_bridge_port *port, u16 vid) { struct net_bridge_mcast_port *pmctx = NULL; struct net_bridge_vlan *vlan; lockdep_assert_held_once(&port->br->multicast_lock); if (!br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) return NULL; /* Take RCU to access the vlan. */ rcu_read_lock(); vlan = br_vlan_find(nbp_vlan_group_rcu(port), vid); if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) pmctx = &vlan->port_mcast_ctx; rcu_read_unlock(); return pmctx; } /* when snooping we need to check if the contexts should be used * in the following order: * - if pmctx is non-NULL (port), check if it should be used * - if pmctx is NULL (bridge), check if brmctx should be used */ static bool br_multicast_ctx_should_use(const struct net_bridge_mcast *brmctx, const struct net_bridge_mcast_port *pmctx) { if (!netif_running(brmctx->br->dev)) return false; if (pmctx) return !br_multicast_port_ctx_state_disabled(pmctx); else return !br_multicast_ctx_vlan_disabled(brmctx); } static bool br_port_group_equal(struct net_bridge_port_group *p, struct net_bridge_port *port, const unsigned char *src) { if (p->key.port != port) return false; if (!(port->flags & BR_MULTICAST_TO_UNICAST)) return true; return ether_addr_equal(src, p->eth_addr); } static void __fwd_add_star_excl(struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *sg_ip) { struct net_bridge_port_group_sg_key sg_key; struct net_bridge_port_group *src_pg; struct net_bridge_mcast *brmctx; memset(&sg_key, 0, sizeof(sg_key)); brmctx = br_multicast_port_ctx_get_global(pmctx); sg_key.port = pg->key.port; sg_key.addr = *sg_ip; if (br_sg_port_find(brmctx->br, &sg_key)) return; src_pg = __br_multicast_add_group(brmctx, pmctx, sg_ip, pg->eth_addr, MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) return; src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL; } static void __fwd_del_star_excl(struct net_bridge_port_group *pg, struct br_ip *sg_ip) { struct net_bridge_port_group_sg_key sg_key; struct net_bridge *br = pg->key.port->br; struct net_bridge_port_group *src_pg; memset(&sg_key, 0, sizeof(sg_key)); sg_key.port = pg->key.port; sg_key.addr = *sg_ip; src_pg = br_sg_port_find(br, &sg_key); if (!src_pg || !(src_pg->flags & MDB_PG_FLAGS_STAR_EXCL) || src_pg->rt_protocol != RTPROT_KERNEL) return; br_multicast_find_del_pg(br, src_pg); } /* When a port group transitions to (or is added as) EXCLUDE we need to add it * to all other ports' S,G entries which are not blocked by the current group * for proper replication, the assumption is that any S,G blocked entries * are already added so the S,G,port lookup should skip them. * When a port group transitions from EXCLUDE -> INCLUDE mode or is being * deleted we need to remove it from all ports' S,G entries where it was * automatically installed before (i.e. where it's MDB_PG_FLAGS_STAR_EXCL). */ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, u8 filter_mode) { struct net_bridge *br = pg->key.port->br; struct net_bridge_port_group *pg_lst; struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mp; struct br_ip sg_ip; if (WARN_ON(!br_multicast_is_star_g(&pg->key.addr))) return; mp = br_mdb_ip_get(br, &pg->key.addr); if (!mp) return; pmctx = br_multicast_pg_to_port_ctx(pg); if (!pmctx) return; memset(&sg_ip, 0, sizeof(sg_ip)); sg_ip = pg->key.addr; for (pg_lst = mlock_dereference(mp->ports, br); pg_lst; pg_lst = mlock_dereference(pg_lst->next, br)) { struct net_bridge_group_src *src_ent; if (pg_lst == pg) continue; hlist_for_each_entry(src_ent, &pg_lst->src_list, node) { if (!(src_ent->flags & BR_SGRP_F_INSTALLED)) continue; sg_ip.src = src_ent->addr.src; switch (filter_mode) { case MCAST_INCLUDE: __fwd_del_star_excl(pg, &sg_ip); break; case MCAST_EXCLUDE: __fwd_add_star_excl(pmctx, pg, &sg_ip); break; } } } } /* called when adding a new S,G with host_joined == false by default */ static void br_multicast_sg_host_state(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg) { struct net_bridge_mdb_entry *sg_mp; if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) return; if (!star_mp->host_joined) return; sg_mp = br_mdb_ip_get(star_mp->br, &sg->key.addr); if (!sg_mp) return; sg_mp->host_joined = true; } /* set the host_joined state of all of *,G's S,G entries */ static void br_multicast_star_g_host_state(struct net_bridge_mdb_entry *star_mp) { struct net_bridge *br = star_mp->br; struct net_bridge_mdb_entry *sg_mp; struct net_bridge_port_group *pg; struct br_ip sg_ip; if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) return; memset(&sg_ip, 0, sizeof(sg_ip)); sg_ip = star_mp->addr; for (pg = mlock_dereference(star_mp->ports, br); pg; pg = mlock_dereference(pg->next, br)) { struct net_bridge_group_src *src_ent; hlist_for_each_entry(src_ent, &pg->src_list, node) { if (!(src_ent->flags & BR_SGRP_F_INSTALLED)) continue; sg_ip.src = src_ent->addr.src; sg_mp = br_mdb_ip_get(br, &sg_ip); if (!sg_mp) continue; sg_mp->host_joined = star_mp->host_joined; } } } static void br_multicast_sg_del_exclude_ports(struct net_bridge_mdb_entry *sgmp) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; /* *,G exclude ports are only added to S,G entries */ if (WARN_ON(br_multicast_is_star_g(&sgmp->addr))) return; /* we need the STAR_EXCLUDE ports if there are non-STAR_EXCLUDE ports * we should ignore perm entries since they're managed by user-space */ for (pp = &sgmp->ports; (p = mlock_dereference(*pp, sgmp->br)) != NULL; pp = &p->next) if (!(p->flags & (MDB_PG_FLAGS_STAR_EXCL | MDB_PG_FLAGS_PERMANENT))) return; /* currently the host can only have joined the *,G which means * we treat it as EXCLUDE {}, so for an S,G it's considered a * STAR_EXCLUDE entry and we can safely leave it */ sgmp->host_joined = false; for (pp = &sgmp->ports; (p = mlock_dereference(*pp, sgmp->br)) != NULL;) { if (!(p->flags & MDB_PG_FLAGS_PERMANENT)) br_multicast_del_pg(sgmp, p, pp); else pp = &p->next; } } void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg) { struct net_bridge_port_group_sg_key sg_key; struct net_bridge *br = star_mp->br; struct net_bridge_mcast_port *pmctx; struct net_bridge_port_group *pg; struct net_bridge_mcast *brmctx; if (WARN_ON(br_multicast_is_star_g(&sg->key.addr))) return; if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) return; br_multicast_sg_host_state(star_mp, sg); memset(&sg_key, 0, sizeof(sg_key)); sg_key.addr = sg->key.addr; /* we need to add all exclude ports to the S,G */ for (pg = mlock_dereference(star_mp->ports, br); pg; pg = mlock_dereference(pg->next, br)) { struct net_bridge_port_group *src_pg; if (pg == sg || pg->filter_mode == MCAST_INCLUDE) continue; sg_key.port = pg->key.port; if (br_sg_port_find(br, &sg_key)) continue; pmctx = br_multicast_pg_to_port_ctx(pg); if (!pmctx) continue; brmctx = br_multicast_port_ctx_get_global(pmctx); src_pg = __br_multicast_add_group(brmctx, pmctx, &sg->key.addr, sg->eth_addr, MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) continue; src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL; } } static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) { struct net_bridge_mdb_entry *star_mp; struct net_bridge_mcast_port *pmctx; struct net_bridge_port_group *sg; struct net_bridge_mcast *brmctx; struct br_ip sg_ip; if (src->flags & BR_SGRP_F_INSTALLED) return; memset(&sg_ip, 0, sizeof(sg_ip)); pmctx = br_multicast_pg_to_port_ctx(src->pg); if (!pmctx) return; brmctx = br_multicast_port_ctx_get_global(pmctx); sg_ip = src->pg->key.addr; sg_ip.src = src->addr.src; sg = __br_multicast_add_group(brmctx, pmctx, &sg_ip, src->pg->eth_addr, MCAST_INCLUDE, false, !timer_pending(&src->timer)); if (IS_ERR_OR_NULL(sg)) return; src->flags |= BR_SGRP_F_INSTALLED; sg->flags &= ~MDB_PG_FLAGS_STAR_EXCL; /* if it was added by user-space as perm we can skip next steps */ if (sg->rt_protocol != RTPROT_KERNEL && (sg->flags & MDB_PG_FLAGS_PERMANENT)) return; /* the kernel is now responsible for removing this S,G */ del_timer(&sg->timer); star_mp = br_mdb_ip_get(src->br, &src->pg->key.addr); if (!star_mp) return; br_multicast_sg_add_exclude_ports(star_mp, sg); } static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src, bool fastleave) { struct net_bridge_port_group *p, *pg = src->pg; struct net_bridge_port_group __rcu **pp; struct net_bridge_mdb_entry *mp; struct br_ip sg_ip; memset(&sg_ip, 0, sizeof(sg_ip)); sg_ip = pg->key.addr; sg_ip.src = src->addr.src; mp = br_mdb_ip_get(src->br, &sg_ip); if (!mp) return; for (pp = &mp->ports; (p = mlock_dereference(*pp, src->br)) != NULL; pp = &p->next) { if (!br_port_group_equal(p, pg->key.port, pg->eth_addr)) continue; if (p->rt_protocol != RTPROT_KERNEL && (p->flags & MDB_PG_FLAGS_PERMANENT) && !(src->flags & BR_SGRP_F_USER_ADDED)) break; if (fastleave) p->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_del_pg(mp, p, pp); break; } src->flags &= ~BR_SGRP_F_INSTALLED; } /* install S,G and based on src's timer enable or disable forwarding */ static void br_multicast_fwd_src_handle(struct net_bridge_group_src *src) { struct net_bridge_port_group_sg_key sg_key; struct net_bridge_port_group *sg; u8 old_flags; br_multicast_fwd_src_add(src); memset(&sg_key, 0, sizeof(sg_key)); sg_key.addr = src->pg->key.addr; sg_key.addr.src = src->addr.src; sg_key.port = src->pg->key.port; sg = br_sg_port_find(src->br, &sg_key); if (!sg || (sg->flags & MDB_PG_FLAGS_PERMANENT)) return; old_flags = sg->flags; if (timer_pending(&src->timer)) sg->flags &= ~MDB_PG_FLAGS_BLOCKED; else sg->flags |= MDB_PG_FLAGS_BLOCKED; if (old_flags != sg->flags) { struct net_bridge_mdb_entry *sg_mp; sg_mp = br_mdb_ip_get(src->br, &sg_key.addr); if (!sg_mp) return; br_mdb_notify(src->br->dev, sg_mp, sg, RTM_NEWMDB); } } static void br_multicast_destroy_mdb_entry(struct net_bridge_mcast_gc *gc) { struct net_bridge_mdb_entry *mp; mp = container_of(gc, struct net_bridge_mdb_entry, mcast_gc); WARN_ON(!hlist_unhashed(&mp->mdb_node)); WARN_ON(mp->ports); timer_shutdown_sync(&mp->timer); kfree_rcu(mp, rcu); } static void br_multicast_del_mdb_entry(struct net_bridge_mdb_entry *mp) { struct net_bridge *br = mp->br; rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode, br_mdb_rht_params); hlist_del_init_rcu(&mp->mdb_node); hlist_add_head(&mp->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); } static void br_multicast_group_expired(struct timer_list *t) { struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer); struct net_bridge *br = mp->br; spin_lock(&br->multicast_lock); if (hlist_unhashed(&mp->mdb_node) || !netif_running(br->dev) || timer_pending(&mp->timer)) goto out; br_multicast_host_leave(mp, true); if (mp->ports) goto out; br_multicast_del_mdb_entry(mp); out: spin_unlock(&br->multicast_lock); } static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc) { struct net_bridge_group_src *src; src = container_of(gc, struct net_bridge_group_src, mcast_gc); WARN_ON(!hlist_unhashed(&src->node)); timer_shutdown_sync(&src->timer); kfree_rcu(src, rcu); } void __br_multicast_del_group_src(struct net_bridge_group_src *src) { struct net_bridge *br = src->pg->key.port->br; hlist_del_init_rcu(&src->node); src->pg->src_ents--; hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); } void br_multicast_del_group_src(struct net_bridge_group_src *src, bool fastleave) { br_multicast_fwd_src_remove(src, fastleave); __br_multicast_del_group_src(src); } static int br_multicast_port_ngroups_inc_one(struct net_bridge_mcast_port *pmctx, struct netlink_ext_ack *extack, const char *what) { u32 max = READ_ONCE(pmctx->mdb_max_entries); u32 n = READ_ONCE(pmctx->mdb_n_entries); if (max && n >= max) { NL_SET_ERR_MSG_FMT_MOD(extack, "%s is already in %u groups, and mcast_max_groups=%u", what, n, max); return -E2BIG; } WRITE_ONCE(pmctx->mdb_n_entries, n + 1); return 0; } static void br_multicast_port_ngroups_dec_one(struct net_bridge_mcast_port *pmctx) { u32 n = READ_ONCE(pmctx->mdb_n_entries); WARN_ON_ONCE(n == 0); WRITE_ONCE(pmctx->mdb_n_entries, n - 1); } static int br_multicast_port_ngroups_inc(struct net_bridge_port *port, const struct br_ip *group, struct netlink_ext_ack *extack) { struct net_bridge_mcast_port *pmctx; int err; lockdep_assert_held_once(&port->br->multicast_lock); /* Always count on the port context. */ err = br_multicast_port_ngroups_inc_one(&port->multicast_ctx, extack, "Port"); if (err) { trace_br_mdb_full(port->dev, group); return err; } /* Only count on the VLAN context if VID is given, and if snooping on * that VLAN is enabled. */ if (!group->vid) return 0; pmctx = br_multicast_port_vid_to_port_ctx(port, group->vid); if (!pmctx) return 0; err = br_multicast_port_ngroups_inc_one(pmctx, extack, "Port-VLAN"); if (err) { trace_br_mdb_full(port->dev, group); goto dec_one_out; } return 0; dec_one_out: br_multicast_port_ngroups_dec_one(&port->multicast_ctx); return err; } static void br_multicast_port_ngroups_dec(struct net_bridge_port *port, u16 vid) { struct net_bridge_mcast_port *pmctx; lockdep_assert_held_once(&port->br->multicast_lock); if (vid) { pmctx = br_multicast_port_vid_to_port_ctx(port, vid); if (pmctx) br_multicast_port_ngroups_dec_one(pmctx); } br_multicast_port_ngroups_dec_one(&port->multicast_ctx); } u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx) { return READ_ONCE(pmctx->mdb_n_entries); } void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max) { WRITE_ONCE(pmctx->mdb_max_entries, max); } u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx) { return READ_ONCE(pmctx->mdb_max_entries); } static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc) { struct net_bridge_port_group *pg; pg = container_of(gc, struct net_bridge_port_group, mcast_gc); WARN_ON(!hlist_unhashed(&pg->mglist)); WARN_ON(!hlist_empty(&pg->src_list)); timer_shutdown_sync(&pg->rexmit_timer); timer_shutdown_sync(&pg->timer); kfree_rcu(pg, rcu); } void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_port_group __rcu **pp) { struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; struct hlist_node *tmp; rcu_assign_pointer(*pp, pg->next); hlist_del_init(&pg->mglist); br_multicast_eht_clean_sets(pg); hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) br_multicast_del_group_src(ent, false); br_mdb_notify(br->dev, mp, pg, RTM_DELMDB); if (!br_multicast_is_star_g(&mp->addr)) { rhashtable_remove_fast(&br->sg_port_tbl, &pg->rhnode, br_sg_port_rht_params); br_multicast_sg_del_exclude_ports(mp); } else { br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); } br_multicast_port_ngroups_dec(pg->key.port, pg->key.addr.vid); hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); if (!mp->ports && !mp->host_joined && netif_running(br->dev)) mod_timer(&mp->timer, jiffies); } static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg) { struct net_bridge_port_group __rcu **pp; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; mp = br_mdb_ip_get(br, &pg->key.addr); if (WARN_ON(!mp)) return; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (p != pg) continue; br_multicast_del_pg(mp, pg, pp); return; } WARN_ON(1); } static void br_multicast_port_group_expired(struct timer_list *t) { struct net_bridge_port_group *pg = from_timer(pg, t, timer); struct net_bridge_group_src *src_ent; struct net_bridge *br = pg->key.port->br; struct hlist_node *tmp; bool changed; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || timer_pending(&pg->timer) || hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT) goto out; changed = !!(pg->filter_mode == MCAST_EXCLUDE); pg->filter_mode = MCAST_INCLUDE; hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) { if (!timer_pending(&src_ent->timer)) { br_multicast_del_group_src(src_ent, false); changed = true; } } if (hlist_empty(&pg->src_list)) { br_multicast_find_del_pg(br, pg); } else if (changed) { struct net_bridge_mdb_entry *mp = br_mdb_ip_get(br, &pg->key.addr); if (changed && br_multicast_is_star_g(&pg->key.addr)) br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); if (WARN_ON(!mp)) goto out; br_mdb_notify(br->dev, mp, pg, RTM_NEWMDB); } out: spin_unlock(&br->multicast_lock); } static void br_multicast_gc(struct hlist_head *head) { struct net_bridge_mcast_gc *gcent; struct hlist_node *tmp; hlist_for_each_entry_safe(gcent, tmp, head, gc_node) { hlist_del_init(&gcent->gc_node); gcent->destroy(gcent); } } static void __br_multicast_query_handle_vlan(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { struct net_bridge_vlan *vlan = NULL; if (pmctx && br_multicast_port_ctx_is_vlan(pmctx)) vlan = pmctx->vlan; else if (br_multicast_ctx_is_vlan(brmctx)) vlan = brmctx->vlan; if (vlan && !(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED)) { u16 vlan_proto; if (br_vlan_get_proto(brmctx->br->dev, &vlan_proto) != 0) return; __vlan_hwaccel_put_tag(skb, htons(vlan_proto), vlan->vid); } } static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, __be32 ip_dst, __be32 group, bool with_srcs, bool over_lmqt, u8 sflag, u8 *igmp_type, bool *need_rexmit) { struct net_bridge_port *p = pg ? pg->key.port : NULL; struct net_bridge_group_src *ent; size_t pkt_size, igmp_hdr_size; unsigned long now = jiffies; struct igmpv3_query *ihv3; void *csum_start = NULL; __sum16 *csum = NULL; struct sk_buff *skb; struct igmphdr *ih; struct ethhdr *eth; unsigned long lmqt; struct iphdr *iph; u16 lmqt_srcs = 0; igmp_hdr_size = sizeof(*ih); if (brmctx->multicast_igmp_version == 3) { igmp_hdr_size = sizeof(*ihv3); if (pg && with_srcs) { lmqt = now + (brmctx->multicast_last_member_interval * brmctx->multicast_last_member_count); hlist_for_each_entry(ent, &pg->src_list, node) { if (over_lmqt == time_after(ent->timer.expires, lmqt) && ent->src_query_rexmit_cnt > 0) lmqt_srcs++; } if (!lmqt_srcs) return NULL; igmp_hdr_size += lmqt_srcs * sizeof(__be32); } } pkt_size = sizeof(*eth) + sizeof(*iph) + 4 + igmp_hdr_size; if ((p && pkt_size > p->dev->mtu) || pkt_size > brmctx->br->dev->mtu) return NULL; skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size); if (!skb) goto out; __br_multicast_query_handle_vlan(brmctx, pmctx, skb); skb->protocol = htons(ETH_P_IP); skb_reset_mac_header(skb); eth = eth_hdr(skb); ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr); ip_eth_mc_map(ip_dst, eth->h_dest); eth->h_proto = htons(ETH_P_IP); skb_put(skb, sizeof(*eth)); skb_set_network_header(skb, skb->len); iph = ip_hdr(skb); iph->tot_len = htons(pkt_size - sizeof(*eth)); iph->version = 4; iph->ihl = 6; iph->tos = 0xc0; iph->id = 0; iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->protocol = IPPROTO_IGMP; iph->saddr = br_opt_get(brmctx->br, BROPT_MULTICAST_QUERY_USE_IFADDR) ? inet_select_addr(brmctx->br->dev, 0, RT_SCOPE_LINK) : 0; iph->daddr = ip_dst; ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; ((u8 *)&iph[1])[3] = 0; ip_send_check(iph); skb_put(skb, 24); skb_set_transport_header(skb, skb->len); *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY; switch (brmctx->multicast_igmp_version) { case 2: ih = igmp_hdr(skb); ih->type = IGMP_HOST_MEMBERSHIP_QUERY; ih->code = (group ? brmctx->multicast_last_member_interval : brmctx->multicast_query_response_interval) / (HZ / IGMP_TIMER_SCALE); ih->group = group; ih->csum = 0; csum = &ih->csum; csum_start = (void *)ih; break; case 3: ihv3 = igmpv3_query_hdr(skb); ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY; ihv3->code = (group ? brmctx->multicast_last_member_interval : brmctx->multicast_query_response_interval) / (HZ / IGMP_TIMER_SCALE); ihv3->group = group; ihv3->qqic = brmctx->multicast_query_interval / HZ; ihv3->nsrcs = htons(lmqt_srcs); ihv3->resv = 0; ihv3->suppress = sflag; ihv3->qrv = 2; ihv3->csum = 0; csum = &ihv3->csum; csum_start = (void *)ihv3; if (!pg || !with_srcs) break; lmqt_srcs = 0; hlist_for_each_entry(ent, &pg->src_list, node) { if (over_lmqt == time_after(ent->timer.expires, lmqt) && ent->src_query_rexmit_cnt > 0) { ihv3->srcs[lmqt_srcs++] = ent->addr.src.ip4; ent->src_query_rexmit_cnt--; if (need_rexmit && ent->src_query_rexmit_cnt) *need_rexmit = true; } } if (WARN_ON(lmqt_srcs != ntohs(ihv3->nsrcs))) { kfree_skb(skb); return NULL; } break; } if (WARN_ON(!csum || !csum_start)) { kfree_skb(skb); return NULL; } *csum = ip_compute_csum(csum_start, igmp_hdr_size); skb_put(skb, igmp_hdr_size); __skb_pull(skb, sizeof(*eth)); out: return skb; } #if IS_ENABLED(CONFIG_IPV6) static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, const struct in6_addr *ip6_dst, const struct in6_addr *group, bool with_srcs, bool over_llqt, u8 sflag, u8 *igmp_type, bool *need_rexmit) { struct net_bridge_port *p = pg ? pg->key.port : NULL; struct net_bridge_group_src *ent; size_t pkt_size, mld_hdr_size; unsigned long now = jiffies; struct mld2_query *mld2q; void *csum_start = NULL; unsigned long interval; __sum16 *csum = NULL; struct ipv6hdr *ip6h; struct mld_msg *mldq; struct sk_buff *skb; unsigned long llqt; struct ethhdr *eth; u16 llqt_srcs = 0; u8 *hopopt; mld_hdr_size = sizeof(*mldq); if (brmctx->multicast_mld_version == 2) { mld_hdr_size = sizeof(*mld2q); if (pg && with_srcs) { llqt = now + (brmctx->multicast_last_member_interval * brmctx->multicast_last_member_count); hlist_for_each_entry(ent, &pg->src_list, node) { if (over_llqt == time_after(ent->timer.expires, llqt) && ent->src_query_rexmit_cnt > 0) llqt_srcs++; } if (!llqt_srcs) return NULL; mld_hdr_size += llqt_srcs * sizeof(struct in6_addr); } } pkt_size = sizeof(*eth) + sizeof(*ip6h) + 8 + mld_hdr_size; if ((p && pkt_size > p->dev->mtu) || pkt_size > brmctx->br->dev->mtu) return NULL; skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size); if (!skb) goto out; __br_multicast_query_handle_vlan(brmctx, pmctx, skb); skb->protocol = htons(ETH_P_IPV6); /* Ethernet header */ skb_reset_mac_header(skb); eth = eth_hdr(skb); ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr); eth->h_proto = htons(ETH_P_IPV6); skb_put(skb, sizeof(*eth)); /* IPv6 header + HbH option */ skb_set_network_header(skb, skb->len); ip6h = ipv6_hdr(skb); *(__force __be32 *)ip6h = htonl(0x60000000); ip6h->payload_len = htons(8 + mld_hdr_size); ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; ip6h->daddr = *ip6_dst; if (ipv6_dev_get_saddr(dev_net(brmctx->br->dev), brmctx->br->dev, &ip6h->daddr, 0, &ip6h->saddr)) { kfree_skb(skb); br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, false); return NULL; } br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, true); ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); hopopt = (u8 *)(ip6h + 1); hopopt[0] = IPPROTO_ICMPV6; /* next hdr */ hopopt[1] = 0; /* length of HbH */ hopopt[2] = IPV6_TLV_ROUTERALERT; /* Router Alert */ hopopt[3] = 2; /* Length of RA Option */ hopopt[4] = 0; /* Type = 0x0000 (MLD) */ hopopt[5] = 0; hopopt[6] = IPV6_TLV_PAD1; /* Pad1 */ hopopt[7] = IPV6_TLV_PAD1; /* Pad1 */ skb_put(skb, sizeof(*ip6h) + 8); /* ICMPv6 */ skb_set_transport_header(skb, skb->len); interval = ipv6_addr_any(group) ? brmctx->multicast_query_response_interval : brmctx->multicast_last_member_interval; *igmp_type = ICMPV6_MGM_QUERY; switch (brmctx->multicast_mld_version) { case 1: mldq = (struct mld_msg *)icmp6_hdr(skb); mldq->mld_type = ICMPV6_MGM_QUERY; mldq->mld_code = 0; mldq->mld_cksum = 0; mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval)); mldq->mld_reserved = 0; mldq->mld_mca = *group; csum = &mldq->mld_cksum; csum_start = (void *)mldq; break; case 2: mld2q = (struct mld2_query *)icmp6_hdr(skb); mld2q->mld2q_mrc = htons((u16)jiffies_to_msecs(interval)); mld2q->mld2q_type = ICMPV6_MGM_QUERY; mld2q->mld2q_code = 0; mld2q->mld2q_cksum = 0; mld2q->mld2q_resv1 = 0; mld2q->mld2q_resv2 = 0; mld2q->mld2q_suppress = sflag; mld2q->mld2q_qrv = 2; mld2q->mld2q_nsrcs = htons(llqt_srcs); mld2q->mld2q_qqic = brmctx->multicast_query_interval / HZ; mld2q->mld2q_mca = *group; csum = &mld2q->mld2q_cksum; csum_start = (void *)mld2q; if (!pg || !with_srcs) break; llqt_srcs = 0; hlist_for_each_entry(ent, &pg->src_list, node) { if (over_llqt == time_after(ent->timer.expires, llqt) && ent->src_query_rexmit_cnt > 0) { mld2q->mld2q_srcs[llqt_srcs++] = ent->addr.src.ip6; ent->src_query_rexmit_cnt--; if (need_rexmit && ent->src_query_rexmit_cnt) *need_rexmit = true; } } if (WARN_ON(llqt_srcs != ntohs(mld2q->mld2q_nsrcs))) { kfree_skb(skb); return NULL; } break; } if (WARN_ON(!csum || !csum_start)) { kfree_skb(skb); return NULL; } *csum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, mld_hdr_size, IPPROTO_ICMPV6, csum_partial(csum_start, mld_hdr_size, 0)); skb_put(skb, mld_hdr_size); __skb_pull(skb, sizeof(*eth)); out: return skb; } #endif static struct sk_buff *br_multicast_alloc_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *ip_dst, struct br_ip *group, bool with_srcs, bool over_lmqt, u8 sflag, u8 *igmp_type, bool *need_rexmit) { __be32 ip4_dst; switch (group->proto) { case htons(ETH_P_IP): ip4_dst = ip_dst ? ip_dst->dst.ip4 : htonl(INADDR_ALLHOSTS_GROUP); return br_ip4_multicast_alloc_query(brmctx, pmctx, pg, ip4_dst, group->dst.ip4, with_srcs, over_lmqt, sflag, igmp_type, need_rexmit); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct in6_addr ip6_dst; if (ip_dst) ip6_dst = ip_dst->dst.ip6; else ipv6_addr_set(&ip6_dst, htonl(0xff020000), 0, 0, htonl(1)); return br_ip6_multicast_alloc_query(brmctx, pmctx, pg, &ip6_dst, &group->dst.ip6, with_srcs, over_lmqt, sflag, igmp_type, need_rexmit); } #endif } return NULL; } struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, struct br_ip *group) { struct net_bridge_mdb_entry *mp; int err; mp = br_mdb_ip_get(br, group); if (mp) return mp; if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) { trace_br_mdb_full(br->dev, group); br_mc_disabled_update(br->dev, false, NULL); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false); return ERR_PTR(-E2BIG); } mp = kzalloc(sizeof(*mp), GFP_ATOMIC); if (unlikely(!mp)) return ERR_PTR(-ENOMEM); mp->br = br; mp->addr = *group; mp->mcast_gc.destroy = br_multicast_destroy_mdb_entry; timer_setup(&mp->timer, br_multicast_group_expired, 0); err = rhashtable_lookup_insert_fast(&br->mdb_hash_tbl, &mp->rhnode, br_mdb_rht_params); if (err) { kfree(mp); mp = ERR_PTR(err); } else { hlist_add_head_rcu(&mp->mdb_node, &br->mdb_list); } return mp; } static void br_multicast_group_src_expired(struct timer_list *t) { struct net_bridge_group_src *src = from_timer(src, t, timer); struct net_bridge_port_group *pg; struct net_bridge *br = src->br; spin_lock(&br->multicast_lock); if (hlist_unhashed(&src->node) || !netif_running(br->dev) || timer_pending(&src->timer)) goto out; pg = src->pg; if (pg->filter_mode == MCAST_INCLUDE) { br_multicast_del_group_src(src, false); if (!hlist_empty(&pg->src_list)) goto out; br_multicast_find_del_pg(br, pg); } else { br_multicast_fwd_src_handle(src); } out: spin_unlock(&br->multicast_lock); } struct net_bridge_group_src * br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip) { struct net_bridge_group_src *ent; switch (ip->proto) { case htons(ETH_P_IP): hlist_for_each_entry(ent, &pg->src_list, node) if (ip->src.ip4 == ent->addr.src.ip4) return ent; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): hlist_for_each_entry(ent, &pg->src_list, node) if (!ipv6_addr_cmp(&ent->addr.src.ip6, &ip->src.ip6)) return ent; break; #endif } return NULL; } struct net_bridge_group_src * br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip) { struct net_bridge_group_src *grp_src; if (unlikely(pg->src_ents >= PG_SRC_ENT_LIMIT)) return NULL; switch (src_ip->proto) { case htons(ETH_P_IP): if (ipv4_is_zeronet(src_ip->src.ip4) || ipv4_is_multicast(src_ip->src.ip4)) return NULL; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): if (ipv6_addr_any(&src_ip->src.ip6) || ipv6_addr_is_multicast(&src_ip->src.ip6)) return NULL; break; #endif } grp_src = kzalloc(sizeof(*grp_src), GFP_ATOMIC); if (unlikely(!grp_src)) return NULL; grp_src->pg = pg; grp_src->br = pg->key.port->br; grp_src->addr = *src_ip; grp_src->mcast_gc.destroy = br_multicast_destroy_group_src; timer_setup(&grp_src->timer, br_multicast_group_src_expired, 0); hlist_add_head_rcu(&grp_src->node, &pg->src_list); pg->src_ents++; return grp_src; } struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, const struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, u8 filter_mode, u8 rt_protocol, struct netlink_ext_ack *extack) { struct net_bridge_port_group *p; int err; err = br_multicast_port_ngroups_inc(port, group, extack); if (err) return NULL; p = kzalloc(sizeof(*p), GFP_ATOMIC); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); goto dec_out; } p->key.addr = *group; p->key.port = port; p->flags = flags; p->filter_mode = filter_mode; p->rt_protocol = rt_protocol; p->eht_host_tree = RB_ROOT; p->eht_set_tree = RB_ROOT; p->mcast_gc.destroy = br_multicast_destroy_port_group; INIT_HLIST_HEAD(&p->src_list); if (!br_multicast_is_star_g(group) && rhashtable_lookup_insert_fast(&port->br->sg_port_tbl, &p->rhnode, br_sg_port_rht_params)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't insert new port group"); goto free_out; } rcu_assign_pointer(p->next, next); timer_setup(&p->timer, br_multicast_port_group_expired, 0); timer_setup(&p->rexmit_timer, br_multicast_port_group_rexmit, 0); hlist_add_head(&p->mglist, &port->mglist); if (src) memcpy(p->eth_addr, src, ETH_ALEN); else eth_broadcast_addr(p->eth_addr); return p; free_out: kfree(p); dec_out: br_multicast_port_ngroups_dec(port, group->vid); return NULL; } void br_multicast_del_port_group(struct net_bridge_port_group *p) { struct net_bridge_port *port = p->key.port; __u16 vid = p->key.addr.vid; hlist_del_init(&p->mglist); if (!br_multicast_is_star_g(&p->key.addr)) rhashtable_remove_fast(&port->br->sg_port_tbl, &p->rhnode, br_sg_port_rht_params); kfree(p); br_multicast_port_ngroups_dec(port, vid); } void br_multicast_host_join(const struct net_bridge_mcast *brmctx, struct net_bridge_mdb_entry *mp, bool notify) { if (!mp->host_joined) { mp->host_joined = true; if (br_multicast_is_star_g(&mp->addr)) br_multicast_star_g_host_state(mp); if (notify) br_mdb_notify(mp->br->dev, mp, NULL, RTM_NEWMDB); } if (br_group_is_l2(&mp->addr)) return; mod_timer(&mp->timer, jiffies + brmctx->multicast_membership_interval); } void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify) { if (!mp->host_joined) return; mp->host_joined = false; if (br_multicast_is_star_g(&mp->addr)) br_multicast_star_g_host_state(mp); if (notify) br_mdb_notify(mp->br->dev, mp, NULL, RTM_DELMDB); } static struct net_bridge_port_group * __br_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, bool igmpv2_mldv1, bool blocked) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p = NULL; struct net_bridge_mdb_entry *mp; unsigned long now = jiffies; if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; mp = br_multicast_new_group(brmctx->br, group); if (IS_ERR(mp)) return ERR_CAST(mp); if (!pmctx) { br_multicast_host_join(brmctx, mp, true); goto out; } for (pp = &mp->ports; (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (br_port_group_equal(p, pmctx->port, src)) goto found; if ((unsigned long)p->key.port < (unsigned long)pmctx->port) break; } p = br_multicast_new_port_group(pmctx->port, group, *pp, 0, src, filter_mode, RTPROT_KERNEL, NULL); if (unlikely(!p)) { p = ERR_PTR(-ENOMEM); goto out; } rcu_assign_pointer(*pp, p); if (blocked) p->flags |= MDB_PG_FLAGS_BLOCKED; br_mdb_notify(brmctx->br->dev, mp, p, RTM_NEWMDB); found: if (igmpv2_mldv1) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); out: return p; } static int br_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, bool igmpv2_mldv1) { struct net_bridge_port_group *pg; int err; spin_lock(&brmctx->br->multicast_lock); pg = __br_multicast_add_group(brmctx, pmctx, group, src, filter_mode, igmpv2_mldv1, false); /* NULL is considered valid for host joined groups */ err = PTR_ERR_OR_ZERO(pg); spin_unlock(&brmctx->br->multicast_lock); return err; } static int br_ip4_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src, bool igmpv2) { struct br_ip br_group; u8 filter_mode; if (ipv4_is_local_multicast(group)) return 0; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; filter_mode = igmpv2 ? MCAST_EXCLUDE : MCAST_INCLUDE; return br_multicast_add_group(brmctx, pmctx, &br_group, src, filter_mode, igmpv2); } #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_add_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src, bool mldv1) { struct br_ip br_group; u8 filter_mode; if (ipv6_addr_is_ll_all_nodes(group)) return 0; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; filter_mode = mldv1 ? MCAST_EXCLUDE : MCAST_INCLUDE; return br_multicast_add_group(brmctx, pmctx, &br_group, src, filter_mode, mldv1); } #endif static bool br_multicast_rport_del(struct hlist_node *rlist) { if (hlist_unhashed(rlist)) return false; hlist_del_init_rcu(rlist); return true; } static bool br_ip4_multicast_rport_del(struct net_bridge_mcast_port *pmctx) { return br_multicast_rport_del(&pmctx->ip4_rlist); } static bool br_ip6_multicast_rport_del(struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) return br_multicast_rport_del(&pmctx->ip6_rlist); #else return false; #endif } static void br_multicast_router_expired(struct net_bridge_mcast_port *pmctx, struct timer_list *t, struct hlist_node *rlist) { struct net_bridge *br = pmctx->port->br; bool del; spin_lock(&br->multicast_lock); if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED || pmctx->multicast_router == MDB_RTR_TYPE_PERM || timer_pending(t)) goto out; del = br_multicast_rport_del(rlist); br_multicast_rport_del_notify(pmctx, del); out: spin_unlock(&br->multicast_lock); } static void br_ip4_multicast_router_expired(struct timer_list *t) { struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, ip4_mc_router_timer); br_multicast_router_expired(pmctx, t, &pmctx->ip4_rlist); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_router_expired(struct timer_list *t) { struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, ip6_mc_router_timer); br_multicast_router_expired(pmctx, t, &pmctx->ip6_rlist); } #endif static void br_mc_router_state_change(struct net_bridge *p, bool is_mc_router) { struct switchdev_attr attr = { .orig_dev = p->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, .flags = SWITCHDEV_F_DEFER, .u.mrouter = is_mc_router, }; switchdev_port_attr_set(p->dev, &attr, NULL); } static void br_multicast_local_router_expired(struct net_bridge_mcast *brmctx, struct timer_list *timer) { spin_lock(&brmctx->br->multicast_lock); if (brmctx->multicast_router == MDB_RTR_TYPE_DISABLED || brmctx->multicast_router == MDB_RTR_TYPE_PERM || br_ip4_multicast_is_router(brmctx) || br_ip6_multicast_is_router(brmctx)) goto out; br_mc_router_state_change(brmctx->br, false); out: spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_local_router_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = from_timer(brmctx, t, ip4_mc_router_timer); br_multicast_local_router_expired(brmctx, t); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_local_router_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = from_timer(brmctx, t, ip6_mc_router_timer); br_multicast_local_router_expired(brmctx, t); } #endif static void br_multicast_querier_expired(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { spin_lock(&brmctx->br->multicast_lock); if (!netif_running(brmctx->br->dev) || br_multicast_ctx_vlan_global_disabled(brmctx) || !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) goto out; br_multicast_start_querier(brmctx, query); out: spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_querier_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = from_timer(brmctx, t, ip4_other_query.timer); br_multicast_querier_expired(brmctx, &brmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_querier_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = from_timer(brmctx, t, ip6_other_query.timer); br_multicast_querier_expired(brmctx, &brmctx->ip6_own_query); } #endif static void br_multicast_query_delay_expired(struct timer_list *t) { } static void br_multicast_select_own_querier(struct net_bridge_mcast *brmctx, struct br_ip *ip, struct sk_buff *skb) { if (ip->proto == htons(ETH_P_IP)) brmctx->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr; #if IS_ENABLED(CONFIG_IPV6) else brmctx->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr; #endif } static void __br_multicast_send_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *ip_dst, struct br_ip *group, bool with_srcs, u8 sflag, bool *need_rexmit) { bool over_lmqt = !!sflag; struct sk_buff *skb; u8 igmp_type; if (!br_multicast_ctx_should_use(brmctx, pmctx) || !br_multicast_ctx_matches_vlan_snooping(brmctx)) return; again_under_lmqt: skb = br_multicast_alloc_query(brmctx, pmctx, pg, ip_dst, group, with_srcs, over_lmqt, sflag, &igmp_type, need_rexmit); if (!skb) return; if (pmctx) { skb->dev = pmctx->port->dev; br_multicast_count(brmctx->br, pmctx->port, skb, igmp_type, BR_MCAST_DIR_TX); NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, dev_net(pmctx->port->dev), NULL, skb, NULL, skb->dev, br_dev_queue_push_xmit); if (over_lmqt && with_srcs && sflag) { over_lmqt = false; goto again_under_lmqt; } } else { br_multicast_select_own_querier(brmctx, group, skb); br_multicast_count(brmctx->br, NULL, skb, igmp_type, BR_MCAST_DIR_RX); netif_rx(skb); } } static void br_multicast_read_querier(const struct bridge_mcast_querier *querier, struct bridge_mcast_querier *dest) { unsigned int seq; memset(dest, 0, sizeof(*dest)); do { seq = read_seqcount_begin(&querier->seq); dest->port_ifidx = querier->port_ifidx; memcpy(&dest->addr, &querier->addr, sizeof(struct br_ip)); } while (read_seqcount_retry(&querier->seq, seq)); } static void br_multicast_update_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_querier *querier, int ifindex, struct br_ip *saddr) { write_seqcount_begin(&querier->seq); querier->port_ifidx = ifindex; memcpy(&querier->addr, saddr, sizeof(*saddr)); write_seqcount_end(&querier->seq); } static void br_multicast_send_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct bridge_mcast_own_query *own_query) { struct bridge_mcast_other_query *other_query = NULL; struct bridge_mcast_querier *querier; struct br_ip br_group; unsigned long time; if (!br_multicast_ctx_should_use(brmctx, pmctx) || !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) || !brmctx->multicast_querier) return; memset(&br_group.dst, 0, sizeof(br_group.dst)); if (pmctx ? (own_query == &pmctx->ip4_own_query) : (own_query == &brmctx->ip4_own_query)) { querier = &brmctx->ip4_querier; other_query = &brmctx->ip4_other_query; br_group.proto = htons(ETH_P_IP); #if IS_ENABLED(CONFIG_IPV6) } else { querier = &brmctx->ip6_querier; other_query = &brmctx->ip6_other_query; br_group.proto = htons(ETH_P_IPV6); #endif } if (!other_query || timer_pending(&other_query->timer)) return; /* we're about to select ourselves as querier */ if (!pmctx && querier->port_ifidx) { struct br_ip zeroip = {}; br_multicast_update_querier(brmctx, querier, 0, &zeroip); } __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &br_group, false, 0, NULL); time = jiffies; time += own_query->startup_sent < brmctx->multicast_startup_query_count ? brmctx->multicast_startup_query_interval : brmctx->multicast_query_interval; mod_timer(&own_query->timer, time); } static void br_multicast_port_query_expired(struct net_bridge_mcast_port *pmctx, struct bridge_mcast_own_query *query) { struct net_bridge *br = pmctx->port->br; struct net_bridge_mcast *brmctx; spin_lock(&br->multicast_lock); if (br_multicast_port_ctx_state_stopped(pmctx)) goto out; brmctx = br_multicast_port_ctx_get_global(pmctx); if (query->startup_sent < brmctx->multicast_startup_query_count) query->startup_sent++; br_multicast_send_query(brmctx, pmctx, query); out: spin_unlock(&br->multicast_lock); } static void br_ip4_multicast_port_query_expired(struct timer_list *t) { struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, ip4_own_query.timer); br_multicast_port_query_expired(pmctx, &pmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_port_query_expired(struct timer_list *t) { struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, ip6_own_query.timer); br_multicast_port_query_expired(pmctx, &pmctx->ip6_own_query); } #endif static void br_multicast_port_group_rexmit(struct timer_list *t) { struct net_bridge_port_group *pg = from_timer(pg, t, rexmit_timer); struct bridge_mcast_other_query *other_query = NULL; struct net_bridge *br = pg->key.port->br; struct net_bridge_mcast_port *pmctx; struct net_bridge_mcast *brmctx; bool need_rexmit = false; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || hlist_unhashed(&pg->mglist) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) goto out; pmctx = br_multicast_pg_to_port_ctx(pg); if (!pmctx) goto out; brmctx = br_multicast_port_ctx_get_global(pmctx); if (!brmctx->multicast_querier) goto out; if (pg->key.addr.proto == htons(ETH_P_IP)) other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else other_query = &brmctx->ip6_other_query; #endif if (!other_query || timer_pending(&other_query->timer)) goto out; if (pg->grp_query_rexmit_cnt) { pg->grp_query_rexmit_cnt--; __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, false, 1, NULL); } __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, true, 0, &need_rexmit); if (pg->grp_query_rexmit_cnt || need_rexmit) mod_timer(&pg->rexmit_timer, jiffies + brmctx->multicast_last_member_interval); out: spin_unlock(&br->multicast_lock); } static int br_mc_disabled_update(struct net_device *dev, bool value, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, .flags = SWITCHDEV_F_DEFER, .u.mc_disabled = !value, }; return switchdev_port_attr_set(dev, &attr, extack); } void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_vlan *vlan, struct net_bridge_mcast_port *pmctx) { pmctx->port = port; pmctx->vlan = vlan; pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; timer_setup(&pmctx->ip4_mc_router_timer, br_ip4_multicast_router_expired, 0); timer_setup(&pmctx->ip4_own_query.timer, br_ip4_multicast_port_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) timer_setup(&pmctx->ip6_mc_router_timer, br_ip6_multicast_router_expired, 0); timer_setup(&pmctx->ip6_own_query.timer, br_ip6_multicast_port_query_expired, 0); #endif } void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) del_timer_sync(&pmctx->ip6_mc_router_timer); #endif del_timer_sync(&pmctx->ip4_mc_router_timer); } int br_multicast_add_port(struct net_bridge_port *port) { int err; port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; br_multicast_port_ctx_init(port, NULL, &port->multicast_ctx); err = br_mc_disabled_update(port->dev, br_opt_get(port->br, BROPT_MULTICAST_ENABLED), NULL); if (err && err != -EOPNOTSUPP) return err; port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); if (!port->mcast_stats) return -ENOMEM; return 0; } void br_multicast_del_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; struct net_bridge_port_group *pg; struct hlist_node *n; /* Take care of the remaining groups, only perm ones should be left */ spin_lock_bh(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) br_multicast_find_del_pg(br, pg); spin_unlock_bh(&br->multicast_lock); flush_work(&br->mcast_gc_work); br_multicast_port_ctx_deinit(&port->multicast_ctx); free_percpu(port->mcast_stats); } static void br_multicast_enable(struct bridge_mcast_own_query *query) { query->startup_sent = 0; if (try_to_del_timer_sync(&query->timer) >= 0 || del_timer(&query->timer)) mod_timer(&query->timer, jiffies); } static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) { struct net_bridge *br = pmctx->port->br; struct net_bridge_mcast *brmctx; brmctx = br_multicast_port_ctx_get_global(pmctx); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || !netif_running(br->dev)) return; br_multicast_enable(&pmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) br_multicast_enable(&pmctx->ip6_own_query); #endif if (pmctx->multicast_router == MDB_RTR_TYPE_PERM) { br_ip4_multicast_add_router(brmctx, pmctx); br_ip6_multicast_add_router(brmctx, pmctx); } if (br_multicast_port_ctx_is_vlan(pmctx)) { struct net_bridge_port_group *pg; u32 n = 0; /* The mcast_n_groups counter might be wrong. First, * BR_VLFLAG_MCAST_ENABLED is toggled before temporary entries * are flushed, thus mcast_n_groups after the toggle does not * reflect the true values. And second, permanent entries added * while BR_VLFLAG_MCAST_ENABLED was disabled, are not reflected * either. Thus we have to refresh the counter. */ hlist_for_each_entry(pg, &pmctx->port->mglist, mglist) { if (pg->key.addr.vid == pmctx->vlan->vid) n++; } WRITE_ONCE(pmctx->mdb_n_entries, n); } } void br_multicast_enable_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; spin_lock_bh(&br->multicast_lock); __br_multicast_enable_port_ctx(&port->multicast_ctx); spin_unlock_bh(&br->multicast_lock); } static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) { struct net_bridge_port_group *pg; struct hlist_node *n; bool del = false; hlist_for_each_entry_safe(pg, n, &pmctx->port->mglist, mglist) if (!(pg->flags & MDB_PG_FLAGS_PERMANENT) && (!br_multicast_port_ctx_is_vlan(pmctx) || pg->key.addr.vid == pmctx->vlan->vid)) br_multicast_find_del_pg(pmctx->port->br, pg); del |= br_ip4_multicast_rport_del(pmctx); del_timer(&pmctx->ip4_mc_router_timer); del_timer(&pmctx->ip4_own_query.timer); del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) del_timer(&pmctx->ip6_mc_router_timer); del_timer(&pmctx->ip6_own_query.timer); #endif br_multicast_rport_del_notify(pmctx, del); } void br_multicast_disable_port(struct net_bridge_port *port) { spin_lock_bh(&port->br->multicast_lock); __br_multicast_disable_port_ctx(&port->multicast_ctx); spin_unlock_bh(&port->br->multicast_lock); } static int __grp_src_delete_marked(struct net_bridge_port_group *pg) { struct net_bridge_group_src *ent; struct hlist_node *tmp; int deleted = 0; hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) if (ent->flags & BR_SGRP_F_DELETE) { br_multicast_del_group_src(ent, false); deleted++; } return deleted; } static void __grp_src_mod_timer(struct net_bridge_group_src *src, unsigned long expires) { mod_timer(&src->timer, expires); br_multicast_fwd_src_handle(src); } static void __grp_src_query_marked_and_rexmit(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; u32 lmqc = brmctx->multicast_last_member_count; unsigned long lmqt, lmi, now = jiffies; struct net_bridge_group_src *ent; if (!netif_running(brmctx->br->dev) || !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) return; if (pg->key.addr.proto == htons(ETH_P_IP)) other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else other_query = &brmctx->ip6_other_query; #endif lmqt = now + br_multicast_lmqt(brmctx); hlist_for_each_entry(ent, &pg->src_list, node) { if (ent->flags & BR_SGRP_F_SEND) { ent->flags &= ~BR_SGRP_F_SEND; if (ent->timer.expires > lmqt) { if (brmctx->multicast_querier && other_query && !timer_pending(&other_query->timer)) ent->src_query_rexmit_cnt = lmqc; __grp_src_mod_timer(ent, lmqt); } } } if (!brmctx->multicast_querier || !other_query || timer_pending(&other_query->timer)) return; __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, true, 1, NULL); lmi = now + brmctx->multicast_last_member_interval; if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) mod_timer(&pg->rexmit_timer, lmi); } static void __grp_send_query_and_rexmit(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; unsigned long now = jiffies, lmi; if (!netif_running(brmctx->br->dev) || !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) return; if (pg->key.addr.proto == htons(ETH_P_IP)) other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else other_query = &brmctx->ip6_other_query; #endif if (brmctx->multicast_querier && other_query && !timer_pending(&other_query->timer)) { lmi = now + brmctx->multicast_last_member_interval; pg->grp_query_rexmit_cnt = brmctx->multicast_last_member_count - 1; __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, false, 0, NULL); if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) mod_timer(&pg->rexmit_timer, lmi); } if (pg->filter_mode == MCAST_EXCLUDE && (!timer_pending(&pg->timer) || time_after(pg->timer.expires, now + br_multicast_lmqt(brmctx)))) mod_timer(&pg->timer, now + br_multicast_lmqt(brmctx)); } /* State Msg type New state Actions * INCLUDE (A) IS_IN (B) INCLUDE (A+B) (B)=GMI * INCLUDE (A) ALLOW (B) INCLUDE (A+B) (B)=GMI * EXCLUDE (X,Y) ALLOW (A) EXCLUDE (X+A,Y-A) (A)=GMI */ static bool br_multicast_isinc_allow(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; struct br_ip src_ip; u32 src_idx; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (!ent) { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) changed = true; } if (ent) __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; return changed; } /* State Msg type New state Actions * INCLUDE (A) IS_EX (B) EXCLUDE (A*B,B-A) (B-A)=0 * Delete (A-B) * Group Timer=GMI */ static void __grp_src_isexc_incl(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; struct br_ip src_ip; u32 src_idx; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) ent->flags &= ~BR_SGRP_F_DELETE; else ent = br_multicast_new_group_src(pg, &src_ip); if (ent) br_multicast_fwd_src_handle(ent); } br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); __grp_src_delete_marked(pg); } /* State Msg type New state Actions * EXCLUDE (X,Y) IS_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=GMI * Delete (X-A) * Delete (Y-A) * Group Timer=GMI */ static bool __grp_src_isexc_excl(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; struct br_ip src_ip; u32 src_idx; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_DELETE; } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); changed = true; } } } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (__grp_src_delete_marked(pg)) changed = true; return changed; } static bool br_multicast_isexc(const struct net_bridge_mcast *brmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: __grp_src_isexc_incl(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: changed = __grp_src_isexc_excl(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; } pg->filter_mode = MCAST_EXCLUDE; mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx)); return changed; } /* State Msg type New state Actions * INCLUDE (A) TO_IN (B) INCLUDE (A+B) (B)=GMI * Send Q(G,A-B) */ static bool __grp_src_toin_incl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags |= BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_SEND; to_send--; } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) changed = true; } if (ent) __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } /* State Msg type New state Actions * EXCLUDE (X,Y) TO_IN (A) EXCLUDE (X+A,Y-A) (A)=GMI * Send Q(G,X-A) * Send Q(G) */ static bool __grp_src_toin_excl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) if (timer_pending(&ent->timer)) ent->flags |= BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { if (timer_pending(&ent->timer)) { ent->flags &= ~BR_SGRP_F_SEND; to_send--; } } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) changed = true; } if (ent) __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); __grp_send_query_and_rexmit(brmctx, pmctx, pg); return changed; } static bool br_multicast_toin(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: changed = __grp_src_toin_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; case MCAST_EXCLUDE: changed = __grp_src_toin_excl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; } if (br_multicast_eht_should_del_pg(pg)) { pg->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_find_del_pg(pg->key.port->br, pg); /* a notification has already been sent and we shouldn't * access pg after the delete so we have to return false */ changed = false; } return changed; } /* State Msg type New state Actions * INCLUDE (A) TO_EX (B) EXCLUDE (A*B,B-A) (B-A)=0 * Delete (A-B) * Send Q(G,A*B) * Group Timer=GMI */ static void __grp_src_toex_incl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags = (ent->flags & ~BR_SGRP_F_DELETE) | BR_SGRP_F_SEND; to_send++; } else { ent = br_multicast_new_group_src(pg, &src_ip); } if (ent) br_multicast_fwd_src_handle(ent); } br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); __grp_src_delete_marked(pg); if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); } /* State Msg type New state Actions * EXCLUDE (X,Y) TO_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=Group Timer * Delete (X-A) * Delete (Y-A) * Send Q(G,A-Y) * Group Timer=GMI */ static bool __grp_src_toex_excl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_DELETE; } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { __grp_src_mod_timer(ent, pg->timer.expires); changed = true; } } if (ent && timer_pending(&ent->timer)) { ent->flags |= BR_SGRP_F_SEND; to_send++; } } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (__grp_src_delete_marked(pg)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } static bool br_multicast_toex(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: __grp_src_toex_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: changed = __grp_src_toex_excl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; } pg->filter_mode = MCAST_EXCLUDE; mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx)); return changed; } /* State Msg type New state Actions * INCLUDE (A) BLOCK (B) INCLUDE (A) Send Q(G,A*B) */ static bool __grp_src_block_incl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags &= ~BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags |= BR_SGRP_F_SEND; to_send++; } } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } /* State Msg type New state Actions * EXCLUDE (X,Y) BLOCK (A) EXCLUDE (X+(A-Y),Y) (A-X-Y)=Group Timer * Send Q(G,A-Y) */ static bool __grp_src_block_excl(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; u32 src_idx, to_send = 0; bool changed = false; struct br_ip src_ip; hlist_for_each_entry(ent, &pg->src_list, node) ent->flags &= ~BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size); ent = br_multicast_find_group_src(pg, &src_ip); if (!ent) { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { __grp_src_mod_timer(ent, pg->timer.expires); changed = true; } } if (ent && timer_pending(&ent->timer)) { ent->flags |= BR_SGRP_F_SEND; to_send++; } } if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type)) changed = true; if (to_send) __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } static bool br_multicast_block(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: changed = __grp_src_block_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; case MCAST_EXCLUDE: changed = __grp_src_block_excl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); break; } if ((pg->filter_mode == MCAST_INCLUDE && hlist_empty(&pg->src_list)) || br_multicast_eht_should_del_pg(pg)) { if (br_multicast_eht_should_del_pg(pg)) pg->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_find_del_pg(pg->key.port->br, pg); /* a notification has already been sent and we shouldn't * access pg after the delete so we have to return false */ changed = false; } return changed; } static struct net_bridge_port_group * br_multicast_find_port(struct net_bridge_mdb_entry *mp, struct net_bridge_port *p, const unsigned char *src) { struct net_bridge *br __maybe_unused = mp->br; struct net_bridge_port_group *pg; for (pg = mlock_dereference(mp->ports, br); pg; pg = mlock_dereference(pg->next, br)) if (br_port_group_equal(pg, p, src)) return pg; return NULL; } static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { bool igmpv2 = brmctx->multicast_igmp_version == 2; struct net_bridge_mdb_entry *mdst; struct net_bridge_port_group *pg; const unsigned char *src; struct igmpv3_report *ih; struct igmpv3_grec *grec; int i, len, num, type; __be32 group, *h_addr; bool changed = false; int err = 0; u16 nsrcs; ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); len = skb_transport_offset(skb) + sizeof(*ih); for (i = 0; i < num; i++) { len += sizeof(*grec); if (!ip_mc_may_pull(skb, len)) return -EINVAL; grec = (void *)(skb->data + len - sizeof(*grec)); group = grec->grec_mca; type = grec->grec_type; nsrcs = ntohs(grec->grec_nsrcs); len += nsrcs * 4; if (!ip_mc_may_pull(skb, len)) return -EINVAL; switch (type) { case IGMPV3_MODE_IS_INCLUDE: case IGMPV3_MODE_IS_EXCLUDE: case IGMPV3_CHANGE_TO_INCLUDE: case IGMPV3_CHANGE_TO_EXCLUDE: case IGMPV3_ALLOW_NEW_SOURCES: case IGMPV3_BLOCK_OLD_SOURCES: break; default: continue; } src = eth_hdr(skb)->h_source; if (nsrcs == 0 && (type == IGMPV3_CHANGE_TO_INCLUDE || type == IGMPV3_MODE_IS_INCLUDE)) { if (!pmctx || igmpv2) { br_ip4_multicast_leave_group(brmctx, pmctx, group, vid, src); continue; } } else { err = br_ip4_multicast_add_group(brmctx, pmctx, group, vid, src, igmpv2); if (err) break; } if (!pmctx || igmpv2) continue; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto unlock_continue; mdst = br_mdb_ip4_get(brmctx->br, group, vid); if (!mdst) goto unlock_continue; pg = br_multicast_find_port(mdst, pmctx->port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; /* reload grec and host addr */ grec = (void *)(skb->data + len - sizeof(*grec) - (nsrcs * 4)); h_addr = &ip_hdr(skb)->saddr; switch (type) { case IGMPV3_ALLOW_NEW_SOURCES: changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_INCLUDE: changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_EXCLUDE: changed = br_multicast_isexc(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_INCLUDE: changed = br_multicast_toin(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_EXCLUDE: changed = br_multicast_toex(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_BLOCK_OLD_SOURCES: changed = br_multicast_block(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(__be32), type); break; } if (changed) br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: spin_unlock(&brmctx->br->multicast_lock); } return err; } #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { bool mldv1 = brmctx->multicast_mld_version == 1; struct net_bridge_mdb_entry *mdst; struct net_bridge_port_group *pg; unsigned int nsrcs_offset; struct mld2_report *mld2r; const unsigned char *src; struct in6_addr *h_addr; struct mld2_grec *grec; unsigned int grec_len; bool changed = false; int i, len, num; int err = 0; if (!ipv6_mc_may_pull(skb, sizeof(*mld2r))) return -EINVAL; mld2r = (struct mld2_report *)icmp6_hdr(skb); num = ntohs(mld2r->mld2r_ngrec); len = skb_transport_offset(skb) + sizeof(*mld2r); for (i = 0; i < num; i++) { __be16 *_nsrcs, __nsrcs; u16 nsrcs; nsrcs_offset = len + offsetof(struct mld2_grec, grec_nsrcs); if (skb_transport_offset(skb) + ipv6_transport_len(skb) < nsrcs_offset + sizeof(__nsrcs)) return -EINVAL; _nsrcs = skb_header_pointer(skb, nsrcs_offset, sizeof(__nsrcs), &__nsrcs); if (!_nsrcs) return -EINVAL; nsrcs = ntohs(*_nsrcs); grec_len = struct_size(grec, grec_src, nsrcs); if (!ipv6_mc_may_pull(skb, len + grec_len)) return -EINVAL; grec = (struct mld2_grec *)(skb->data + len); len += grec_len; switch (grec->grec_type) { case MLD2_MODE_IS_INCLUDE: case MLD2_MODE_IS_EXCLUDE: case MLD2_CHANGE_TO_INCLUDE: case MLD2_CHANGE_TO_EXCLUDE: case MLD2_ALLOW_NEW_SOURCES: case MLD2_BLOCK_OLD_SOURCES: break; default: continue; } src = eth_hdr(skb)->h_source; if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || grec->grec_type == MLD2_MODE_IS_INCLUDE) && nsrcs == 0) { if (!pmctx || mldv1) { br_ip6_multicast_leave_group(brmctx, pmctx, &grec->grec_mca, vid, src); continue; } } else { err = br_ip6_multicast_add_group(brmctx, pmctx, &grec->grec_mca, vid, src, mldv1); if (err) break; } if (!pmctx || mldv1) continue; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto unlock_continue; mdst = br_mdb_ip6_get(brmctx->br, &grec->grec_mca, vid); if (!mdst) goto unlock_continue; pg = br_multicast_find_port(mdst, pmctx->port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; h_addr = &ipv6_hdr(skb)->saddr; switch (grec->grec_type) { case MLD2_ALLOW_NEW_SOURCES: changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_MODE_IS_INCLUDE: changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_MODE_IS_EXCLUDE: changed = br_multicast_isexc(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_CHANGE_TO_INCLUDE: changed = br_multicast_toin(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_CHANGE_TO_EXCLUDE: changed = br_multicast_toex(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_BLOCK_OLD_SOURCES: changed = br_multicast_block(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; } if (changed) br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: spin_unlock(&brmctx->br->multicast_lock); } return err; } #endif static bool br_multicast_select_querier(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *saddr) { int port_ifidx = pmctx ? pmctx->port->dev->ifindex : 0; struct timer_list *own_timer, *other_timer; struct bridge_mcast_querier *querier; switch (saddr->proto) { case htons(ETH_P_IP): querier = &brmctx->ip4_querier; own_timer = &brmctx->ip4_own_query.timer; other_timer = &brmctx->ip4_other_query.timer; if (!querier->addr.src.ip4 || ntohl(saddr->src.ip4) <= ntohl(querier->addr.src.ip4)) goto update; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): querier = &brmctx->ip6_querier; own_timer = &brmctx->ip6_own_query.timer; other_timer = &brmctx->ip6_other_query.timer; if (ipv6_addr_cmp(&saddr->src.ip6, &querier->addr.src.ip6) <= 0) goto update; break; #endif default: return false; } if (!timer_pending(own_timer) && !timer_pending(other_timer)) goto update; return false; update: br_multicast_update_querier(brmctx, querier, port_ifidx, saddr); return true; } static struct net_bridge_port * __br_multicast_get_querier_port(struct net_bridge *br, const struct bridge_mcast_querier *querier) { int port_ifidx = READ_ONCE(querier->port_ifidx); struct net_bridge_port *p; struct net_device *dev; if (port_ifidx == 0) return NULL; dev = dev_get_by_index_rcu(dev_net(br->dev), port_ifidx); if (!dev) return NULL; p = br_port_get_rtnl_rcu(dev); if (!p || p->br != br) return NULL; return p; } size_t br_multicast_querier_state_size(void) { return nla_total_size(0) + /* nest attribute */ nla_total_size(sizeof(__be32)) + /* BRIDGE_QUERIER_IP_ADDRESS */ nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IP_PORT */ nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IP_OTHER_TIMER */ #if IS_ENABLED(CONFIG_IPV6) nla_total_size(sizeof(struct in6_addr)) + /* BRIDGE_QUERIER_IPV6_ADDRESS */ nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IPV6_PORT */ nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IPV6_OTHER_TIMER */ #endif 0; } /* protected by rtnl or rcu */ int br_multicast_dump_querier_state(struct sk_buff *skb, const struct net_bridge_mcast *brmctx, int nest_attr) { struct bridge_mcast_querier querier = {}; struct net_bridge_port *p; struct nlattr *nest; if (!br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) || br_multicast_ctx_vlan_global_disabled(brmctx)) return 0; nest = nla_nest_start(skb, nest_attr); if (!nest) return -EMSGSIZE; rcu_read_lock(); if (!brmctx->multicast_querier && !timer_pending(&brmctx->ip4_other_query.timer)) goto out_v6; br_multicast_read_querier(&brmctx->ip4_querier, &querier); if (nla_put_in_addr(skb, BRIDGE_QUERIER_IP_ADDRESS, querier.addr.src.ip4)) { rcu_read_unlock(); goto out_err; } p = __br_multicast_get_querier_port(brmctx->br, &querier); if (timer_pending(&brmctx->ip4_other_query.timer) && (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IP_OTHER_TIMER, br_timer_value(&brmctx->ip4_other_query.timer), BRIDGE_QUERIER_PAD) || (p && nla_put_u32(skb, BRIDGE_QUERIER_IP_PORT, p->dev->ifindex)))) { rcu_read_unlock(); goto out_err; } out_v6: #if IS_ENABLED(CONFIG_IPV6) if (!brmctx->multicast_querier && !timer_pending(&brmctx->ip6_other_query.timer)) goto out; br_multicast_read_querier(&brmctx->ip6_querier, &querier); if (nla_put_in6_addr(skb, BRIDGE_QUERIER_IPV6_ADDRESS, &querier.addr.src.ip6)) { rcu_read_unlock(); goto out_err; } p = __br_multicast_get_querier_port(brmctx->br, &querier); if (timer_pending(&brmctx->ip6_other_query.timer) && (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IPV6_OTHER_TIMER, br_timer_value(&brmctx->ip6_other_query.timer), BRIDGE_QUERIER_PAD) || (p && nla_put_u32(skb, BRIDGE_QUERIER_IPV6_PORT, p->dev->ifindex)))) { rcu_read_unlock(); goto out_err; } out: #endif rcu_read_unlock(); nla_nest_end(skb, nest); if (!nla_len(nest)) nla_nest_cancel(skb, nest); return 0; out_err: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void br_multicast_update_query_timer(struct net_bridge_mcast *brmctx, struct bridge_mcast_other_query *query, unsigned long max_delay) { if (!timer_pending(&query->timer)) mod_timer(&query->delay_timer, jiffies + max_delay); mod_timer(&query->timer, jiffies + brmctx->multicast_querier_interval); } static void br_port_mc_router_state_change(struct net_bridge_port *p, bool is_mc_router) { struct switchdev_attr attr = { .orig_dev = p->dev, .id = SWITCHDEV_ATTR_ID_PORT_MROUTER, .flags = SWITCHDEV_F_DEFER, .u.mrouter = is_mc_router, }; switchdev_port_attr_set(p->dev, &attr, NULL); } static struct net_bridge_port * br_multicast_rport_from_node(struct net_bridge_mcast *brmctx, struct hlist_head *mc_router_list, struct hlist_node *rlist) { struct net_bridge_mcast_port *pmctx; #if IS_ENABLED(CONFIG_IPV6) if (mc_router_list == &brmctx->ip6_mc_router_list) pmctx = hlist_entry(rlist, struct net_bridge_mcast_port, ip6_rlist); else #endif pmctx = hlist_entry(rlist, struct net_bridge_mcast_port, ip4_rlist); return pmctx->port; } static struct hlist_node * br_multicast_get_rport_slot(struct net_bridge_mcast *brmctx, struct net_bridge_port *port, struct hlist_head *mc_router_list) { struct hlist_node *slot = NULL; struct net_bridge_port *p; struct hlist_node *rlist; hlist_for_each(rlist, mc_router_list) { p = br_multicast_rport_from_node(brmctx, mc_router_list, rlist); if ((unsigned long)port >= (unsigned long)p) break; slot = rlist; } return slot; } static bool br_multicast_no_router_otherpf(struct net_bridge_mcast_port *pmctx, struct hlist_node *rnode) { #if IS_ENABLED(CONFIG_IPV6) if (rnode != &pmctx->ip6_rlist) return hlist_unhashed(&pmctx->ip6_rlist); else return hlist_unhashed(&pmctx->ip4_rlist); #else return true; #endif } /* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ static void br_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct hlist_node *rlist, struct hlist_head *mc_router_list) { struct hlist_node *slot; if (!hlist_unhashed(rlist)) return; slot = br_multicast_get_rport_slot(brmctx, pmctx->port, mc_router_list); if (slot) hlist_add_behind_rcu(rlist, slot); else hlist_add_head_rcu(rlist, mc_router_list); /* For backwards compatibility for now, only notify if we * switched from no IPv4/IPv6 multicast router to a new * IPv4 or IPv6 multicast router. */ if (br_multicast_no_router_otherpf(pmctx, rlist)) { br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_NEWMDB); br_port_mc_router_state_change(pmctx->port, true); } } /* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx) { br_multicast_add_router(brmctx, pmctx, &pmctx->ip4_rlist, &brmctx->ip4_mc_router_list); } /* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) br_multicast_add_router(brmctx, pmctx, &pmctx->ip6_rlist, &brmctx->ip6_mc_router_list); #endif } static void br_multicast_mark_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct timer_list *timer, struct hlist_node *rlist, struct hlist_head *mc_router_list) { unsigned long now = jiffies; if (!br_multicast_ctx_should_use(brmctx, pmctx)) return; if (!pmctx) { if (brmctx->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { if (!br_ip4_multicast_is_router(brmctx) && !br_ip6_multicast_is_router(brmctx)) br_mc_router_state_change(brmctx->br, true); mod_timer(timer, now + brmctx->multicast_querier_interval); } return; } if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED || pmctx->multicast_router == MDB_RTR_TYPE_PERM) return; br_multicast_add_router(brmctx, pmctx, rlist, mc_router_list); mod_timer(timer, now + brmctx->multicast_querier_interval); } static void br_ip4_multicast_mark_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx) { struct timer_list *timer = &brmctx->ip4_mc_router_timer; struct hlist_node *rlist = NULL; if (pmctx) { timer = &pmctx->ip4_mc_router_timer; rlist = &pmctx->ip4_rlist; } br_multicast_mark_router(brmctx, pmctx, timer, rlist, &brmctx->ip4_mc_router_list); } static void br_ip6_multicast_mark_router(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) struct timer_list *timer = &brmctx->ip6_mc_router_timer; struct hlist_node *rlist = NULL; if (pmctx) { timer = &pmctx->ip6_mc_router_timer; rlist = &pmctx->ip6_rlist; } br_multicast_mark_router(brmctx, pmctx, timer, rlist, &brmctx->ip6_mc_router_list); #endif } static void br_ip4_multicast_query_received(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct bridge_mcast_other_query *query, struct br_ip *saddr, unsigned long max_delay) { if (!br_multicast_select_querier(brmctx, pmctx, saddr)) return; br_multicast_update_query_timer(brmctx, query, max_delay); br_ip4_multicast_mark_router(brmctx, pmctx); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_query_received(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct bridge_mcast_other_query *query, struct br_ip *saddr, unsigned long max_delay) { if (!br_multicast_select_querier(brmctx, pmctx, saddr)) return; br_multicast_update_query_timer(brmctx, query, max_delay); br_ip6_multicast_mark_router(brmctx, pmctx); } #endif static void br_ip4_multicast_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { unsigned int transport_len = ip_transport_len(skb); const struct iphdr *iph = ip_hdr(skb); struct igmphdr *ih = igmp_hdr(skb); struct net_bridge_mdb_entry *mp; struct igmpv3_query *ih3; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct br_ip saddr = {}; unsigned long max_delay; unsigned long now = jiffies; __be32 group; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; group = ih->group; if (transport_len == sizeof(*ih)) { max_delay = ih->code * (HZ / IGMP_TIMER_SCALE); if (!max_delay) { max_delay = 10 * HZ; group = 0; } } else if (transport_len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs || (brmctx->multicast_igmp_version == 3 && group && ih3->suppress)) goto out; max_delay = ih3->code ? IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; } else { goto out; } if (!group) { saddr.proto = htons(ETH_P_IP); saddr.src.ip4 = iph->saddr; br_ip4_multicast_query_received(brmctx, pmctx, &brmctx->ip4_other_query, &saddr, max_delay); goto out; } mp = br_mdb_ip4_get(brmctx->br, group, vid); if (!mp) goto out; max_delay *= brmctx->multicast_last_member_count; if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : try_to_del_timer_sync(&mp->timer) >= 0)) mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : try_to_del_timer_sync(&p->timer) >= 0 && (brmctx->multicast_igmp_version == 2 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } out: spin_unlock(&brmctx->br->multicast_lock); } #if IS_ENABLED(CONFIG_IPV6) static int br_ip6_multicast_query(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { unsigned int transport_len = ipv6_transport_len(skb); struct mld_msg *mld; struct net_bridge_mdb_entry *mp; struct mld2_query *mld2q; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; struct br_ip saddr = {}; unsigned long max_delay; unsigned long now = jiffies; unsigned int offset = skb_transport_offset(skb); const struct in6_addr *group = NULL; bool is_general_query; int err = 0; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; if (transport_len == sizeof(*mld)) { if (!pskb_may_pull(skb, offset + sizeof(*mld))) { err = -EINVAL; goto out; } mld = (struct mld_msg *) icmp6_hdr(skb); max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay)); if (max_delay) group = &mld->mld_mca; } else { if (!pskb_may_pull(skb, offset + sizeof(*mld2q))) { err = -EINVAL; goto out; } mld2q = (struct mld2_query *)icmp6_hdr(skb); if (!mld2q->mld2q_nsrcs) group = &mld2q->mld2q_mca; if (brmctx->multicast_mld_version == 2 && !ipv6_addr_any(&mld2q->mld2q_mca) && mld2q->mld2q_suppress) goto out; max_delay = max(msecs_to_jiffies(mldv2_mrc(mld2q)), 1UL); } is_general_query = group && ipv6_addr_any(group); if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); saddr.src.ip6 = ipv6_hdr(skb)->saddr; br_ip6_multicast_query_received(brmctx, pmctx, &brmctx->ip6_other_query, &saddr, max_delay); goto out; } else if (!group) { goto out; } mp = br_mdb_ip6_get(brmctx->br, group, vid); if (!mp) goto out; max_delay *= brmctx->multicast_last_member_count; if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : try_to_del_timer_sync(&mp->timer) >= 0)) mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : try_to_del_timer_sync(&p->timer) >= 0 && (brmctx->multicast_mld_version == 1 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } out: spin_unlock(&brmctx->br->multicast_lock); return err; } #endif static void br_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct br_ip *group, struct bridge_mcast_other_query *other_query, struct bridge_mcast_own_query *own_query, const unsigned char *src) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; unsigned long now; unsigned long time; spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; mp = br_mdb_ip_get(brmctx->br, group); if (!mp) goto out; if (pmctx && (pmctx->port->flags & BR_MULTICAST_FAST_LEAVE)) { struct net_bridge_port_group __rcu **pp; for (pp = &mp->ports; (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (!br_port_group_equal(p, pmctx->port, src)) continue; if (p->flags & MDB_PG_FLAGS_PERMANENT) break; p->flags |= MDB_PG_FLAGS_FAST_LEAVE; br_multicast_del_pg(mp, p, pp); } goto out; } if (timer_pending(&other_query->timer)) goto out; if (brmctx->multicast_querier) { __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &mp->addr, false, 0, NULL); time = jiffies + brmctx->multicast_last_member_count * brmctx->multicast_last_member_interval; mod_timer(&own_query->timer, time); for (p = mlock_dereference(mp->ports, brmctx->br); p != NULL && pmctx != NULL; p = mlock_dereference(p->next, brmctx->br)) { if (!br_port_group_equal(p, pmctx->port, src)) continue; if (!hlist_unhashed(&p->mglist) && (timer_pending(&p->timer) ? time_after(p->timer.expires, time) : try_to_del_timer_sync(&p->timer) >= 0)) { mod_timer(&p->timer, time); } break; } } now = jiffies; time = now + brmctx->multicast_last_member_count * brmctx->multicast_last_member_interval; if (!pmctx) { if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, time) : try_to_del_timer_sync(&mp->timer) >= 0)) { mod_timer(&mp->timer, time); } goto out; } for (p = mlock_dereference(mp->ports, brmctx->br); p != NULL; p = mlock_dereference(p->next, brmctx->br)) { if (p->key.port != pmctx->port) continue; if (!hlist_unhashed(&p->mglist) && (timer_pending(&p->timer) ? time_after(p->timer.expires, time) : try_to_del_timer_sync(&p->timer) >= 0)) { mod_timer(&p->timer, time); } break; } out: spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src) { struct br_ip br_group; struct bridge_mcast_own_query *own_query; if (ipv4_is_local_multicast(group)) return; own_query = pmctx ? &pmctx->ip4_own_query : &brmctx->ip4_own_query; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; br_multicast_leave_group(brmctx, pmctx, &br_group, &brmctx->ip4_other_query, own_query, src); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src) { struct br_ip br_group; struct bridge_mcast_own_query *own_query; if (ipv6_addr_is_ll_all_nodes(group)) return; own_query = pmctx ? &pmctx->ip6_own_query : &brmctx->ip6_own_query; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; br_multicast_leave_group(brmctx, pmctx, &br_group, &brmctx->ip6_other_query, own_query, src); } #endif static void br_multicast_err_count(const struct net_bridge *br, const struct net_bridge_port *p, __be16 proto) { struct bridge_mcast_stats __percpu *stats; struct bridge_mcast_stats *pstats; if (!br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) return; if (p) stats = p->mcast_stats; else stats = br->mcast_stats; if (WARN_ON(!stats)) return; pstats = this_cpu_ptr(stats); u64_stats_update_begin(&pstats->syncp); switch (proto) { case htons(ETH_P_IP): pstats->mstats.igmp_parse_errors++; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): pstats->mstats.mld_parse_errors++; break; #endif } u64_stats_update_end(&pstats->syncp); } static void br_multicast_pim(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, const struct sk_buff *skb) { unsigned int offset = skb_transport_offset(skb); struct pimhdr *pimhdr, _pimhdr; pimhdr = skb_header_pointer(skb, offset, sizeof(_pimhdr), &_pimhdr); if (!pimhdr || pim_hdr_version(pimhdr) != PIM_VERSION || pim_hdr_type(pimhdr) != PIM_TYPE_HELLO) return; spin_lock(&brmctx->br->multicast_lock); br_ip4_multicast_mark_router(brmctx, pmctx); spin_unlock(&brmctx->br->multicast_lock); } static int br_ip4_multicast_mrd_rcv(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { if (ip_hdr(skb)->protocol != IPPROTO_IGMP || igmp_hdr(skb)->type != IGMP_MRDISC_ADV) return -ENOMSG; spin_lock(&brmctx->br->multicast_lock); br_ip4_multicast_mark_router(brmctx, pmctx); spin_unlock(&brmctx->br->multicast_lock); return 0; } static int br_multicast_ipv4_rcv(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { struct net_bridge_port *p = pmctx ? pmctx->port : NULL; const unsigned char *src; struct igmphdr *ih; int err; err = ip_mc_check_igmp(skb); if (err == -ENOMSG) { if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) { BR_INPUT_SKB_CB(skb)->mrouters_only = 1; } else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) { if (ip_hdr(skb)->protocol == IPPROTO_PIM) br_multicast_pim(brmctx, pmctx, skb); } else if (ipv4_is_all_snoopers(ip_hdr(skb)->daddr)) { br_ip4_multicast_mrd_rcv(brmctx, pmctx, skb); } return 0; } else if (err < 0) { br_multicast_err_count(brmctx->br, p, skb->protocol); return err; } ih = igmp_hdr(skb); src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->igmp = ih->type; switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip4_multicast_add_group(brmctx, pmctx, ih->group, vid, src, true); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: err = br_ip4_multicast_igmp3_report(brmctx, pmctx, skb, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: br_ip4_multicast_query(brmctx, pmctx, skb, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(brmctx, pmctx, ih->group, vid, src); break; } br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_mrd_rcv(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV) return; spin_lock(&brmctx->br->multicast_lock); br_ip6_multicast_mark_router(brmctx, pmctx); spin_unlock(&brmctx->br->multicast_lock); } static int br_multicast_ipv6_rcv(struct net_bridge_mcast *brmctx, struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { struct net_bridge_port *p = pmctx ? pmctx->port : NULL; const unsigned char *src; struct mld_msg *mld; int err; err = ipv6_mc_check_mld(skb); if (err == -ENOMSG || err == -ENODATA) { if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; if (err == -ENODATA && ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) br_ip6_multicast_mrd_rcv(brmctx, pmctx, skb); return 0; } else if (err < 0) { br_multicast_err_count(brmctx->br, p, skb->protocol); return err; } mld = (struct mld_msg *)skb_transport_header(skb); BR_INPUT_SKB_CB(skb)->igmp = mld->mld_type; switch (mld->mld_type) { case ICMPV6_MGM_REPORT: src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip6_multicast_add_group(brmctx, pmctx, &mld->mld_mca, vid, src, true); break; case ICMPV6_MLD2_REPORT: err = br_ip6_multicast_mld2_report(brmctx, pmctx, skb, vid); break; case ICMPV6_MGM_QUERY: err = br_ip6_multicast_query(brmctx, pmctx, skb, vid); break; case ICMPV6_MGM_REDUCTION: src = eth_hdr(skb)->h_source; br_ip6_multicast_leave_group(brmctx, pmctx, &mld->mld_mca, vid, src); break; } br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; } #endif int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { int ret = 0; BR_INPUT_SKB_CB(skb)->igmp = 0; BR_INPUT_SKB_CB(skb)->mrouters_only = 0; if (!br_opt_get((*brmctx)->br, BROPT_MULTICAST_ENABLED)) return 0; if (br_opt_get((*brmctx)->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && vlan) { const struct net_bridge_vlan *masterv; /* the vlan has the master flag set only when transmitting * through the bridge device */ if (br_vlan_is_master(vlan)) { masterv = vlan; *brmctx = &vlan->br_mcast_ctx; *pmctx = NULL; } else { masterv = vlan->brvlan; *brmctx = &vlan->brvlan->br_mcast_ctx; *pmctx = &vlan->port_mcast_ctx; } if (!(masterv->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) return 0; } switch (skb->protocol) { case htons(ETH_P_IP): ret = br_multicast_ipv4_rcv(*brmctx, *pmctx, skb, vid); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ret = br_multicast_ipv6_rcv(*brmctx, *pmctx, skb, vid); break; #endif } return ret; } static void br_multicast_query_expired(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query, struct bridge_mcast_querier *querier) { spin_lock(&brmctx->br->multicast_lock); if (br_multicast_ctx_vlan_disabled(brmctx)) goto out; if (query->startup_sent < brmctx->multicast_startup_query_count) query->startup_sent++; br_multicast_send_query(brmctx, NULL, query); out: spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_query_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = from_timer(brmctx, t, ip4_own_query.timer); br_multicast_query_expired(brmctx, &brmctx->ip4_own_query, &brmctx->ip4_querier); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_query_expired(struct timer_list *t) { struct net_bridge_mcast *brmctx = from_timer(brmctx, t, ip6_own_query.timer); br_multicast_query_expired(brmctx, &brmctx->ip6_own_query, &brmctx->ip6_querier); } #endif static void br_multicast_gc_work(struct work_struct *work) { struct net_bridge *br = container_of(work, struct net_bridge, mcast_gc_work); HLIST_HEAD(deleted_head); spin_lock_bh(&br->multicast_lock); hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); br_multicast_gc(&deleted_head); } void br_multicast_ctx_init(struct net_bridge *br, struct net_bridge_vlan *vlan, struct net_bridge_mcast *brmctx) { brmctx->br = br; brmctx->vlan = vlan; brmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; brmctx->multicast_last_member_count = 2; brmctx->multicast_startup_query_count = 2; brmctx->multicast_last_member_interval = HZ; brmctx->multicast_query_response_interval = 10 * HZ; brmctx->multicast_startup_query_interval = 125 * HZ / 4; brmctx->multicast_query_interval = 125 * HZ; brmctx->multicast_querier_interval = 255 * HZ; brmctx->multicast_membership_interval = 260 * HZ; brmctx->ip4_querier.port_ifidx = 0; seqcount_spinlock_init(&brmctx->ip4_querier.seq, &br->multicast_lock); brmctx->multicast_igmp_version = 2; #if IS_ENABLED(CONFIG_IPV6) brmctx->multicast_mld_version = 1; brmctx->ip6_querier.port_ifidx = 0; seqcount_spinlock_init(&brmctx->ip6_querier.seq, &br->multicast_lock); #endif timer_setup(&brmctx->ip4_mc_router_timer, br_ip4_multicast_local_router_expired, 0); timer_setup(&brmctx->ip4_other_query.timer, br_ip4_multicast_querier_expired, 0); timer_setup(&brmctx->ip4_other_query.delay_timer, br_multicast_query_delay_expired, 0); timer_setup(&brmctx->ip4_own_query.timer, br_ip4_multicast_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) timer_setup(&brmctx->ip6_mc_router_timer, br_ip6_multicast_local_router_expired, 0); timer_setup(&brmctx->ip6_other_query.timer, br_ip6_multicast_querier_expired, 0); timer_setup(&brmctx->ip6_other_query.delay_timer, br_multicast_query_delay_expired, 0); timer_setup(&brmctx->ip6_own_query.timer, br_ip6_multicast_query_expired, 0); #endif } void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx) { __br_multicast_stop(brmctx); } void br_multicast_init(struct net_bridge *br) { br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; br_multicast_ctx_init(br, NULL, &br->multicast_ctx); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true); br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); spin_lock_init(&br->multicast_lock); INIT_HLIST_HEAD(&br->mdb_list); INIT_HLIST_HEAD(&br->mcast_gc_list); INIT_WORK(&br->mcast_gc_work, br_multicast_gc_work); } static void br_ip4_multicast_join_snoopers(struct net_bridge *br) { struct in_device *in_dev = in_dev_get(br->dev); if (!in_dev) return; __ip_mc_inc_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP), GFP_ATOMIC); in_dev_put(in_dev); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_join_snoopers(struct net_bridge *br) { struct in6_addr addr; ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a)); ipv6_dev_mc_inc(br->dev, &addr); } #else static inline void br_ip6_multicast_join_snoopers(struct net_bridge *br) { } #endif void br_multicast_join_snoopers(struct net_bridge *br) { br_ip4_multicast_join_snoopers(br); br_ip6_multicast_join_snoopers(br); } static void br_ip4_multicast_leave_snoopers(struct net_bridge *br) { struct in_device *in_dev = in_dev_get(br->dev); if (WARN_ON(!in_dev)) return; __ip_mc_dec_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP), GFP_ATOMIC); in_dev_put(in_dev); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_leave_snoopers(struct net_bridge *br) { struct in6_addr addr; ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a)); ipv6_dev_mc_dec(br->dev, &addr); } #else static inline void br_ip6_multicast_leave_snoopers(struct net_bridge *br) { } #endif void br_multicast_leave_snoopers(struct net_bridge *br) { br_ip4_multicast_leave_snoopers(br); br_ip6_multicast_leave_snoopers(br); } static void __br_multicast_open_query(struct net_bridge *br, struct bridge_mcast_own_query *query) { query->startup_sent = 0; if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return; mod_timer(&query->timer, jiffies); } static void __br_multicast_open(struct net_bridge_mcast *brmctx) { __br_multicast_open_query(brmctx->br, &brmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) __br_multicast_open_query(brmctx->br, &brmctx->ip6_own_query); #endif } void br_multicast_open(struct net_bridge *br) { ASSERT_RTNL(); if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; vg = br_vlan_group(br); if (vg) { list_for_each_entry(vlan, &vg->vlan_list, vlist) { struct net_bridge_mcast *brmctx; brmctx = &vlan->br_mcast_ctx; if (br_vlan_is_brentry(vlan) && !br_multicast_ctx_vlan_disabled(brmctx)) __br_multicast_open(&vlan->br_mcast_ctx); } } } else { __br_multicast_open(&br->multicast_ctx); } } static void __br_multicast_stop(struct net_bridge_mcast *brmctx) { del_timer_sync(&brmctx->ip4_mc_router_timer); del_timer_sync(&brmctx->ip4_other_query.timer); del_timer_sync(&brmctx->ip4_other_query.delay_timer); del_timer_sync(&brmctx->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) del_timer_sync(&brmctx->ip6_mc_router_timer); del_timer_sync(&brmctx->ip6_other_query.timer); del_timer_sync(&brmctx->ip6_other_query.delay_timer); del_timer_sync(&brmctx->ip6_own_query.timer); #endif } void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { struct net_bridge *br; /* it's okay to check for the flag without the multicast lock because it * can only change under RTNL -> multicast_lock, we need the latter to * sync with timers and packets */ if (on == !!(vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) return; if (br_vlan_is_master(vlan)) { br = vlan->br; if (!br_vlan_is_brentry(vlan) || (on && br_multicast_ctx_vlan_global_disabled(&vlan->br_mcast_ctx))) return; spin_lock_bh(&br->multicast_lock); vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED; spin_unlock_bh(&br->multicast_lock); if (on) __br_multicast_open(&vlan->br_mcast_ctx); else __br_multicast_stop(&vlan->br_mcast_ctx); } else { struct net_bridge_mcast *brmctx; brmctx = br_multicast_port_ctx_get_global(&vlan->port_mcast_ctx); if (on && br_multicast_ctx_vlan_global_disabled(brmctx)) return; br = vlan->port->br; spin_lock_bh(&br->multicast_lock); vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED; if (on) __br_multicast_enable_port_ctx(&vlan->port_mcast_ctx); else __br_multicast_disable_port_ctx(&vlan->port_mcast_ctx); spin_unlock_bh(&br->multicast_lock); } } static void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on) { struct net_bridge_port *p; if (WARN_ON_ONCE(!br_vlan_is_master(vlan))) return; list_for_each_entry(p, &vlan->br->port_list, list) { struct net_bridge_vlan *vport; vport = br_vlan_find(nbp_vlan_group(p), vlan->vid); if (!vport) continue; br_multicast_toggle_one_vlan(vport, on); } if (br_vlan_is_brentry(vlan)) br_multicast_toggle_one_vlan(vlan, on); } int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; struct net_bridge_port *p; if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) == on) return 0; if (on && !br_opt_get(br, BROPT_VLAN_ENABLED)) { NL_SET_ERR_MSG_MOD(extack, "Cannot enable multicast vlan snooping with vlan filtering disabled"); return -EINVAL; } vg = br_vlan_group(br); if (!vg) return 0; br_opt_toggle(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED, on); /* disable/enable non-vlan mcast contexts based on vlan snooping */ if (on) __br_multicast_stop(&br->multicast_ctx); else __br_multicast_open(&br->multicast_ctx); list_for_each_entry(p, &br->port_list, list) { if (on) br_multicast_disable_port(p); else br_multicast_enable_port(p); } list_for_each_entry(vlan, &vg->vlan_list, vlist) br_multicast_toggle_vlan(vlan, on); return 0; } bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on) { ASSERT_RTNL(); /* BR_VLFLAG_GLOBAL_MCAST_ENABLED relies on eventual consistency and * requires only RTNL to change */ if (on == !!(vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) return false; vlan->priv_flags ^= BR_VLFLAG_GLOBAL_MCAST_ENABLED; br_multicast_toggle_vlan(vlan, on); return true; } void br_multicast_stop(struct net_bridge *br) { ASSERT_RTNL(); if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; vg = br_vlan_group(br); if (vg) { list_for_each_entry(vlan, &vg->vlan_list, vlist) { struct net_bridge_mcast *brmctx; brmctx = &vlan->br_mcast_ctx; if (br_vlan_is_brentry(vlan) && !br_multicast_ctx_vlan_disabled(brmctx)) __br_multicast_stop(&vlan->br_mcast_ctx); } } } else { __br_multicast_stop(&br->multicast_ctx); } } void br_multicast_dev_del(struct net_bridge *br) { struct net_bridge_mdb_entry *mp; HLIST_HEAD(deleted_head); struct hlist_node *tmp; spin_lock_bh(&br->multicast_lock); hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node) br_multicast_del_mdb_entry(mp); hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); br_multicast_ctx_deinit(&br->multicast_ctx); br_multicast_gc(&deleted_head); cancel_work_sync(&br->mcast_gc_work); rcu_barrier(); } int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val) { int err = -EINVAL; spin_lock_bh(&brmctx->br->multicast_lock); switch (val) { case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: br_mc_router_state_change(brmctx->br, val == MDB_RTR_TYPE_PERM); del_timer(&brmctx->ip4_mc_router_timer); #if IS_ENABLED(CONFIG_IPV6) del_timer(&brmctx->ip6_mc_router_timer); #endif brmctx->multicast_router = val; err = 0; break; case MDB_RTR_TYPE_TEMP_QUERY: if (brmctx->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) br_mc_router_state_change(brmctx->br, false); brmctx->multicast_router = val; err = 0; break; } spin_unlock_bh(&brmctx->br->multicast_lock); return err; } static void br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted) { if (!deleted) return; /* For backwards compatibility for now, only notify if there is * no multicast router anymore for both IPv4 and IPv6. */ if (!hlist_unhashed(&pmctx->ip4_rlist)) return; #if IS_ENABLED(CONFIG_IPV6) if (!hlist_unhashed(&pmctx->ip6_rlist)) return; #endif br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_DELMDB); br_port_mc_router_state_change(pmctx->port, false); /* don't allow timer refresh */ if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; } int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, unsigned long val) { struct net_bridge_mcast *brmctx; unsigned long now = jiffies; int err = -EINVAL; bool del = false; brmctx = br_multicast_port_ctx_get_global(pmctx); spin_lock_bh(&brmctx->br->multicast_lock); if (pmctx->multicast_router == val) { /* Refresh the temp router port timer */ if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) { mod_timer(&pmctx->ip4_mc_router_timer, now + brmctx->multicast_querier_interval); #if IS_ENABLED(CONFIG_IPV6) mod_timer(&pmctx->ip6_mc_router_timer, now + brmctx->multicast_querier_interval); #endif } err = 0; goto unlock; } switch (val) { case MDB_RTR_TYPE_DISABLED: pmctx->multicast_router = MDB_RTR_TYPE_DISABLED; del |= br_ip4_multicast_rport_del(pmctx); del_timer(&pmctx->ip4_mc_router_timer); del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) del_timer(&pmctx->ip6_mc_router_timer); #endif br_multicast_rport_del_notify(pmctx, del); break; case MDB_RTR_TYPE_TEMP_QUERY: pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; del |= br_ip4_multicast_rport_del(pmctx); del |= br_ip6_multicast_rport_del(pmctx); br_multicast_rport_del_notify(pmctx, del); break; case MDB_RTR_TYPE_PERM: pmctx->multicast_router = MDB_RTR_TYPE_PERM; del_timer(&pmctx->ip4_mc_router_timer); br_ip4_multicast_add_router(brmctx, pmctx); #if IS_ENABLED(CONFIG_IPV6) del_timer(&pmctx->ip6_mc_router_timer); #endif br_ip6_multicast_add_router(brmctx, pmctx); break; case MDB_RTR_TYPE_TEMP: pmctx->multicast_router = MDB_RTR_TYPE_TEMP; br_ip4_multicast_mark_router(brmctx, pmctx); br_ip6_multicast_mark_router(brmctx, pmctx); break; default: goto unlock; } err = 0; unlock: spin_unlock_bh(&brmctx->br->multicast_lock); return err; } int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router) { int err; if (br_vlan_is_master(v)) err = br_multicast_set_router(&v->br_mcast_ctx, mcast_router); else err = br_multicast_set_port_router(&v->port_mcast_ctx, mcast_router); return err; } static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { struct net_bridge_port *port; if (!br_multicast_ctx_matches_vlan_snooping(brmctx)) return; __br_multicast_open_query(brmctx->br, query); rcu_read_lock(); list_for_each_entry_rcu(port, &brmctx->br->port_list, list) { struct bridge_mcast_own_query *ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) struct bridge_mcast_own_query *ip6_own_query; #endif if (br_multicast_port_ctx_state_stopped(&port->multicast_ctx)) continue; if (br_multicast_ctx_is_vlan(brmctx)) { struct net_bridge_vlan *vlan; vlan = br_vlan_find(nbp_vlan_group_rcu(port), brmctx->vlan->vid); if (!vlan || br_multicast_port_ctx_state_stopped(&vlan->port_mcast_ctx)) continue; ip4_own_query = &vlan->port_mcast_ctx.ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) ip6_own_query = &vlan->port_mcast_ctx.ip6_own_query; #endif } else { ip4_own_query = &port->multicast_ctx.ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) ip6_own_query = &port->multicast_ctx.ip6_own_query; #endif } if (query == &brmctx->ip4_own_query) br_multicast_enable(ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) else br_multicast_enable(ip6_own_query); #endif } rcu_read_unlock(); } int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct net_bridge_port *port; bool change_snoopers = false; int err = 0; spin_lock_bh(&br->multicast_lock); if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val) goto unlock; err = br_mc_disabled_update(br->dev, val, extack); if (err == -EOPNOTSUPP) err = 0; if (err) goto unlock; br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { change_snoopers = true; goto unlock; } if (!netif_running(br->dev)) goto unlock; br_multicast_open(br); list_for_each_entry(port, &br->port_list, list) __br_multicast_enable_port_ctx(&port->multicast_ctx); change_snoopers = true; unlock: spin_unlock_bh(&br->multicast_lock); /* br_multicast_join_snoopers has the potential to cause * an MLD Report/Leave to be delivered to br_multicast_rcv, * which would in turn call br_multicast_add_group, which would * attempt to acquire multicast_lock. This function should be * called after the lock has been released to avoid deadlocks on * multicast_lock. * * br_multicast_leave_snoopers does not have the problem since * br_multicast_rcv first checks BROPT_MULTICAST_ENABLED, and * returns without calling br_multicast_ipv4/6_rcv if it's not * enabled. Moved both functions out just for symmetry. */ if (change_snoopers) { if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) br_multicast_join_snoopers(br); else br_multicast_leave_snoopers(br); } return err; } bool br_multicast_enabled(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); return !!br_opt_get(br, BROPT_MULTICAST_ENABLED); } EXPORT_SYMBOL_GPL(br_multicast_enabled); bool br_multicast_router(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); bool is_router; spin_lock_bh(&br->multicast_lock); is_router = br_multicast_is_router(&br->multicast_ctx, NULL); spin_unlock_bh(&br->multicast_lock); return is_router; } EXPORT_SYMBOL_GPL(br_multicast_router); int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val) { unsigned long max_delay; val = !!val; spin_lock_bh(&brmctx->br->multicast_lock); if (brmctx->multicast_querier == val) goto unlock; WRITE_ONCE(brmctx->multicast_querier, val); if (!val) goto unlock; max_delay = brmctx->multicast_query_response_interval; if (!timer_pending(&brmctx->ip4_other_query.timer)) mod_timer(&brmctx->ip4_other_query.delay_timer, jiffies + max_delay); br_multicast_start_querier(brmctx, &brmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) if (!timer_pending(&brmctx->ip6_other_query.timer)) mod_timer(&brmctx->ip6_other_query.delay_timer, jiffies + max_delay); br_multicast_start_querier(brmctx, &brmctx->ip6_own_query); #endif unlock: spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, unsigned long val) { /* Currently we support only version 2 and 3 */ switch (val) { case 2: case 3: break; default: return -EINVAL; } spin_lock_bh(&brmctx->br->multicast_lock); brmctx->multicast_igmp_version = val; spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } #if IS_ENABLED(CONFIG_IPV6) int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx, unsigned long val) { /* Currently we support version 1 and 2 */ switch (val) { case 1: case 2: break; default: return -EINVAL; } spin_lock_bh(&brmctx->br->multicast_lock); brmctx->multicast_mld_version = val; spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } #endif void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx, unsigned long val) { unsigned long intvl_jiffies = clock_t_to_jiffies(val); if (intvl_jiffies < BR_MULTICAST_QUERY_INTVL_MIN) { br_info(brmctx->br, "trying to set multicast query interval below minimum, setting to %lu (%ums)\n", jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MIN), jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MIN)); intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MIN; } brmctx->multicast_query_interval = intvl_jiffies; } void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx, unsigned long val) { unsigned long intvl_jiffies = clock_t_to_jiffies(val); if (intvl_jiffies < BR_MULTICAST_STARTUP_QUERY_INTVL_MIN) { br_info(brmctx->br, "trying to set multicast startup query interval below minimum, setting to %lu (%ums)\n", jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MIN), jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MIN)); intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MIN; } brmctx->multicast_startup_query_interval = intvl_jiffies; } /** * br_multicast_list_adjacent - Returns snooped multicast addresses * @dev: The bridge port adjacent to which to retrieve addresses * @br_ip_list: The list to store found, snooped multicast IP addresses in * * Creates a list of IP addresses (struct br_ip_list) sensed by the multicast * snooping feature on all bridge ports of dev's bridge device, excluding * the addresses from dev itself. * * Returns the number of items added to br_ip_list. * * Notes: * - br_ip_list needs to be initialized by caller * - br_ip_list might contain duplicates in the end * (needs to be taken care of by caller) * - br_ip_list needs to be freed by caller */ int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list) { struct net_bridge *br; struct net_bridge_port *port; struct net_bridge_port_group *group; struct br_ip_list *entry; int count = 0; rcu_read_lock(); if (!br_ip_list || !netif_is_bridge_port(dev)) goto unlock; port = br_port_get_rcu(dev); if (!port || !port->br) goto unlock; br = port->br; list_for_each_entry_rcu(port, &br->port_list, list) { if (!port->dev || port->dev == dev) continue; hlist_for_each_entry_rcu(group, &port->mglist, mglist) { entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) goto unlock; entry->addr = group->key.addr; list_add(&entry->list, br_ip_list); count++; } } unlock: rcu_read_unlock(); return count; } EXPORT_SYMBOL_GPL(br_multicast_list_adjacent); /** * br_multicast_has_querier_anywhere - Checks for a querier on a bridge * @dev: The bridge port providing the bridge on which to check for a querier * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 * * Checks whether the given interface has a bridge on top and if so returns * true if a valid querier exists anywhere on the bridged link layer. * Otherwise returns false. */ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) { struct net_bridge *br; struct net_bridge_port *port; struct ethhdr eth; bool ret = false; rcu_read_lock(); if (!netif_is_bridge_port(dev)) goto unlock; port = br_port_get_rcu(dev); if (!port || !port->br) goto unlock; br = port->br; memset(ð, 0, sizeof(eth)); eth.h_proto = htons(proto); ret = br_multicast_querier_exists(&br->multicast_ctx, ð, NULL); unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere); /** * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port * @dev: The bridge port adjacent to which to check for a querier * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 * * Checks whether the given interface has a bridge on top and if so returns * true if a selected querier is behind one of the other ports of this * bridge. Otherwise returns false. */ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto) { struct net_bridge_mcast *brmctx; struct net_bridge *br; struct net_bridge_port *port; bool ret = false; int port_ifidx; rcu_read_lock(); if (!netif_is_bridge_port(dev)) goto unlock; port = br_port_get_rcu(dev); if (!port || !port->br) goto unlock; br = port->br; brmctx = &br->multicast_ctx; switch (proto) { case ETH_P_IP: port_ifidx = brmctx->ip4_querier.port_ifidx; if (!timer_pending(&brmctx->ip4_other_query.timer) || port_ifidx == port->dev->ifindex) goto unlock; break; #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: port_ifidx = brmctx->ip6_querier.port_ifidx; if (!timer_pending(&brmctx->ip6_other_query.timer) || port_ifidx == port->dev->ifindex) goto unlock; break; #endif default: goto unlock; } ret = true; unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); /** * br_multicast_has_router_adjacent - Checks for a router behind a bridge port * @dev: The bridge port adjacent to which to check for a multicast router * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6 * * Checks whether the given interface has a bridge on top and if so returns * true if a multicast router is behind one of the other ports of this * bridge. Otherwise returns false. */ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto) { struct net_bridge_mcast_port *pmctx; struct net_bridge_mcast *brmctx; struct net_bridge_port *port; bool ret = false; rcu_read_lock(); port = br_port_get_check_rcu(dev); if (!port) goto unlock; brmctx = &port->br->multicast_ctx; switch (proto) { case ETH_P_IP: hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, ip4_rlist) { if (pmctx->port == port) continue; ret = true; goto unlock; } break; #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, ip6_rlist) { if (pmctx->port == port) continue; ret = true; goto unlock; } break; #endif default: /* when compiled without IPv6 support, be conservative and * always assume presence of an IPv6 multicast router */ ret = true; } unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(br_multicast_has_router_adjacent); static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, const struct sk_buff *skb, u8 type, u8 dir) { struct bridge_mcast_stats *pstats = this_cpu_ptr(stats); __be16 proto = skb->protocol; unsigned int t_len; u64_stats_update_begin(&pstats->syncp); switch (proto) { case htons(ETH_P_IP): t_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); switch (type) { case IGMP_HOST_MEMBERSHIP_REPORT: pstats->mstats.igmp_v1reports[dir]++; break; case IGMPV2_HOST_MEMBERSHIP_REPORT: pstats->mstats.igmp_v2reports[dir]++; break; case IGMPV3_HOST_MEMBERSHIP_REPORT: pstats->mstats.igmp_v3reports[dir]++; break; case IGMP_HOST_MEMBERSHIP_QUERY: if (t_len != sizeof(struct igmphdr)) { pstats->mstats.igmp_v3queries[dir]++; } else { unsigned int offset = skb_transport_offset(skb); struct igmphdr *ih, _ihdr; ih = skb_header_pointer(skb, offset, sizeof(_ihdr), &_ihdr); if (!ih) break; if (!ih->code) pstats->mstats.igmp_v1queries[dir]++; else pstats->mstats.igmp_v2queries[dir]++; } break; case IGMP_HOST_LEAVE_MESSAGE: pstats->mstats.igmp_leaves[dir]++; break; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): t_len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); t_len -= skb_network_header_len(skb); switch (type) { case ICMPV6_MGM_REPORT: pstats->mstats.mld_v1reports[dir]++; break; case ICMPV6_MLD2_REPORT: pstats->mstats.mld_v2reports[dir]++; break; case ICMPV6_MGM_QUERY: if (t_len != sizeof(struct mld_msg)) pstats->mstats.mld_v2queries[dir]++; else pstats->mstats.mld_v1queries[dir]++; break; case ICMPV6_MGM_REDUCTION: pstats->mstats.mld_leaves[dir]++; break; } break; #endif /* CONFIG_IPV6 */ } u64_stats_update_end(&pstats->syncp); } void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir) { struct bridge_mcast_stats __percpu *stats; /* if multicast_disabled is true then igmp type can't be set */ if (!type || !br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) return; if (p) stats = p->mcast_stats; else stats = br->mcast_stats; if (WARN_ON(!stats)) return; br_mcast_stats_add(stats, skb, type, dir); } int br_multicast_init_stats(struct net_bridge *br) { br->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); if (!br->mcast_stats) return -ENOMEM; return 0; } void br_multicast_uninit_stats(struct net_bridge *br) { free_percpu(br->mcast_stats); } /* noinline for https://llvm.org/pr45802#c9 */ static noinline_for_stack void mcast_stats_add_dir(u64 *dst, u64 *src) { dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX]; dst[BR_MCAST_DIR_TX] += src[BR_MCAST_DIR_TX]; } void br_multicast_get_stats(const struct net_bridge *br, const struct net_bridge_port *p, struct br_mcast_stats *dest) { struct bridge_mcast_stats __percpu *stats; struct br_mcast_stats tdst; int i; memset(dest, 0, sizeof(*dest)); if (p) stats = p->mcast_stats; else stats = br->mcast_stats; if (WARN_ON(!stats)) return; memset(&tdst, 0, sizeof(tdst)); for_each_possible_cpu(i) { struct bridge_mcast_stats *cpu_stats = per_cpu_ptr(stats, i); struct br_mcast_stats temp; unsigned int start; do { start = u64_stats_fetch_begin(&cpu_stats->syncp); memcpy(&temp, &cpu_stats->mstats, sizeof(temp)); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries); mcast_stats_add_dir(tdst.igmp_v3queries, temp.igmp_v3queries); mcast_stats_add_dir(tdst.igmp_leaves, temp.igmp_leaves); mcast_stats_add_dir(tdst.igmp_v1reports, temp.igmp_v1reports); mcast_stats_add_dir(tdst.igmp_v2reports, temp.igmp_v2reports); mcast_stats_add_dir(tdst.igmp_v3reports, temp.igmp_v3reports); tdst.igmp_parse_errors += temp.igmp_parse_errors; mcast_stats_add_dir(tdst.mld_v1queries, temp.mld_v1queries); mcast_stats_add_dir(tdst.mld_v2queries, temp.mld_v2queries); mcast_stats_add_dir(tdst.mld_leaves, temp.mld_leaves); mcast_stats_add_dir(tdst.mld_v1reports, temp.mld_v1reports); mcast_stats_add_dir(tdst.mld_v2reports, temp.mld_v2reports); tdst.mld_parse_errors += temp.mld_parse_errors; } memcpy(dest, &tdst, sizeof(*dest)); } int br_mdb_hash_init(struct net_bridge *br) { int err; err = rhashtable_init(&br->sg_port_tbl, &br_sg_port_rht_params); if (err) return err; err = rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params); if (err) { rhashtable_destroy(&br->sg_port_tbl); return err; } return 0; } void br_mdb_hash_fini(struct net_bridge *br) { rhashtable_destroy(&br->sg_port_tbl); rhashtable_destroy(&br->mdb_hash_tbl); } |
132 132 21 21 1 21 21 21 1 20 21 23 23 23 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/module.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfctrl.h> #include <net/caif/cfmuxl.h> #include <net/caif/cffrml.h> #include <net/caif/cfserl.h> #include <net/caif/cfsrvl.h> #include <net/caif/caif_dev.h> #define container_obj(layr) container_of(layr, struct cfcnfg, layer) /* Information about CAIF physical interfaces held by Config Module in order * to manage physical interfaces */ struct cfcnfg_phyinfo { struct list_head node; bool up; /* Pointer to the layer below the MUX (framing layer) */ struct cflayer *frm_layer; /* Pointer to the lowest actual physical layer */ struct cflayer *phy_layer; /* Unique identifier of the physical interface */ unsigned int id; /* Preference of the physical in interface */ enum cfcnfg_phy_preference pref; /* Information about the physical device */ struct dev_info dev_info; /* Interface index */ int ifindex; /* Protocol head room added for CAIF link layer */ int head_room; /* Use Start of frame checksum */ bool use_fcs; }; struct cfcnfg { struct cflayer layer; struct cflayer *ctrl; struct cflayer *mux; struct list_head phys; struct mutex lock; }; static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, u8 phyid, struct cflayer *adapt_layer); static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, struct cflayer *adapt_layer); static void cfctrl_resp_func(void); static void cfctrl_enum_resp(void); struct cfcnfg *cfcnfg_create(void) { struct cfcnfg *this; struct cfctrl_rsp *resp; might_sleep(); /* Initiate this layer */ this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC); if (!this) return NULL; this->mux = cfmuxl_create(); if (!this->mux) goto out_of_mem; this->ctrl = cfctrl_create(); if (!this->ctrl) goto out_of_mem; /* Initiate response functions */ resp = cfctrl_get_respfuncs(this->ctrl); resp->enum_rsp = cfctrl_enum_resp; resp->linkerror_ind = cfctrl_resp_func; resp->linkdestroy_rsp = cfcnfg_linkdestroy_rsp; resp->sleep_rsp = cfctrl_resp_func; resp->wake_rsp = cfctrl_resp_func; resp->restart_rsp = cfctrl_resp_func; resp->radioset_rsp = cfctrl_resp_func; resp->linksetup_rsp = cfcnfg_linkup_rsp; resp->reject_rsp = cfcnfg_reject_rsp; INIT_LIST_HEAD(&this->phys); cfmuxl_set_uplayer(this->mux, this->ctrl, 0); layer_set_dn(this->ctrl, this->mux); layer_set_up(this->ctrl, this); mutex_init(&this->lock); return this; out_of_mem: synchronize_rcu(); kfree(this->mux); kfree(this->ctrl); kfree(this); return NULL; } void cfcnfg_remove(struct cfcnfg *cfg) { might_sleep(); if (cfg) { synchronize_rcu(); kfree(cfg->mux); cfctrl_remove(cfg->ctrl); kfree(cfg); } } static void cfctrl_resp_func(void) { } static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo_rcu(struct cfcnfg *cnfg, u8 phyid) { struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->id == phyid) return phy; return NULL; } static void cfctrl_enum_resp(void) { } static struct dev_info *cfcnfg_get_phyid(struct cfcnfg *cnfg, enum cfcnfg_phy_preference phy_pref) { /* Try to match with specified preference */ struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) { if (phy->up && phy->pref == phy_pref && phy->frm_layer != NULL) return &phy->dev_info; } /* Otherwise just return something */ list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->up) return &phy->dev_info; return NULL; } static int cfcnfg_get_id_from_ifi(struct cfcnfg *cnfg, int ifi) { struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->ifindex == ifi && phy->up) return phy->id; return -ENODEV; } int caif_disconnect_client(struct net *net, struct cflayer *adap_layer) { u8 channel_id; struct cfcnfg *cfg = get_cfcnfg(net); caif_assert(adap_layer != NULL); cfctrl_cancel_req(cfg->ctrl, adap_layer); channel_id = adap_layer->id; if (channel_id != 0) { struct cflayer *servl; servl = cfmuxl_remove_uplayer(cfg->mux, channel_id); cfctrl_linkdown_req(cfg->ctrl, channel_id, adap_layer); if (servl != NULL) layer_set_up(servl, NULL); } else pr_debug("nothing to disconnect\n"); /* Do RCU sync before initiating cleanup */ synchronize_rcu(); if (adap_layer->ctrlcmd != NULL) adap_layer->ctrlcmd(adap_layer, CAIF_CTRLCMD_DEINIT_RSP, 0); return 0; } EXPORT_SYMBOL(caif_disconnect_client); static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id) { } static const int protohead[CFCTRL_SRV_MASK] = { [CFCTRL_SRV_VEI] = 4, [CFCTRL_SRV_DATAGRAM] = 7, [CFCTRL_SRV_UTIL] = 4, [CFCTRL_SRV_RFM] = 3, [CFCTRL_SRV_DBG] = 3, }; static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, struct caif_connect_request *s, struct cfctrl_link_param *l) { struct dev_info *dev_info; enum cfcnfg_phy_preference pref; int res; memset(l, 0, sizeof(*l)); /* In caif protocol low value is high priority */ l->priority = CAIF_PRIO_MAX - s->priority + 1; if (s->ifindex != 0) { res = cfcnfg_get_id_from_ifi(cnfg, s->ifindex); if (res < 0) return res; l->phyid = res; } else { switch (s->link_selector) { case CAIF_LINK_HIGH_BANDW: pref = CFPHYPREF_HIGH_BW; break; case CAIF_LINK_LOW_LATENCY: pref = CFPHYPREF_LOW_LAT; break; default: return -EINVAL; } dev_info = cfcnfg_get_phyid(cnfg, pref); if (dev_info == NULL) return -ENODEV; l->phyid = dev_info->id; } switch (s->protocol) { case CAIFPROTO_AT: l->linktype = CFCTRL_SRV_VEI; l->endpoint = (s->sockaddr.u.at.type >> 2) & 0x3; l->chtype = s->sockaddr.u.at.type & 0x3; break; case CAIFPROTO_DATAGRAM: l->linktype = CFCTRL_SRV_DATAGRAM; l->chtype = 0x00; l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; break; case CAIFPROTO_DATAGRAM_LOOP: l->linktype = CFCTRL_SRV_DATAGRAM; l->chtype = 0x03; l->endpoint = 0x00; l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; break; case CAIFPROTO_RFM: l->linktype = CFCTRL_SRV_RFM; l->u.datagram.connid = s->sockaddr.u.rfm.connection_id; strscpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, sizeof(l->u.rfm.volume)); break; case CAIFPROTO_UTIL: l->linktype = CFCTRL_SRV_UTIL; l->endpoint = 0x00; l->chtype = 0x00; strscpy(l->u.utility.name, s->sockaddr.u.util.service, sizeof(l->u.utility.name)); caif_assert(sizeof(l->u.utility.name) > 10); l->u.utility.paramlen = s->param.size; if (l->u.utility.paramlen > sizeof(l->u.utility.params)) l->u.utility.paramlen = sizeof(l->u.utility.params); memcpy(l->u.utility.params, s->param.data, l->u.utility.paramlen); break; case CAIFPROTO_DEBUG: l->linktype = CFCTRL_SRV_DBG; l->endpoint = s->sockaddr.u.dbg.service; l->chtype = s->sockaddr.u.dbg.type; break; default: return -EINVAL; } return 0; } int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, struct cflayer *adap_layer, int *ifindex, int *proto_head, int *proto_tail) { struct cflayer *frml; struct cfcnfg_phyinfo *phy; int err; struct cfctrl_link_param param; struct cfcnfg *cfg = get_cfcnfg(net); rcu_read_lock(); err = caif_connect_req_to_link_param(cfg, conn_req, ¶m); if (err) goto unlock; phy = cfcnfg_get_phyinfo_rcu(cfg, param.phyid); if (!phy) { err = -ENODEV; goto unlock; } err = -EINVAL; if (adap_layer == NULL) { pr_err("adap_layer is zero\n"); goto unlock; } if (adap_layer->receive == NULL) { pr_err("adap_layer->receive is NULL\n"); goto unlock; } if (adap_layer->ctrlcmd == NULL) { pr_err("adap_layer->ctrlcmd == NULL\n"); goto unlock; } err = -ENODEV; frml = phy->frm_layer; if (frml == NULL) { pr_err("Specified PHY type does not exist!\n"); goto unlock; } caif_assert(param.phyid == phy->id); caif_assert(phy->frm_layer->id == param.phyid); caif_assert(phy->phy_layer->id == param.phyid); *ifindex = phy->ifindex; *proto_tail = 2; *proto_head = protohead[param.linktype] + phy->head_room; rcu_read_unlock(); /* FIXME: ENUMERATE INITIALLY WHEN ACTIVATING PHYSICAL INTERFACE */ cfctrl_enum_req(cfg->ctrl, param.phyid); return cfctrl_linkup_request(cfg->ctrl, ¶m, adap_layer); unlock: rcu_read_unlock(); return err; } EXPORT_SYMBOL(caif_connect_client); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, struct cflayer *adapt_layer) { if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0); } static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, u8 phyid, struct cflayer *adapt_layer) { struct cfcnfg *cnfg = container_obj(layer); struct cflayer *servicel = NULL; struct cfcnfg_phyinfo *phyinfo; struct net_device *netdev; if (channel_id == 0) { pr_warn("received channel_id zero\n"); if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0); return; } rcu_read_lock(); if (adapt_layer == NULL) { pr_debug("link setup response but no client exist, send linkdown back\n"); cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL); goto unlock; } caif_assert(cnfg != NULL); caif_assert(phyid != 0); phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { pr_err("ERROR: Link Layer Device disappeared while connecting\n"); goto unlock; } caif_assert(phyinfo != NULL); caif_assert(phyinfo->id == phyid); caif_assert(phyinfo->phy_layer != NULL); caif_assert(phyinfo->phy_layer->id == phyid); adapt_layer->id = channel_id; switch (serv) { case CFCTRL_SRV_VEI: servicel = cfvei_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_DATAGRAM: servicel = cfdgml_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_RFM: netdev = phyinfo->dev_info.dev; servicel = cfrfml_create(channel_id, &phyinfo->dev_info, netdev->mtu); break; case CFCTRL_SRV_UTIL: servicel = cfutill_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_VIDEO: servicel = cfvidl_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_DBG: servicel = cfdbgl_create(channel_id, &phyinfo->dev_info); break; default: pr_err("Protocol error. Link setup response - unknown channel type\n"); goto unlock; } if (!servicel) goto unlock; layer_set_dn(servicel, cnfg->mux); cfmuxl_set_uplayer(cnfg->mux, servicel, channel_id); layer_set_up(servicel, adapt_layer); layer_set_dn(adapt_layer, servicel); rcu_read_unlock(); servicel->ctrlcmd(servicel, CAIF_CTRLCMD_INIT_RSP, 0); return; unlock: rcu_read_unlock(); } int cfcnfg_add_phy_layer(struct cfcnfg *cnfg, struct net_device *dev, struct cflayer *phy_layer, enum cfcnfg_phy_preference pref, struct cflayer *link_support, bool fcs, int head_room) { struct cflayer *frml; struct cfcnfg_phyinfo *phyinfo = NULL; int i, res = 0; u8 phyid; mutex_lock(&cnfg->lock); /* CAIF protocol allow maximum 6 link-layers */ for (i = 0; i < 7; i++) { phyid = (dev->ifindex + i) & 0x7; if (phyid == 0) continue; if (cfcnfg_get_phyinfo_rcu(cnfg, phyid) == NULL) goto got_phyid; } pr_warn("Too many CAIF Link Layers (max 6)\n"); res = -EEXIST; goto out; got_phyid: phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); if (!phyinfo) { res = -ENOMEM; goto out; } phy_layer->id = phyid; phyinfo->pref = pref; phyinfo->id = phyid; phyinfo->dev_info.id = phyid; phyinfo->dev_info.dev = dev; phyinfo->phy_layer = phy_layer; phyinfo->ifindex = dev->ifindex; phyinfo->head_room = head_room; phyinfo->use_fcs = fcs; frml = cffrml_create(phyid, fcs); if (!frml) { res = -ENOMEM; goto out_err; } phyinfo->frm_layer = frml; layer_set_up(frml, cnfg->mux); if (link_support != NULL) { link_support->id = phyid; layer_set_dn(frml, link_support); layer_set_up(link_support, frml); layer_set_dn(link_support, phy_layer); layer_set_up(phy_layer, link_support); } else { layer_set_dn(frml, phy_layer); layer_set_up(phy_layer, frml); } list_add_rcu(&phyinfo->node, &cnfg->phys); out: mutex_unlock(&cnfg->lock); return res; out_err: kfree(phyinfo); mutex_unlock(&cnfg->lock); return res; } EXPORT_SYMBOL(cfcnfg_add_phy_layer); int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer, bool up) { struct cfcnfg_phyinfo *phyinfo; rcu_read_lock(); phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phy_layer->id); if (phyinfo == NULL) { rcu_read_unlock(); return -ENODEV; } if (phyinfo->up == up) { rcu_read_unlock(); return 0; } phyinfo->up = up; if (up) { cffrml_hold(phyinfo->frm_layer); cfmuxl_set_dnlayer(cnfg->mux, phyinfo->frm_layer, phy_layer->id); } else { cfmuxl_remove_dnlayer(cnfg->mux, phy_layer->id); cffrml_put(phyinfo->frm_layer); } rcu_read_unlock(); return 0; } EXPORT_SYMBOL(cfcnfg_set_phy_state); int cfcnfg_del_phy_layer(struct cfcnfg *cnfg, struct cflayer *phy_layer) { struct cflayer *frml, *frml_dn; u16 phyid; struct cfcnfg_phyinfo *phyinfo; might_sleep(); mutex_lock(&cnfg->lock); phyid = phy_layer->id; phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { mutex_unlock(&cnfg->lock); return 0; } caif_assert(phyid == phyinfo->id); caif_assert(phy_layer == phyinfo->phy_layer); caif_assert(phy_layer->id == phyid); caif_assert(phyinfo->frm_layer->id == phyid); list_del_rcu(&phyinfo->node); synchronize_rcu(); /* Fail if reference count is not zero */ if (cffrml_refcnt_read(phyinfo->frm_layer) != 0) { pr_info("Wait for device inuse\n"); list_add_rcu(&phyinfo->node, &cnfg->phys); mutex_unlock(&cnfg->lock); return -EAGAIN; } frml = phyinfo->frm_layer; frml_dn = frml->dn; cffrml_set_uplayer(frml, NULL); cffrml_set_dnlayer(frml, NULL); if (phy_layer != frml_dn) { layer_set_up(frml_dn, NULL); layer_set_dn(frml_dn, NULL); } layer_set_up(phy_layer, NULL); if (phyinfo->phy_layer != frml_dn) kfree(frml_dn); cffrml_free(frml); kfree(phyinfo); mutex_unlock(&cnfg->lock); return 0; } EXPORT_SYMBOL(cfcnfg_del_phy_layer); |
19236 19226 19236 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 | // SPDX-License-Identifier: GPL-2.0-only /* * xsave/xrstor support. * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ #include <linux/bitops.h> #include <linux/compat.h> #include <linux/cpu.h> #include <linux/mman.h> #include <linux/nospec.h> #include <linux/pkeys.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/coredump.h> #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/xcr.h> #include <asm/cpuid.h> #include <asm/tlbflush.h> #include <asm/prctl.h> #include <asm/elf.h> #include <uapi/asm/elf.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" #define for_each_extended_xfeature(bit, mask) \ (bit) = FIRST_EXTENDED_XFEATURE; \ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) /* * Although we spell it out in here, the Processor Trace * xfeature is completely unused. We use other mechanisms * to save/restore PT state in Linux. */ static const char *xfeature_names[] = { "x87 floating point registers", "SSE registers", "AVX registers", "MPX bounds registers", "MPX CSR", "AVX-512 opmask", "AVX-512 Hi256", "AVX-512 ZMM_Hi256", "Processor Trace (unused)", "Protection Keys User registers", "PASID state", "Control-flow User registers", "Control-flow Kernel registers (unused)", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", "AMX Tile config", "AMX Tile data", "unknown xstate feature", }; static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_FP] = X86_FEATURE_FPU, [XFEATURE_SSE] = X86_FEATURE_XMM, [XFEATURE_YMM] = X86_FEATURE_AVX, [XFEATURE_BNDREGS] = X86_FEATURE_MPX, [XFEATURE_BNDCSR] = X86_FEATURE_MPX, [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, [XFEATURE_PKRU] = X86_FEATURE_OSPKE, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; #define XSTATE_FLAG_SUPERVISOR BIT(0) #define XSTATE_FLAG_ALIGNED64 BIT(1) /* * Return whether the system supports a given xfeature. * * Also return the name of the (most advanced) feature that the caller requested: */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; u64 xfeatures_print; /* * So we use FLS here to be able to print the most advanced * feature that was requested but is missing. So if a driver * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the * missing AVX feature - this is the most informative message * to users: */ if (xfeatures_missing) xfeatures_print = xfeatures_missing; else xfeatures_print = xfeatures_needed; xfeature_idx = fls64(xfeatures_print)-1; max_idx = ARRAY_SIZE(xfeature_names)-1; xfeature_idx = min(xfeature_idx, max_idx); *feature_name = xfeature_names[xfeature_idx]; } if (xfeatures_missing) return 0; return 1; } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); static bool xfeature_is_aligned64(int xfeature_nr) { return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; } static bool xfeature_is_supervisor(int xfeature_nr) { return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; } static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) { unsigned int offs, i; /* * Non-compacted format and legacy features use the cached fixed * offsets. */ if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || xfeature <= XFEATURE_SSE) return xstate_offsets[xfeature]; /* * Compacted format offsets depend on the actual content of the * compacted xsave area which is determined by the xcomp_bv header * field. */ offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; for_each_extended_xfeature(i, xcomp_bv) { if (xfeature_is_aligned64(i)) offs = ALIGN(offs, 64); if (i == xfeature) break; offs += xstate_sizes[i]; } return offs; } /* * Enable the extended processor state save/restore feature. * Called once per CPU onlining. */ void fpu__init_cpu_xstate(void) { if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; cr4_set_bits(X86_CR4_OSXSAVE); /* * Must happen after CR4 setup and before xsetbv() to allow KVM * lazy passthrough. Write independent of the dynamic state static * key as that does not work on the boot CPU. This also ensures * that any stale state is wiped out from XFD. Reset the per CPU * xfd cache too. */ if (cpu_feature_enabled(X86_FEATURE_XFD)) xfd_set_state(init_fpstate.xfd); /* * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } } static bool xfeature_enabled(enum xfeature xfeature) { return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } /* * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. */ static void __init setup_xstate_cache(void) { u32 eax, ebx, ecx, edx, i; /* start at the beginning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); /* * The FP xstates and SSE xstates are legacy states. They are always * in the fixed offsets in the xsave area in either compacted form * or standard form. */ xstate_offsets[XFEATURE_FP] = 0; xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, xmm_space); xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; xstate_flags[i] = ecx; /* * If an xfeature is supervisor state, the offset in EBX is * invalid, leave it to -1. */ if (xfeature_is_supervisor(i)) continue; xstate_offsets[i] = ebx; /* * In our xstate size checks, we assume that the highest-numbered * xstate feature has the highest offset in the buffer. Ensure * it does. */ WARN_ONCE(last_good_offset > xstate_offsets[i], "x86/fpu: misordered xstate at %d\n", last_good_offset); last_good_offset = xstate_offsets[i]; } } static void __init print_xstate_feature(u64 xstate_mask) { const char *feature_name; if (cpu_has_xfeatures(xstate_mask, &feature_name)) pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); } /* * Print out all the supported xstate features: */ static void __init print_xstate_features(void) { print_xstate_feature(XFEATURE_MASK_FP); print_xstate_feature(XFEATURE_MASK_SSE); print_xstate_feature(XFEATURE_MASK_YMM); print_xstate_feature(XFEATURE_MASK_BNDREGS); print_xstate_feature(XFEATURE_MASK_BNDCSR); print_xstate_feature(XFEATURE_MASK_OPMASK); print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); print_xstate_feature(XFEATURE_MASK_PASID); print_xstate_feature(XFEATURE_MASK_CET_USER); print_xstate_feature(XFEATURE_MASK_XTILE_CFG); print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } /* * This check is important because it is easy to get XSTATE_* * confused with XSTATE_BIT_*. */ #define CHECK_XFEATURE(nr) do { \ WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ WARN_ON(nr >= XFEATURE_MAX); \ } while (0) /* * Print out xstate component offsets and sizes */ static void __init print_xstate_offset_size(void) { int i; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), i, xstate_sizes[i]); } } /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ static __init void os_xrstor_booting(struct xregs_state *xstate) { u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; u32 lmask = mask; u32 hmask = mask >> 32; int err; if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); /* * We should never fault when copying from a kernel buffer, and the FPU * state we set at boot time should be valid. */ WARN_ON_FPU(err); } /* * All supported features have either init state all zeros or are * handled in setup_init_fpu() individually. This is an explicit * feature list and does not use XFEATURE_MASK*SUPPORTED to catch * newly added supported features at build time and make people * actually look at the init state for the new feature. */ #define XFEATURES_INIT_FPSTATE_HANDLED \ (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ XFEATURE_MASK_XTILE) /* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED) != XFEATURES_INIT_FPSTATE_HANDLED); if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; print_xstate_features(); xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); /* * Init all the features state with header.xfeatures being 0x0 */ os_xrstor_booting(&init_fpstate.regs.xsave); /* * All components are now in init state. Read the state back so * that init_fpstate contains all non-zero init state. This only * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because * those use the init optimization which skips writing data for * components in init state. * * XSAVE could be used, but that would require to reshuffle the * data when XSAVEC/S is available because XSAVEC/S uses xstate * compaction. But doing so is a pointless exercise because most * components have an all zeros init state except for the legacy * ones (FP and SSE). Those can be saved with FXSAVE into the * legacy area. Adding new features requires to ensure that init * state is all zeroes or if not to add the necessary handling * here. */ fxsave(&init_fpstate.regs.fxsave); } int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; CHECK_XFEATURE(xfeature_nr); cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx); return eax; } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ static int validate_user_xstate_header(const struct xstate_header *hdr, struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ if (hdr->xcomp_bv) return -EINVAL; /* * If 'reserved' is shrunken to add a new field, make sure to validate * that new field here! */ BUILD_BUG_ON(sizeof(hdr->reserved) != 48); /* No reserved bits may be set */ if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) return -EINVAL; return 0; } static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; static int should_dump = 1; if (!should_dump) return; should_dump = 0; /* * Dump out a few leaves past the ones that we support * just in case there are some goodies up there */ for (i = 0; i < XFEATURE_MAX + 10; i++) { cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx); } } #define XSTATE_WARN_ON(x, fmt, ...) do { \ if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ __xstate_dump_leaves(); \ } \ } while (0) #define XCHECK_SZ(sz, nr, __struct) ({ \ if (WARN_ONCE(sz != sizeof(__struct), \ "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ xfeature_names[nr], sizeof(__struct), sz)) { \ __xstate_dump_leaves(); \ } \ true; \ }) /** * check_xtile_data_against_struct - Check tile data state size. * * Calculate the state size by multiplying the single tile size which is * recorded in a C struct, and the number of tiles that the CPU informs. * Compare the provided size with the calculation. * * @size: The tile data state size * * Returns: 0 on success, -EINVAL on mismatch. */ static int __init check_xtile_data_against_struct(int size) { u32 max_palid, palid, state_size; u32 eax, ebx, ecx, edx; u16 max_tile; /* * Check the maximum palette id: * eax: the highest numbered palette subleaf. */ cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx); /* * Cross-check each tile size and find the maximum number of * supported tiles. */ for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { u16 tile_size, max; /* * Check the tile size info: * eax[31:16]: bytes per title * ebx[31:16]: the max names (or max number of tiles) */ cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx); tile_size = eax >> 16; max = ebx >> 16; if (tile_size != sizeof(struct xtile_data)) { pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", __stringify(XFEATURE_XTILE_DATA), sizeof(struct xtile_data), tile_size); __xstate_dump_leaves(); return -EINVAL; } if (max > max_tile) max_tile = max; } state_size = sizeof(struct xtile_data) * max_tile; if (size != state_size) { pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", __stringify(XFEATURE_XTILE_DATA), state_size, size); __xstate_dump_leaves(); return -EINVAL; } return 0; } /* * We have a C struct for each 'xstate'. We need to ensure * that our software representation matches what the CPU * tells us about the state's size. */ static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. */ int sz = xfeature_size(nr); /* * Match each CPU state with the corresponding software * structure. */ switch (nr) { case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } return true; } static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { unsigned int topmost = fls64(xfeatures) - 1; unsigned int offset = xstate_offsets[topmost]; if (topmost <= XFEATURE_SSE) return sizeof(struct xregs_state); if (compacted) offset = xfeature_get_offset(xfeatures, topmost); return offset + xstate_sizes[topmost]; } /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. * * Independent XSAVE features allocate their own buffers and are not * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (!check_xstate_against_struct(i)) return false; /* * Supervisor state components can be managed only by * XSAVES. */ if (!xsaves && xfeature_is_supervisor(i)) { XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); return false; } } size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); XSTATE_WARN_ON(size != kernel_size, "size %u != kernel_size %u\n", size, kernel_size); return size == kernel_size; } /* * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. * * This also takes compaction into account. So this works for * XSAVEC as well. */ static unsigned int __init get_compacted_size(void) { unsigned int eax, ebx, ecx, edx; /* * - CPUID function 0DH, sub-function 1: * EBX enumerates the size (in bytes) required by * the XSAVES instruction for an XSAVE area * containing all the state components * corresponding to bits currently set in * XCR0 | IA32_XSS. * * When XSAVES is not available but XSAVEC is (virt), then there * are no supervisor states, but XSAVEC still uses compacted * format. */ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); return ebx; } /* * Get the total size of the enabled xstates without the independent supervisor * features. */ static unsigned int __init get_xsave_compacted_size(void) { u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) return get_compacted_size(); /* Disable independent features. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); /* * Ask the hardware what size is required of the buffer. * This is the size required for the task->fpu buffer. */ size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); return size; } static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* * - CPUID function 0DH, sub-function 0: * EBX enumerates the size (in bytes) required by * the XSAVE instruction for an XSAVE area * containing all the *user* state components * corresponding to bits currently set in XCR0. */ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); return ebx; } static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ unsigned int user_size, kernel_size, kernel_default_size; bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* * XSAVES kernel size includes supervisor states and uses compacted * format. XSAVEC uses compacted format, but does not save * supervisor states. * * XSAVE[OPT] do not support supervisor states so kernel and user * size is identical. */ if (compacted) kernel_size = get_xsave_compacted_size(); else kernel_size = user_size; kernel_default_size = xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; fpu_kernel_cfg.max_size = kernel_size; fpu_user_cfg.max_size = user_size; fpu_kernel_cfg.default_size = kernel_default_size; fpu_user_cfg.default_size = xstate_calculate_size(fpu_user_cfg.default_features, false); return 0; } /* * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); /* Restore the legacy size.*/ fpu_kernel_cfg.max_size = legacy_size; fpu_kernel_cfg.default_size = legacy_size; fpu_user_cfg.max_size = legacy_size; fpu_user_cfg.default_size = legacy_size; /* * Prevent enabling the static branch which enables writes to the * XFD MSR. */ init_fpstate.xfd = 0; fpstate_reset(¤t->thread.fpu); } /* * Enable and initialize the xsave feature. * Called once per system bootup. */ void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; u64 xfeatures; int err; int i; if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; } if (!boot_cpu_has(X86_FEATURE_XSAVE)) { pr_info("x86/fpu: x87 FPU will use %s\n", boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); return; } /* * Find user xstates supported by the processor. */ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", fpu_kernel_cfg.max_features); goto out_disable; } fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & XFEATURE_MASK_INDEPENDENT; /* * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { unsigned short cid = xsave_cpuid_features[i]; /* Careful: X86_FEATURE_FPU is 0! */ if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } if (!cpu_feature_enabled(X86_FEATURE_XFD)) fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; else fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; /* Clean out dynamic features from default */ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; fpu_user_cfg.default_features = fpu_user_cfg.max_features; fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; /* Store it for paranoia check at the end */ xfeatures = fpu_kernel_cfg.max_features; /* * Initialize the default XFD state in initfp_state and enable the * dynamic sizing mechanism if dynamic states are available. The * static key cannot be enabled here because this runs before * jump_label_init(). This is delayed to an initcall. */ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; /* Set up compaction feature bit */ if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || cpu_feature_enabled(X86_FEATURE_XSAVES)) setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); /* Cache size, offset and flags for initialization */ setup_xstate_cache(); err = init_xstate_size(); if (err) goto out_disable; /* Reset the state for the current task */ fpstate_reset(¤t->thread.fpu); /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); /* * init_fpstate excludes dynamic states as they are large but init * state is zero. */ init_fpstate.size = fpu_kernel_cfg.default_size; init_fpstate.xfeatures = fpu_kernel_cfg.default_features; if (init_fpstate.size > sizeof(init_fpstate.regs)) { pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", sizeof(init_fpstate.regs), init_fpstate.size); goto out_disable; } setup_init_fpu_buf(); /* * Paranoia check whether something in the setup modified the * xfeatures mask. */ if (xfeatures != fpu_kernel_cfg.max_features) { pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } /* * CPU capabilities initialization runs before FPU init. So * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely * functional, set the feature bit so depending code works. */ setup_force_cpu_cap(X86_FEATURE_OSXSAVE); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ fpu__init_disable_system_xstate(legacy_size); } /* * Restore minimal FPU state after suspend: */ void fpu__resume_cpu(void) { /* * Restore XCR0 on xsave capable CPUs: */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * Restore IA32_XSS. The same CPUID bit enumerates support * of XSAVES and MSR_IA32_XSS. */ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } if (fpu_state_size_dynamic()) wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); } /* * Given an xstate feature nr, calculate where in the xsave * buffer the state is. Callers should ensure that the buffer * is valid. */ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { u64 xcomp_bv = xsave->header.xcomp_bv; if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) return NULL; } return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); } /* * Given the xsave area and a state inside, this function returns the * address of the state. * * This is the API that is called to get xstate address in either * standard format or compacted format of xsave area. * * Note that if there is no data for the field in the xsave buffer * this will return NULL. * * Inputs: * xstate: the thread's storage area for all FPU data * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, * XFEATURE_SSE, etc...) * Output: * address of the state in the xsave area, or NULL if the * field is not present in the xsave buffer. */ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { /* * Do we even *have* xsave state? */ if (!boot_cpu_has(X86_FEATURE_XSAVE)) return NULL; /* * We should not ever be requesting features that we * have not enabled. */ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; /* * This assumes the last 'xsave*' instruction to * have requested that 'xfeature_nr' be saved. * If it did not, we might be seeing and old value * of the field in the buffer. * * This can happen because the last 'xsave' did not * request that this feature be saved (unlikely) * or because the "init optimization" caused it * to not be saved. */ if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) return NULL; return __raw_xsave_addr(xsave, xfeature_nr); } EXPORT_SYMBOL_GPL(get_xsave_addr); /* * Given an xstate feature nr, calculate where in the xsave buffer the state is. * The xsave buffer should be in standard format, not compacted (e.g. user mode * signal frames). */ void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr) { if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; return (void __user *)xsave + xstate_offsets[xfeature_nr]; } #ifdef CONFIG_ARCH_HAS_PKEYS /* * This will go out and modify PKRU register to set the access * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { u32 old_pkru, new_pkru_bits = 0; int pkey_shift; /* * This check implies XSAVE support. OSPKE only gets * set if we enable XSAVE and we enable PKU in XCR0. */ if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return -EINVAL; /* * This code should only be called with valid 'pkey' * values originating from in-kernel users. Complain * if a bad value is observed. */ if (WARN_ON_ONCE(pkey >= arch_max_pkey())) return -EINVAL; /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) new_pkru_bits |= PKRU_AD_BIT; if (init_val & PKEY_DISABLE_WRITE) new_pkru_bits |= PKRU_WD_BIT; /* Shift the bits in to the correct place in PKRU for pkey: */ pkey_shift = pkey * PKRU_BITS_PER_PKEY; new_pkru_bits <<= pkey_shift; /* Get old PKRU and mask off any old bits in place: */ old_pkru = read_pkru(); old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); /* Write old part along with new part: */ write_pkru(old_pkru | new_pkru_bits); return 0; } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, void *init_xstate, unsigned int size) { membuf_write(to, from_xstate ? xstate : init_xstate, size); } /** * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @fpstate: The fpstate buffer from which to copy * @xfeatures: The mask of xfeatures to save (XSAVE mode only) * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming * format, i.e. from the kernel internal hardware dependent storage format * to the requested @mode. UABI XSTATE is always uncompacted! * * It supports partial copy but @to.pos always starts from zero. */ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u64 xfeatures, u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; struct xregs_state *xsave = &fpstate->regs.xsave; struct xstate_header header; unsigned int zerofrom; u64 mask; int i; memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; /* Mask out the feature bits depending on copy mode */ switch (copy_mode) { case XSTATE_COPY_FP: header.xfeatures &= XFEATURE_MASK_FP; break; case XSTATE_COPY_FX: header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; break; case XSTATE_COPY_XSAVE: header.xfeatures &= fpstate->user_xfeatures & xfeatures; break; } /* Copy FP state up to MXCSR */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, &xinit->i387, off_mxcsr); /* Copy MXCSR when SSE or YMM are set in the feature mask */ copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, MXCSR_AND_FLAGS_SIZE); /* Copy the remaining FP state */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387.st_space, &xinit->i387.st_space, sizeof(xsave->i387.st_space)); /* Copy the SSE state - shared with YMM, but independently managed */ copy_feature(header.xfeatures & XFEATURE_MASK_SSE, &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, sizeof(xsave->i387.xmm_space)); if (copy_mode != XSTATE_COPY_XSAVE) goto out; /* Zero the padding area */ membuf_zero(&to, sizeof(xsave->i387.padding)); /* Copy xsave->i387.sw_reserved */ membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); /* Copy the user space relevant state of @xsave->header */ membuf_write(&to, &header, sizeof(header)); zerofrom = offsetof(struct xregs_state, extended_state_area); /* * This 'mask' indicates which states to copy from fpstate. * Those extended states that are not present in fpstate are * either disabled or initialized: * * In non-compacted format, disabled features still occupy * state space but there is no state to copy from in the * compacted init_fpstate. The gap tracking will zero these * states. * * The extended features have an all zeroes init state. Thus, * remove them from 'mask' to zero those features in the user * buffer instead of retrieving them from init_fpstate. */ mask = header.xfeatures; for_each_extended_xfeature(i, mask) { /* * If there was a feature or alignment gap, zero the space * in the destination buffer. */ if (zerofrom < xstate_offsets[i]) membuf_zero(&to, xstate_offsets[i] - zerofrom); if (i == XFEATURE_PKRU) { struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the * XSAVE buffer. Use the provided value. */ pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else { membuf_write(&to, __raw_xsave_addr(xsave, i), xstate_sizes[i]); } /* * Keep track of the last copied state in the non-compacted * target buffer for gap zeroing. */ zerofrom = xstate_offsets[i] + xstate_sizes[i]; } out: if (to.left) membuf_zero(&to, to.left); } /** * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @tsk: The task from which to copy the saved xstate * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming * format, i.e. from the kernel internal hardware dependent storage format * to the requested @mode. UABI XSTATE is always uncompacted! * * It supports partial copy but @to.pos always starts from zero. */ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, tsk->thread.fpu.fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, const void *kbuf, const void __user *ubuf) { if (kbuf) { memcpy(dst, kbuf + offset, size); } else { if (copy_from_user(dst, ubuf + offset, size)) return -EFAULT; } return 0; } /** * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate * @fpstate: The fpstate buffer to copy to * @kbuf: The UABI format buffer, if it comes from the kernel * @ubuf: The UABI format buffer, if it comes from userspace * @pkru: The location to write the PKRU value to * * Converts from the UABI format into the kernel internal hardware * dependent format. * * This function ultimately has three different callers with distinct PKRU * behavior. * 1. When called from sigreturn the PKRU register will be restored from * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to * @fpstate is sufficient to cover this case, but the caller will also * pass a pointer to the thread_struct's pkru field in @pkru and updating * it is harmless. * 2. When called from ptrace the PKRU register will be restored from the * thread_struct's pkru field. A pointer to that is passed in @pkru. * The kernel will restore it manually, so the XRSTOR behavior that resets * the PKRU register to the hardware init value (0) if the corresponding * xfeatures bit is not set is emulated here. * 3. When called from KVM the PKRU register will be restored from the vcpu's * pkru field. A pointer to that is passed in @pkru. KVM hasn't used * XRSTOR and hasn't had the PKRU resetting behavior described above. To * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures * bit is not set. */ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, const void __user *ubuf, u32 *pkru) { struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; struct xstate_header hdr; u64 mask; int i; offset = offsetof(struct xregs_state, header); if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) return -EFAULT; if (validate_user_xstate_header(&hdr, fpstate)) return -EINVAL; /* Validate MXCSR when any of the related features is in use */ mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; if (hdr.xfeatures & mask) { u32 mxcsr[2]; offset = offsetof(struct fxregs_state, mxcsr); if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) return -EFAULT; /* Reserved bits in MXCSR must be zero. */ if (mxcsr[0] & ~mxcsr_feature_mask) return -EINVAL; /* SSE and YMM require MXCSR even when FP is not in use. */ if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { xsave->i387.mxcsr = mxcsr[0]; xsave->i387.mxcsr_mask = mxcsr[1]; } } for (i = 0; i < XFEATURE_MAX; i++) { mask = BIT_ULL(i); if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) return -EFAULT; } } if (hdr.xfeatures & XFEATURE_MASK_PKRU) { struct pkru_state *xpkru; xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU); *pkru = xpkru->pkru; } else { /* * KVM may pass NULL here to indicate that it does not need * PKRU updated. */ if (pkru) *pkru = 0; } /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: */ xsave->header.xfeatures |= hdr.xfeatures; return 0; } /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] * format and copy to the target thread. Used by ptrace and KVM. */ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru) { return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru); } /* * Convert from a sigreturn standard-format user-space buffer to kernel * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) { u64 xchk; if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) return false; xchk = ~xfeatures_mask_independent(); if (WARN_ON_ONCE(!mask || mask & xchk)) return false; return true; } /** * xsaves - Save selected components to a kernel xstate buffer * @xstate: Pointer to the buffer * @mask: Feature mask to select the components to save * * The @xstate buffer must be 64 byte aligned and correctly initialized as * XSAVES does not write the full xstate header. Before first use the * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer * can #GP. * * The feature mask must be a subset of the independent features. */ void xsaves(struct xregs_state *xstate, u64 mask) { int err; if (!validate_independent_components(mask)) return; XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } /** * xrstors - Restore selected components from a kernel xstate buffer * @xstate: Pointer to the buffer * @mask: Feature mask to select the components to restore * * The @xstate buffer must be 64 byte aligned and correctly initialized * otherwise XRSTORS from that buffer can #GP. * * Proper usage is to restore the state which was saved with * xsaves() into @xstate. * * The feature mask must be a subset of the independent features. */ void xrstors(struct xregs_state *xstate, u64 mask) { int err; if (!validate_independent_components(mask)) return; XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } #if IS_ENABLED(CONFIG_KVM) void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) { void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); if (addr) memset(addr, 0, xstate_sizes[xfeature]); } EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); #endif #ifdef CONFIG_X86_64 #ifdef CONFIG_X86_DEBUG_FPU /* * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask * can safely operate on the @fpstate buffer. */ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) { u64 xfd = __this_cpu_read(xfd_state); if (fpstate->xfd == xfd) return true; /* * The XFD MSR does not match fpstate->xfd. That's invalid when * the passed in fpstate is current's fpstate. */ if (fpstate->xfd == current->thread.fpu.fpstate->xfd) return false; /* * XRSTOR(S) from init_fpstate are always correct as it will just * bring all components into init state and not read from the * buffer. XSAVE(S) raises #PF after init. */ if (fpstate == &init_fpstate) return rstor; /* * XSAVE(S): clone(), fpu_swap_kvm_fpstate() * XRSTORS(S): fpu_swap_kvm_fpstate() */ /* * No XSAVE/XRSTOR instructions (except XSAVE itself) touch * the buffer area for XFD-disabled state components. */ mask &= ~xfd; /* * Remove features which are valid in fpstate. They * have space allocated in fpstate. */ mask &= ~fpstate->xfeatures; /* * Any remaining state components in 'mask' might be written * by XSAVE/XRSTOR. Fail validation it found. */ return !mask; } void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); } #endif /* CONFIG_X86_DEBUG_FPU */ static int __init xfd_update_static_branch(void) { /* * If init_fpstate.xfd has bits set then dynamic features are * available and the dynamic sizing must be enabled. */ if (init_fpstate.xfd) static_branch_enable(&__fpu_state_size_dynamic); return 0; } arch_initcall(xfd_update_static_branch) void fpstate_free(struct fpu *fpu) { if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) vfree(fpu->fpstate); } /** * fpstate_realloc - Reallocate struct fpstate for the requested new features * * @xfeatures: A bitmap of xstate features which extend the enabled features * of that task * @ksize: The required size for the kernel buffer * @usize: The required size for user space buffers * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations * * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer * terminates quickly, vfree()-induced IPIs may be a concern, but tasks * with large states are likely to live longer. * * Returns: 0 on success, -ENOMEM on allocation error. */ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, unsigned int usize, struct fpu_guest *guest_fpu) { struct fpu *fpu = ¤t->thread.fpu; struct fpstate *curfps, *newfps = NULL; unsigned int fpsize; bool in_use; fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); newfps = vzalloc(fpsize); if (!newfps) return -ENOMEM; newfps->size = ksize; newfps->user_size = usize; newfps->is_valloc = true; /* * When a guest FPU is supplied, use @guest_fpu->fpstate * as reference independent whether it is in use or not. */ curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate; /* Determine whether @curfps is the active fpstate */ in_use = fpu->fpstate == curfps; if (guest_fpu) { newfps->is_guest = true; newfps->is_confidential = curfps->is_confidential; newfps->in_use = curfps->in_use; guest_fpu->xfeatures |= xfeatures; guest_fpu->uabi_size = usize; } fpregs_lock(); /* * If @curfps is in use, ensure that the current state is in the * registers before swapping fpstate as that might invalidate it * due to layout changes. */ if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; newfps->xfd = curfps->xfd & ~xfeatures; /* Do the final updates within the locked region */ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); if (guest_fpu) { guest_fpu->fpstate = newfps; /* If curfps is active, update the FPU fpstate pointer */ if (in_use) fpu->fpstate = newfps; } else { fpu->fpstate = newfps; } if (in_use) xfd_update_state(fpu->fpstate); fpregs_unlock(); /* Only free valloc'ed state */ if (curfps && curfps->is_valloc) vfree(curfps); return 0; } static int validate_sigaltstack(unsigned int usize) { struct task_struct *thread, *leader = current->group_leader; unsigned long framesize = get_sigframe_size(); lockdep_assert_held(¤t->sighand->siglock); /* get_sigframe_size() is based on fpu_user_cfg.max_size */ framesize -= fpu_user_cfg.max_size; framesize += usize; for_each_thread(leader, thread) { if (thread->sas_ss_size && thread->sas_ss_size < framesize) return -ENOSPC; } return 0; } static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) { /* * This deliberately does not exclude !XSAVES as we still might * decide to optionally context switch XCR0 or talk the silicon * vendors into extending XFD for the pre AMX states, especially * AVX512. */ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); struct fpu *fpu = ¤t->group_leader->thread.fpu; struct fpu_state_perm *perm; unsigned int ksize, usize; u64 mask; int ret = 0; /* Check whether fully enabled */ if ((permitted & requested) == requested) return 0; /* Calculate the resulting kernel state size */ mask = permitted | requested; /* Take supervisor states into account on the host */ if (!guest) mask |= xfeatures_mask_supervisor(); ksize = xstate_calculate_size(mask, compacted); /* Calculate the resulting user state size */ mask &= XFEATURE_MASK_USER_SUPPORTED; usize = xstate_calculate_size(mask, false); if (!guest) { ret = validate_sigaltstack(usize); if (ret) return ret; } perm = guest ? &fpu->guest_perm : &fpu->perm; /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ WRITE_ONCE(perm->__state_perm, mask); /* Protected by sighand lock */ perm->__state_size = ksize; perm->__user_state_size = usize; return ret; } /* * Permissions array to map facilities with more than one component */ static const u64 xstate_prctl_req[XFEATURE_MAX] = { [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, }; static int xstate_request_perm(unsigned long idx, bool guest) { u64 permitted, requested; int ret; if (idx >= XFEATURE_MAX) return -EINVAL; /* * Look up the facility mask which can require more than * one xstate component. */ idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); requested = xstate_prctl_req[idx]; if (!requested) return -EOPNOTSUPP; if ((fpu_user_cfg.max_features & requested) != requested) return -EOPNOTSUPP; /* Lockless quick check */ permitted = xstate_get_group_perm(guest); if ((permitted & requested) == requested) return 0; /* Protect against concurrent modifications */ spin_lock_irq(¤t->sighand->siglock); permitted = xstate_get_group_perm(guest); /* First vCPU allocation locks the permissions. */ if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) ret = -EBUSY; else ret = __xstate_request_perm(permitted, requested, guest); spin_unlock_irq(¤t->sighand->siglock); return ret; } int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) { u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; struct fpu_state_perm *perm; unsigned int ksize, usize; struct fpu *fpu; if (!xfd_event) { if (!guest_fpu) pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); return 0; } /* Protect against concurrent modifications */ spin_lock_irq(¤t->sighand->siglock); /* If not permitted let it die */ if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { spin_unlock_irq(¤t->sighand->siglock); return -EPERM; } fpu = ¤t->group_leader->thread.fpu; perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; ksize = perm->__state_size; usize = perm->__user_state_size; /* * The feature is permitted. State size is sufficient. Dropping * the lock is safe here even if more features are added from * another task, the retrieved buffer sizes are valid for the * currently requested feature(s). */ spin_unlock_irq(¤t->sighand->siglock); /* * Try to allocate a new fpstate. If that fails there is no way * out. */ if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) return -EFAULT; return 0; } int xfd_enable_feature(u64 xfd_err) { return __xfd_enable_feature(xfd_err, NULL); } #else /* CONFIG_X86_64 */ static inline int xstate_request_perm(unsigned long idx, bool guest) { return -EPERM; } #endif /* !CONFIG_X86_64 */ u64 xstate_get_guest_group_perm(void) { return xstate_get_group_perm(true); } EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); /** * fpu_xstate_prctl - xstate permission operations * @option: A subfunction of arch_prctl() * @arg2: option argument * Return: 0 if successful; otherwise, an error code * * Option arguments: * * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info * ARCH_REQ_XCOMP_PERM: Facility number requested * * For facilities which require more than one XSTATE component, the request * must be the highest state component number related to that facility, * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). */ long fpu_xstate_prctl(int option, unsigned long arg2) { u64 __user *uptr = (u64 __user *)arg2; u64 permitted, supported; unsigned long idx = arg2; bool guest = false; switch (option) { case ARCH_GET_XCOMP_SUPP: supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; return put_user(supported, uptr); case ARCH_GET_XCOMP_PERM: /* * Lockless snapshot as it can also change right after the * dropping the lock. */ permitted = xstate_get_host_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_GET_XCOMP_GUEST_PERM: permitted = xstate_get_guest_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_REQ_XCOMP_GUEST_PERM: guest = true; fallthrough; case ARCH_REQ_XCOMP_PERM: if (!IS_ENABLED(CONFIG_X86_64)) return -EOPNOTSUPP; return xstate_request_perm(idx, guest); default: return -EINVAL; } } #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 * use in the task. */ static void avx512_status(struct seq_file *m, struct task_struct *task) { unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); long delta; if (!timestamp) { /* * Report -1 if no AVX512 usage */ delta = -1; } else { delta = (long)(jiffies - timestamp); /* * Cap to LONG_MAX if time difference > LONG_MAX */ if (delta < 0) delta = LONG_MAX; delta = jiffies_to_msecs(delta); } seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); seq_putc(m, '\n'); } /* * Report architecture specific information */ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { /* * Report AVX512 state if the processor and build option supported. */ if (cpu_feature_enabled(X86_FEATURE_AVX512F)) avx512_status(m, task); return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ #ifdef CONFIG_COREDUMP static const char owner_name[] = "LINUX"; /* * Dump type, size, offset and flag values for every xfeature that is present. */ static int dump_xsave_layout_desc(struct coredump_params *cprm) { int num_records = 0; int i; for_each_extended_xfeature(i, fpu_user_cfg.max_features) { struct x86_xfeat_component xc = { .type = i, .size = xstate_sizes[i], .offset = xstate_offsets[i], /* reserved for future use */ .flags = 0, }; if (!dump_emit(cprm, &xc, sizeof(xc))) return 0; num_records++; } return num_records; } static u32 get_xsave_desc_size(void) { u32 cnt = 0; u32 i; for_each_extended_xfeature(i, fpu_user_cfg.max_features) cnt++; return cnt * (sizeof(struct x86_xfeat_component)); } int elf_coredump_extra_notes_write(struct coredump_params *cprm) { int num_records = 0; struct elf_note en; if (!fpu_user_cfg.max_features) return 0; en.n_namesz = sizeof(owner_name); en.n_descsz = get_xsave_desc_size(); en.n_type = NT_X86_XSAVE_LAYOUT; if (!dump_emit(cprm, &en, sizeof(en))) return 1; if (!dump_emit(cprm, owner_name, en.n_namesz)) return 1; if (!dump_align(cprm, 4)) return 1; num_records = dump_xsave_layout_desc(cprm); if (!num_records) return 1; /* Total size should be equal to the number of records */ if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz) return 1; return 0; } int elf_coredump_extra_notes_size(void) { int size; if (!fpu_user_cfg.max_features) return 0; /* .note header */ size = sizeof(struct elf_note); /* Name plus alignment to 4 bytes */ size += roundup(sizeof(owner_name), 4); size += get_xsave_desc_size(); return size; } #endif /* CONFIG_COREDUMP */ |
199 201 199 199 200 201 199 200 200 387 156 156 1 155 1 154 137 1 137 524 1 522 525 324 318 57 4 4 4 520 520 519 506 14 300 352 511 8 1 510 9 518 412 102 54 4 351 1 125 225 333 17 307 58 5 285 246 242 5 238 134 105 239 3 237 237 12 226 155 74 132 98 239 202 1 16 185 186 193 133 425 33 1 377 24 32 7 1 396 237 312 323 324 121 76 77 269 8 130 72 72 302 41 41 40 1 41 50 15 38 38 4 2 1 13 22 33 428 1382 344 1260 1266 1265 2 2 2 209 45 166 166 81 171 201 2 183 31 49 83 94 21 268 187 1 30 1 11 2 90 8 81 221 220 3 1 231 577 44 44 44 44 1 43 43 42 7 43 43 3 8 5 34 9 9 25 34 34 34 56 56 55 12 2 8 2 36 36 36 36 4 4 30 9 4 5 132 132 132 132 132 132 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * PF_INET protocol family socket handler. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Changes (see also sock.c) * * piggy, * Karl Knutson : Socket protocol table * A.N.Kuznetsov : Socket death error in accept(). * John Richardson : Fix non blocking error in connect() * so sockets that fail to connect * don't return -EINPROGRESS. * Alan Cox : Asynchronous I/O support * Alan Cox : Keep correct socket pointer on sock * structures * when accept() ed * Alan Cox : Semantics of SO_LINGER aren't state * moved to close when you look carefully. * With this fixed and the accept bug fixed * some RPC stuff seems happier. * Niibe Yutaka : 4.4BSD style write async I/O * Alan Cox, * Tony Gale : Fixed reuse semantics. * Alan Cox : bind() shouldn't abort existing but dead * sockets. Stops FTP netin:.. I hope. * Alan Cox : bind() works correctly for RAW sockets. * Note that FreeBSD at least was broken * in this respect so be careful with * compatibility tests... * Alan Cox : routing cache support * Alan Cox : memzero the socket structure for * compactness. * Matt Day : nonblock connect error handler * Alan Cox : Allow large numbers of pending sockets * (eg for big web sites), but only if * specifically application requested. * Alan Cox : New buffering throughout IP. Used * dumbly. * Alan Cox : New buffering now used smartly. * Alan Cox : BSD rather than common sense * interpretation of listen. * Germano Caronni : Assorted small races. * Alan Cox : sendmsg/recvmsg basic support. * Alan Cox : Only sendmsg/recvmsg now supported. * Alan Cox : Locked down bind (see security list). * Alan Cox : Loosened bind a little. * Mike McLagan : ADD/DEL DLCI Ioctls * Willy Konynenberg : Transparent proxying support. * David S. Miller : New socket lookup architecture. * Some other random speedups. * Cyrus Durgin : Cleaned up file for kmod hacks. * Andi Kleen : Fix inet_stream_connect TCP race. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/err.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/sched.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/capability.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/igmp.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <net/checksum.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> #include <net/route.h> #include <net/ip_fib.h> #include <net/inet_connection_sock.h> #include <net/gro.h> #include <net/gso.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/ping.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/raw.h> #include <net/icmp.h> #include <net/inet_common.h> #include <net/ip_tunnels.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/secure_seq.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif #include <net/l3mdev.h> #include <net/compat.h> #include <net/rps.h> #include <trace/events/sock.h> /* The inetsw table contains everything that inet_create needs to * build a new socket. */ static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); /* New destruction routine */ void inet_sock_destruct(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_error_queue); sk_mem_reclaim_final(sk); if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { pr_err("Attempt to release TCP socket in state %d %p\n", sk->sk_state, sk); return; } if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive inet socket %p\n", sk); return; } WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); WARN_ON_ONCE(sk->sk_wmem_queued); WARN_ON_ONCE(sk->sk_forward_alloc); kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); } EXPORT_SYMBOL(inet_sock_destruct); /* * The routines beyond this point handle the behaviour of an AF_INET * socket object. Mostly it punts to the subprotocols of IP to do * the work. */ /* * Automatically bind an unbound socket. */ static int inet_autobind(struct sock *sk) { struct inet_sock *inet; /* We may need to bind the socket. */ lock_sock(sk); inet = inet_sk(sk); if (!inet->inet_num) { if (sk->sk_prot->get_port(sk, 0)) { release_sock(sk); return -EAGAIN; } inet->inet_sport = htons(inet->inet_num); } release_sock(sk); return 0; } int __inet_listen_sk(struct sock *sk, int backlog) { unsigned char old_state = sk->sk_state; int err, tcp_fastopen; if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) return -EINVAL; WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { /* Enable TFO w/o requiring TCP_FASTOPEN socket option. * Note that only TCP sockets (SOCK_STREAM) will reach here. * Also fastopen backlog may already been set via the option * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). */ tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk); if (err) return err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } return 0; } /* * Move a socket into listening state. */ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int err = -EINVAL; lock_sock(sk); if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto out; err = __inet_listen_sk(sk, backlog); out: release_sock(sk); return err; } EXPORT_SYMBOL(inet_listen); /* * Create an inet socket. */ static int inet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct inet_protosw *answer; struct inet_sock *inet; struct proto *answer_prot; unsigned char answer_flags; int try_loading_module = 0; int err; if (protocol < 0 || protocol >= IPPROTO_MAX) return -EINVAL; sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { err = 0; /* Check the non-wild match. */ if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) break; } else { /* Check for the two wild cases. */ if (IPPROTO_IP == protocol) { protocol = answer->protocol; break; } if (IPPROTO_IP == answer->protocol) break; } err = -EPROTONOSUPPORT; } if (unlikely(err)) { if (try_loading_module < 2) { rcu_read_unlock(); /* * Be more specific, e.g. net-pf-2-proto-132-type-1 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) */ if (++try_loading_module == 1) request_module("net-pf-%d-proto-%d-type-%d", PF_INET, protocol, sock->type); /* * Fall back to generic, e.g. net-pf-2-proto-132 * (net-pf-PF_INET-proto-IPPROTO_SCTP) */ else request_module("net-pf-%d-proto-%d", PF_INET, protocol); goto lookup_protocol; } else goto out_rcu_unlock; } err = -EPERM; if (sock->type == SOCK_RAW && !kern && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; answer_flags = answer->flags; rcu_read_unlock(); WARN_ON(!answer_prot->slab); err = -ENOMEM; sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; err = 0; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; if (INET_PROTOSW_ICSK & answer_flags) inet_init_csk_locks(sk); inet = inet_sk(sk); inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); inet_clear_bit(NODEFRAG, sk); if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) inet_set_bit(HDRINCL, sk); } if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; atomic_set(&inet->inet_id, 0); sock_init_data(sock, sk); sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); inet->uc_ttl = -1; inet_set_bit(MC_LOOP, sk); inet->mc_ttl = 1; inet_set_bit(MC_ALL, sk); inet->mc_index = 0; inet->mc_list = NULL; inet->rcv_tos = 0; if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */ inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ err = sk->sk_prot->hash(sk); if (err) goto out_sk_release; } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); if (err) goto out_sk_release; } if (!kern) { err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); if (err) goto out_sk_release; } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; out_sk_release: sk_common_release(sk); sock->sk = NULL; goto out; } /* * The peer socket should always be NULL (or else). When we call this * function we are destroying the object and from then on nobody * should refer to it. */ int inet_release(struct socket *sock) { struct sock *sk = sock->sk; if (sk) { long timeout; if (!sk->sk_kern_sock) BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); /* If linger is set, we don't return until the close * is complete. Otherwise we return immediately. The * actually closing is done the same either way. * * If the close is due to the process exiting, we never * linger.. */ timeout = 0; if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) timeout = sk->sk_lingertime; sk->sk_prot->close(sk, timeout); sock->sk = NULL; } return 0; } EXPORT_SYMBOL(inet_release); int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) { u32 flags = BIND_WITH_LOCK; int err; /* If the socket has its own bind function then use it. (RAW) */ if (sk->sk_prot->bind) { return sk->sk_prot->bind(sk, uaddr, addr_len); } if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len, CGROUP_INET4_BIND, &flags); if (err) return err; return __inet_bind(sk, uaddr, addr_len, flags); } int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { return inet_bind_sk(sock->sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_bind); int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); unsigned short snum; int chk_addr_ret; u32 tb_id = RT_TABLE_LOCAL; int err; if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) * only if s_addr is INADDR_ANY. */ err = -EAFNOSUPPORT; if (addr->sin_family != AF_UNSPEC || addr->sin_addr.s_addr != htonl(INADDR_ANY)) goto out; } tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since * allowing applications to make a non-local bind solves * several problems with systems using dynamic addressing. * (ie. your servers still start up even if your ISDN link * is temporarily down) */ err = -EADDRNOTAVAIL; if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, chk_addr_ret)) goto out; snum = ntohs(addr->sin_port); err = -EACCES; if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; /* We keep a pair of addresses. rcv_saddr is the one * used by hash lookups, and saddr is used for transmit. * * In the BSD API these are the same except where it * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ err = -EINVAL; if (sk->sk_state != TCP_CLOSE || inet->inet_num) goto out_release_sock; inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { err = sk->sk_prot->get_port(sk, snum); if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; goto out_release_sock; } if (!(flags & BIND_FROM_BPF)) { err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; if (sk->sk_prot->put_port) sk->sk_prot->put_port(sk); goto out_release_sock; } } } if (inet->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->inet_sport = htons(inet->inet_num); inet->inet_daddr = 0; inet->inet_dport = 0; sk_dst_reset(sk); err = 0; out_release_sock: if (flags & BIND_WITH_LOCK) release_sock(sk); out: return err; } int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; const struct proto *prot; int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); if (uaddr->sa_family == AF_UNSPEC) return prot->disconnect(sk, flags); if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { err = prot->pre_connect(sk, uaddr, addr_len); if (err) return err; } if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) return -EAGAIN; return prot->connect(sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_dgram_connect); static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending += writebias; /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. * Connect() does not allow to get error notifications * without closing the socket. */ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); if (signal_pending(current) || !timeo) break; } remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending -= writebias; return timeo; } /* * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. */ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg) { struct sock *sk = sock->sk; int err; long timeo; /* * uaddr can be NULL and addr_len can be 0 if: * sk is a TCP fastopen active socket and * TCP_FASTOPEN_CONNECT sockopt is set and * we already have a valid cookie for this socket. * In this case, user can call write() after connect(). * write() will invoke tcp_sendmsg_fastopen() which calls * __inet_stream_connect(). */ if (uaddr) { if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) { sk->sk_disconnects++; err = sk->sk_prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; goto out; } } switch (sock->state) { default: err = -EINVAL; goto out; case SS_CONNECTED: err = -EISCONN; goto out; case SS_CONNECTING: if (inet_test_bit(DEFER_CONNECT, sk)) err = is_sendmsg ? -EINPROGRESS : -EISCONN; else err = -EALREADY; /* Fall out of switch with err, set for this state */ break; case SS_UNCONNECTED: err = -EISCONN; if (sk->sk_state != TCP_CLOSE) goto out; if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); if (err) goto out; } err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; sock->state = SS_CONNECTING; if (!err && inet_test_bit(DEFER_CONNECT, sk)) goto out; /* Just entered SS_CONNECTING state; the only * difference is that return value in non-blocking * case is EINPROGRESS, rather than EALREADY. */ err = -EINPROGRESS; break; } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { int writebias = (sk->sk_protocol == IPPROTO_TCP) && tcp_sk(sk)->fastopen_req && tcp_sk(sk)->fastopen_req->data ? 1 : 0; int dis = sk->sk_disconnects; /* Error code is set above */ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) goto out; err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; if (dis != sk->sk_disconnects) { err = -EPIPE; goto out; } } /* Connection was closed by RST, timeout, ICMP error * or another process disconnected us. */ if (sk->sk_state == TCP_CLOSE) goto sock_error; /* sk->sk_err may be not zero now, if RECVERR was ordered by user * and error was received after socket entered established state. * Hence, it is handled normally after connect() return successfully. */ sock->state = SS_CONNECTED; err = 0; out: return err; sock_error: err = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; sk->sk_disconnects++; if (sk->sk_prot->disconnect(sk, flags)) sock->state = SS_DISCONNECTING; goto out; } EXPORT_SYMBOL(__inet_stream_connect); int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { int err; lock_sock(sock->sk); err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); release_sock(sock->sk); return err; } EXPORT_SYMBOL(inet_stream_connect); void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk) { sock_rps_record_flow(newsk); WARN_ON(!((1 << newsk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_CLOSE))); if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) set_bit(SOCK_SUPPORT_ZC, &newsock->flags); sock_graft(newsk, newsock); newsock->state = SS_CONNECTED; } /* * Accept a pending connection. The TCP layer now gives BSD semantics. */ int inet_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk1 = sock->sk, *sk2; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ arg->err = -EINVAL; sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, arg); if (!sk2) return arg->err; lock_sock(sk2); __inet_accept(sock, newsock, sk2); release_sock(sk2); return 0; } EXPORT_SYMBOL(inet_accept); /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); int sin_addr_len = sizeof(*sin); sin->sin_family = AF_INET; lock_sock(sk); if (peer) { if (!inet->inet_dport || (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && peer == 1)) { release_sock(sk); return -ENOTCONN; } sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, CGROUP_INET4_GETPEERNAME); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, CGROUP_INET4_GETSOCKNAME); } release_sock(sk); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sin_addr_len; } EXPORT_SYMBOL(inet_getname); int inet_send_prepare(struct sock *sk) { sock_rps_record_flow(sk); /* We may need to bind the socket. */ if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind && inet_autobind(sk)) return -EAGAIN; return 0; } EXPORT_SYMBOL_GPL(inet_send_prepare); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; if (unlikely(inet_send_prepare(sk))) return -EAGAIN; return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg, sk, msg, size); } EXPORT_SYMBOL(inet_sendmsg); void inet_splice_eof(struct socket *sock) { const struct proto *prot; struct sock *sk = sock->sk; if (unlikely(inet_send_prepare(sk))) return; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); if (prot->splice_eof) prot->splice_eof(sock); } EXPORT_SYMBOL_GPL(inet_splice_eof); INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, size_t, int, int *)); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int addr_len = 0; int err; if (likely(!(flags & MSG_ERRQUEUE))) sock_rps_record_flow(sk); err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, sk, msg, size, flags, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; } EXPORT_SYMBOL(inet_recvmsg); int inet_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int err = 0; /* This should really check to make sure * the socket is a TCP socket. (WHY AC...) */ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and 1->2 bit 2 snds. 2->3 */ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ return -EINVAL; lock_sock(sk); if (sock->state == SS_CONNECTING) { if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) sock->state = SS_DISCONNECTING; else sock->state = SS_CONNECTED; } switch (sk->sk_state) { case TCP_CLOSE: err = -ENOTCONN; /* Hack to wake up other listeners, who can poll for EPOLLHUP, even on eg. unconnected UDP sockets -- RR */ fallthrough; default: WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how); if (sk->sk_prot->shutdown) sk->sk_prot->shutdown(sk, how); break; /* Remaining two branches are temporary solution for missing * close() in multithreaded environment. It is _not_ a good idea, * but we have no choice until close() is repaired at VFS level. */ case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; fallthrough; case TCP_SYN_SENT: err = sk->sk_prot->disconnect(sk, O_NONBLOCK); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; break; } /* Wake up anyone sleeping in poll. */ sk->sk_state_change(sk); release_sock(sk); return err; } EXPORT_SYMBOL(inet_shutdown); /* * ioctl() calls you can issue on an INET socket. Most of these are * device configuration and stuff and very rarely used. Some ioctls * pass on to the socket itself. * * NOTE: I like the idea of a module for the config stuff. ie ifconfig * loads the devconfigure module does its configuring and unloads it. * There's a good 20K of config code hanging around the kernel. */ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; int err = 0; struct net *net = sock_net(sk); void __user *p = (void __user *)arg; struct ifreq ifr; struct rtentry rt; switch (cmd) { case SIOCADDRT: case SIOCDELRT: if (copy_from_user(&rt, p, sizeof(struct rtentry))) return -EFAULT; err = ip_rt_ioctl(net, cmd, &rt); break; case SIOCRTMSG: err = -EINVAL; break; case SIOCDARP: case SIOCGARP: case SIOCSARP: err = arp_ioctl(net, cmd, (void __user *)arg); break; case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFNETMASK: case SIOCGIFDSTADDR: case SIOCGIFPFLAGS: if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); if (!err && put_user_ifreq(&ifr, p)) err = -EFAULT; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFNETMASK: case SIOCSIFDSTADDR: case SIOCSIFPFLAGS: case SIOCSIFFLAGS: if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); break; default: if (sk->sk_prot->ioctl) err = sk_ioctl(sk, cmd, (void __user *)arg); else err = -ENOIOCTLCMD; break; } return err; } EXPORT_SYMBOL(inet_ioctl); #ifdef CONFIG_COMPAT static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd, struct compat_rtentry __user *ur) { compat_uptr_t rtdev; struct rtentry rt; if (copy_from_user(&rt.rt_dst, &ur->rt_dst, 3 * sizeof(struct sockaddr)) || get_user(rt.rt_flags, &ur->rt_flags) || get_user(rt.rt_metric, &ur->rt_metric) || get_user(rt.rt_mtu, &ur->rt_mtu) || get_user(rt.rt_window, &ur->rt_window) || get_user(rt.rt_irtt, &ur->rt_irtt) || get_user(rtdev, &ur->rt_dev)) return -EFAULT; rt.rt_dev = compat_ptr(rtdev); return ip_rt_ioctl(sock_net(sk), cmd, &rt); } static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; switch (cmd) { case SIOCADDRT: case SIOCDELRT: return inet_compat_routing_ioctl(sk, cmd, argp); default: if (!sk->sk_prot->compat_ioctl) return -ENOIOCTLCMD; return sk->sk_prot->compat_ioctl(sk, cmd, arg); } } #endif /* CONFIG_COMPAT */ const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, .poll = tcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif .splice_eof = inet_splice_eof, .splice_read = tcp_splice_read, .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif .set_rcvlowat = tcp_set_rcvlowat, }; EXPORT_SYMBOL(inet_stream_ops); const struct proto_ops inet_dgram_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, .poll = udp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .read_skb = udp_read_skb, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = inet_splice_eof, .set_peek_off = udp_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif }; EXPORT_SYMBOL(inet_dgram_ops); /* * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without * udp_poll */ static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, .poll = datagram_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = inet_splice_eof, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif }; static const struct net_proto_family inet_family_ops = { .family = PF_INET, .create = inet_create, .owner = THIS_MODULE, }; /* Upon startup we insert all the elements in inetsw_array[] into * the linked list inetsw. */ static struct inet_protosw inetsw_array[] = { { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, .flags = INET_PROTOSW_PERMANENT, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_ICMP, .prot = &ping_prot, .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, }, { .type = SOCK_RAW, .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, } }; #define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array) void inet_register_protosw(struct inet_protosw *p) { struct list_head *lh; struct inet_protosw *answer; int protocol = p->protocol; struct list_head *last_perm; spin_lock_bh(&inetsw_lock); if (p->type >= SOCK_MAX) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ last_perm = &inetsw[p->type]; list_for_each(lh, &inetsw[p->type]) { answer = list_entry(lh, struct inet_protosw, list); /* Check only the non-wild match. */ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) break; if (protocol == answer->protocol) goto out_permanent; last_perm = lh; } /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with * a wild-card protocol. But it is allowed to override any existing * non-permanent entry. This means that when we remove this entry, the * system automatically returns to the old behavior. */ list_add_rcu(&p->list, last_perm); out: spin_unlock_bh(&inetsw_lock); return; out_permanent: pr_err("Attempt to override permanent protocol %d\n", protocol); goto out; out_illegal: pr_err("Ignoring attempt to register invalid socket type %d\n", p->type); goto out; } EXPORT_SYMBOL(inet_register_protosw); void inet_unregister_protosw(struct inet_protosw *p) { if (INET_PROTOSW_PERMANENT & p->flags) { pr_err("Attempt to unregister permanent protocol %d\n", p->protocol); } else { spin_lock_bh(&inetsw_lock); list_del_rcu(&p->list); spin_unlock_bh(&inetsw_lock); synchronize_net(); } } EXPORT_SYMBOL(inet_unregister_protosw); static int inet_sk_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); __be32 old_saddr = inet->inet_saddr; __be32 daddr = inet->inet_daddr; struct flowi4 *fl4; struct rtable *rt; __be32 new_saddr; struct ip_options_rcu *inet_opt; int err; inet_opt = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; /* Query new route. */ fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if, sk->sk_protocol, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) return PTR_ERR(rt); new_saddr = fl4->saddr; if (new_saddr == old_saddr) { sk_setup_caps(sk, &rt->dst); return 0; } err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET); if (err) { ip_rt_put(rt); return err; } sk_setup_caps(sk, &rt->dst); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) { pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", __func__, &old_saddr, &new_saddr); } /* * XXX The only one ugly spot where we need to * XXX really change the sockets identity after * XXX it has entered the hashes. -DaveM * * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ return __sk_prot_rehash(sk); } int inet_sk_rebuild_header(struct sock *sk) { struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0)); struct inet_sock *inet = inet_sk(sk); struct flowi4 *fl4; int err; /* Route is OK, nothing to do. */ if (rt) return 0; /* Reroute. */ fl4 = &inet->cork.fl.u.ip4; inet_sk_init_flowi4(inet, fl4); rt = ip_route_output_flow(sock_net(sk), fl4, sk); if (!IS_ERR(rt)) { err = 0; sk_setup_caps(sk, &rt->dst); } else { err = PTR_ERR(rt); /* Routing failed... */ sk->sk_route_caps = 0; /* * Other protocols have to map its equivalent state to TCP_SYN_SENT. * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme */ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || (err = inet_sk_reselect_saddr(sk)) != 0) WRITE_ONCE(sk->sk_err_soft, -err); } return err; } EXPORT_SYMBOL(inet_sk_rebuild_header); void inet_sk_set_state(struct sock *sk, int state) { trace_inet_sock_set_state(sk, sk->sk_state, state); sk->sk_state = state; } EXPORT_SYMBOL(inet_sk_set_state); void inet_sk_state_store(struct sock *sk, int newstate) { trace_inet_sock_set_state(sk, sk->sk_state, newstate); smp_store_release(&sk->sk_state, newstate); } struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { bool udpfrag = false, fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; unsigned int offset = 0; struct iphdr *iph; int proto, tot_len; int nhoff; int ihl; int id; skb_reset_network_header(skb); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) goto out; iph = ip_hdr(skb); ihl = iph->ihl * 4; if (ihl < sizeof(*iph)) goto out; id = ntohs(iph->id); proto = iph->protocol; /* Warning: after this point, iph might be no longer valid */ if (unlikely(!pskb_may_pull(skb, ihl))) goto out; __skb_pull(skb, ihl); encap = SKB_GSO_CB(skb)->encap_level > 0; if (encap) features &= skb->dev->hw_enc_features; SKB_GSO_CB(skb)->encap_level += ihl; skb_reset_transport_header(skb); segs = ERR_PTR(-EPROTONOSUPPORT); if (!skb->encapsulation || encap) { udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) goto out; } ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { segs = ops->callbacks.gso_segment(skb, features); if (!segs) skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) goto out; gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); if (udpfrag) { iph->frag_off = htons(offset >> 3); if (skb->next) iph->frag_off |= htons(IP_MF); offset += skb->len - nhoff - ihl; tot_len = skb->len - nhoff; } else if (skb_is_gso(skb)) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; } if (gso_partial) tot_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)iph; else tot_len = skb->len - nhoff; } else { if (!fixedid) iph->id = htons(id++); tot_len = skb->len - nhoff; } iph->tot_len = htons(tot_len); ip_send_check(iph); if (encap) skb_reset_inner_headers(skb); skb->network_header = (u8 *)iph - skb->head; skb_reset_mac_len(skb); } while ((skb = skb->next)); out: return segs; } static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) return ERR_PTR(-EINVAL); return inet_gso_segment(skb, features); } struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; const struct iphdr *iph; struct sk_buff *p; unsigned int hlen; unsigned int off; int flush = 1; int proto; off = skb_gro_offset(skb); hlen = off + sizeof(*iph); iph = skb_gro_header(skb, hlen, off); if (unlikely(!iph)) goto out; proto = iph->protocol; ops = rcu_dereference(inet_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out; if (*(u8 *)iph != 0x45) goto out; if (ip_is_fragment(iph)) goto out; if (unlikely(ip_fast_csum((u8 *)iph, 5))) goto out; NAPI_GRO_CB(skb)->proto = proto; flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF)); list_for_each_entry(p, head, list) { struct iphdr *iph2; if (!NAPI_GRO_CB(p)->same_flow) continue; iph2 = (struct iphdr *)(p->data + off); /* The above works because, with the exception of the top * (inner most) layer, we only aggregate pkts with the same * hdr length so all the hdrs we'll need to verify will start * at the same offset. */ if ((iph->protocol ^ iph2->protocol) | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } NAPI_GRO_CB(skb)->flush |= flush; NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; /* Note : No need to call skb_gro_postpull_rcsum() here, * as we already checked checksum over ipv4 header was 0 */ skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive, ops->callbacks.gro_receive, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } static struct sk_buff *ipip_gro_receive(struct list_head *head, struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } NAPI_GRO_CB(skb)->encap_mark = 1; return inet_gro_receive(head, skb); } #define SECONDS_PER_DAY 86400 /* inet_current_timestamp - Return IP network timestamp * * Return milliseconds since midnight in network byte order. */ __be32 inet_current_timestamp(void) { u32 secs; u32 msecs; struct timespec64 ts; ktime_get_real_ts64(&ts); /* Get secs since midnight. */ (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs); /* Convert to msecs. */ msecs = secs * MSEC_PER_SEC; /* Convert nsec to msec. */ msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC; /* Convert to network byte order. */ return htonl(msecs); } EXPORT_SYMBOL(inet_current_timestamp); int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { unsigned int family = READ_ONCE(sk->sk_family); if (family == AF_INET) return ip_recv_error(sk, msg, len, addr_len); #if IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6) return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); #endif return -EINVAL; } EXPORT_SYMBOL(inet_recv_error); int inet_gro_complete(struct sk_buff *skb, int nhoff) { struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); const struct net_offload *ops; __be16 totlen = iph->tot_len; int proto = iph->protocol; int err = -ENOSYS; if (skb->encapsulation) { skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP)); skb_set_inner_network_header(skb, nhoff); } iph_set_totlen(iph, skb->len - nhoff); csum_replace2(&iph->check, totlen, iph->tot_len); ops = rcu_dereference(inet_offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; /* Only need to add sizeof(*iph) to get to the next hdr below * because any hdr with option will have been flushed in * inet_gro_receive(). */ err = INDIRECT_CALL_2(ops->callbacks.gro_complete, tcp4_gro_complete, udp4_gro_complete, skb, nhoff + sizeof(*iph)); out: return err; } static int ipip_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; return inet_gro_complete(skb, nhoff); } int inet_ctl_sock_create(struct sock **sk, unsigned short family, unsigned short type, unsigned char protocol, struct net *net) { struct socket *sock; int rc = sock_create_kern(net, family, type, protocol, &sock); if (rc == 0) { *sk = sock->sk; (*sk)->sk_allocation = GFP_ATOMIC; (*sk)->sk_use_task_frag = false; /* * Unhash it so that IP input processing does not even see it, * we do not wish this socket to see incoming packets. */ (*sk)->sk_prot->unhash(*sk); } return rc; } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; int i; for_each_possible_cpu(i) res += snmp_get_cpu_field(mib, i, offt); return res; } EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, size_t syncp_offset) { void *bhptr; struct u64_stats_sync *syncp; u64 v; unsigned int start; bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { start = u64_stats_fetch_begin(syncp); v = *(((u64 *)bhptr) + offt); } while (u64_stats_fetch_retry(syncp, start)); return v; } EXPORT_SYMBOL_GPL(snmp_get_cpu_field64); u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) { u64 res = 0; int cpu; for_each_possible_cpu(cpu) { res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset); } return res; } EXPORT_SYMBOL_GPL(snmp_fold_field64); #endif #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, }; #endif static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, .err_handler = icmp_err, .no_policy = 1, }; static __net_init int ipv4_mib_init_net(struct net *net) { int i; net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); if (!net->mib.tcp_statistics) goto err_tcp_mib; net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); if (!net->mib.ip_statistics) goto err_ip_mib; for_each_possible_cpu(i) { struct ipstats_mib *af_inet_stats; af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); u64_stats_init(&af_inet_stats->syncp); } net->mib.net_statistics = alloc_percpu(struct linux_mib); if (!net->mib.net_statistics) goto err_net_mib; net->mib.udp_statistics = alloc_percpu(struct udp_mib); if (!net->mib.udp_statistics) goto err_udp_mib; net->mib.udplite_statistics = alloc_percpu(struct udp_mib); if (!net->mib.udplite_statistics) goto err_udplite_mib; net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); if (!net->mib.icmp_statistics) goto err_icmp_mib; net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), GFP_KERNEL); if (!net->mib.icmpmsg_statistics) goto err_icmpmsg_mib; tcp_mib_init(net); return 0; err_icmpmsg_mib: free_percpu(net->mib.icmp_statistics); err_icmp_mib: free_percpu(net->mib.udplite_statistics); err_udplite_mib: free_percpu(net->mib.udp_statistics); err_udp_mib: free_percpu(net->mib.net_statistics); err_net_mib: free_percpu(net->mib.ip_statistics); err_ip_mib: free_percpu(net->mib.tcp_statistics); err_tcp_mib: return -ENOMEM; } static __net_exit void ipv4_mib_exit_net(struct net *net) { kfree(net->mib.icmpmsg_statistics); free_percpu(net->mib.icmp_statistics); free_percpu(net->mib.udplite_statistics); free_percpu(net->mib.udp_statistics); free_percpu(net->mib.net_statistics); free_percpu(net->mib.ip_statistics); free_percpu(net->mib.tcp_statistics); #ifdef CONFIG_MPTCP /* allocated on demand, see mptcp_init_sock() */ free_percpu(net->mib.mptcp_statistics); #endif } static __net_initdata struct pernet_operations ipv4_mib_ops = { .init = ipv4_mib_init_net, .exit = ipv4_mib_exit_net, }; static int __init init_ipv4_mibs(void) { return register_pernet_subsys(&ipv4_mib_ops); } static __net_init int inet_init_net(struct net *net) { /* * Set defaults for local port range */ net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u; seqlock_init(&net->ipv4.ping_group_range.lock); /* * Sane defaults - nobody may create ping sockets. * Boot scripts should set this to distro-specific group. */ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); /* Default values for sysctl-controlled parameters. * We set them here, in case sysctl is not compiled. */ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; net->ipv4.sysctl_ip_fwd_update_priority = 1; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; net->ipv4.sysctl_tcp_early_demux = 1; net->ipv4.sysctl_nexthop_compat_mode = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif /* Some igmp sysctl, whose values are always used */ net->ipv4.sysctl_igmp_max_memberships = 20; net->ipv4.sysctl_igmp_max_msf = 10; /* IGMP reports for link-local multicast groups are enabled by default */ net->ipv4.sysctl_igmp_llm_reports = 1; net->ipv4.sysctl_igmp_qrv = 2; net->ipv4.sysctl_fib_notify_on_flag_change = 0; return 0; } static __net_initdata struct pernet_operations af_inet_ops = { .init = inet_init_net, }; static int __init init_inet_pernet_ops(void) { return register_pernet_subsys(&af_inet_ops); } static int ipv4_proc_init(void); /* * IP protocol layer initialiser */ static const struct net_offload ipip_offload = { .callbacks = { .gso_segment = ipip_gso_segment, .gro_receive = ipip_gro_receive, .gro_complete = ipip_gro_complete, }, }; static int __init ipip_offload_init(void) { return inet_add_offload(&ipip_offload, IPPROTO_IPIP); } static int __init ipv4_offload_init(void) { /* * Add offloads */ if (udpv4_offload_init() < 0) pr_crit("%s: Cannot add UDP protocol offload\n", __func__); if (tcpv4_offload_init() < 0) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); if (ipip_offload_init() < 0) pr_crit("%s: Cannot add IPIP protocol offload\n", __func__); net_hotdata.ip_packet_offload = (struct packet_offload) { .type = cpu_to_be16(ETH_P_IP), .callbacks = { .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, }, }; dev_add_offload(&net_hotdata.ip_packet_offload); return 0; } fs_initcall(ipv4_offload_init); static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, .list_func = ip_list_rcv, }; static int __init inet_init(void) { struct inet_protosw *q; struct list_head *r; int rc; sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); raw_hashinfo_init(&raw_v4_hashinfo); rc = proto_register(&tcp_prot, 1); if (rc) goto out; rc = proto_register(&udp_prot, 1); if (rc) goto out_unregister_tcp_proto; rc = proto_register(&raw_prot, 1); if (rc) goto out_unregister_udp_proto; rc = proto_register(&ping_prot, 1); if (rc) goto out_unregister_raw_proto; /* * Tell SOCKET that we are alive... */ (void)sock_register(&inet_family_ops); #ifdef CONFIG_SYSCTL ip_static_sysctl_init(); #endif /* * Add all the base protocols. */ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) pr_crit("%s: Cannot add ICMP protocol\n", __func__); net_hotdata.udp_protocol = (struct net_protocol) { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, }; if (inet_add_protocol(&net_hotdata.udp_protocol, IPPROTO_UDP) < 0) pr_crit("%s: Cannot add UDP protocol\n", __func__); net_hotdata.tcp_protocol = (struct net_protocol) { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, .icmp_strict_tag_validation = 1, }; if (inet_add_protocol(&net_hotdata.tcp_protocol, IPPROTO_TCP) < 0) pr_crit("%s: Cannot add TCP protocol\n", __func__); #ifdef CONFIG_IP_MULTICAST if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) pr_crit("%s: Cannot add IGMP protocol\n", __func__); #endif /* Register the socket-side information for inet_create. */ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) inet_register_protosw(q); /* * Set the ARP module up */ arp_init(); /* * Set the IP module up */ ip_init(); /* Initialise per-cpu ipv4 mibs */ if (init_ipv4_mibs()) panic("%s: Cannot init ipv4 mibs\n", __func__); /* Setup TCP slab cache for open requests. */ tcp_init(); /* Setup UDP memory threshold */ udp_init(); /* Add UDP-Lite (RFC 3828) */ udplite4_register(); raw_init(); ping_init(); /* * Set the ICMP layer up */ if (icmp_init() < 0) panic("Failed to create the ICMP control socket.\n"); /* * Initialise the multicast router */ #if defined(CONFIG_IP_MROUTE) if (ip_mr_init()) pr_crit("%s: Cannot init ipv4 mroute\n", __func__); #endif if (init_inet_pernet_ops()) pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); ipv4_proc_init(); ipfrag_init(); dev_add_pack(&ip_packet_type); ip_tunnel_core_init(); rc = 0; out: return rc; out_unregister_raw_proto: proto_unregister(&raw_prot); out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); goto out; } fs_initcall(inet_init); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS static int __init ipv4_proc_init(void) { int rc = 0; if (raw_proc_init()) goto out_raw; if (tcp4_proc_init()) goto out_tcp; if (udp4_proc_init()) goto out_udp; if (ping_proc_init()) goto out_ping; if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: ping_proc_exit(); out_ping: udp4_proc_exit(); out_udp: tcp4_proc_exit(); out_tcp: raw_proc_exit(); out_raw: rc = -ENOMEM; goto out; } #else /* CONFIG_PROC_FS */ static int __init ipv4_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */ |
83 13 41 55 405 50 277 175 175 3 4036 1975 11 8801 8813 2556 362 1120 5 1 41 12275 2682 17 2163 2169 2542 134 16 783 68 22 1 25 12 88 19 590 54 1 24 7 3 1 2 19 1 7 2 43 24 1 3 2120 475 2296 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_NETLINK_H #define __NET_NETLINK_H #include <linux/types.h> #include <linux/netlink.h> #include <linux/jiffies.h> #include <linux/in6.h> /* ======================================================================== * Netlink Messages and Attributes Interface (As Seen On TV) * ------------------------------------------------------------------------ * Messages Interface * ------------------------------------------------------------------------ * * Message Format: * <--- nlmsg_total_size(payload) ---> * <-- nlmsg_msg_size(payload) -> * +----------+- - -+-------------+- - -+-------- - - * | nlmsghdr | Pad | Payload | Pad | nlmsghdr * +----------+- - -+-------------+- - -+-------- - - * nlmsg_data(nlh)---^ ^ * nlmsg_next(nlh)-----------------------+ * * Payload Format: * <---------------------- nlmsg_len(nlh) ---------------------> * <------ hdrlen ------> <- nlmsg_attrlen(nlh, hdrlen) -> * +----------------------+- - -+--------------------------------+ * | Family Header | Pad | Attributes | * +----------------------+- - -+--------------------------------+ * nlmsg_attrdata(nlh, hdrlen)---^ * * Data Structures: * struct nlmsghdr netlink message header * * Message Construction: * nlmsg_new() create a new netlink message * nlmsg_put() add a netlink message to an skb * nlmsg_put_answer() callback based nlmsg_put() * nlmsg_end() finalize netlink message * nlmsg_get_pos() return current position in message * nlmsg_trim() trim part of message * nlmsg_cancel() cancel message construction * nlmsg_consume() free a netlink message (expected) * nlmsg_free() free a netlink message (drop) * * Message Sending: * nlmsg_multicast() multicast message to several groups * nlmsg_unicast() unicast a message to a single socket * nlmsg_notify() send notification message * * Message Length Calculations: * nlmsg_msg_size(payload) length of message w/o padding * nlmsg_total_size(payload) length of message w/ padding * nlmsg_padlen(payload) length of padding at tail * * Message Payload Access: * nlmsg_data(nlh) head of message payload * nlmsg_len(nlh) length of message payload * nlmsg_attrdata(nlh, hdrlen) head of attributes data * nlmsg_attrlen(nlh, hdrlen) length of attributes data * * Message Parsing: * nlmsg_ok(nlh, remaining) does nlh fit into remaining bytes? * nlmsg_next(nlh, remaining) get next netlink message * nlmsg_parse() parse attributes of a message * nlmsg_find_attr() find an attribute in a message * nlmsg_for_each_msg() loop over all messages * nlmsg_validate() validate netlink message incl. attrs * nlmsg_for_each_attr() loop over all attributes * * Misc: * nlmsg_report() report back to application? * * ------------------------------------------------------------------------ * Attributes Interface * ------------------------------------------------------------------------ * * Attribute Format: * <------- nla_total_size(payload) -------> * <---- nla_attr_size(payload) -----> * +----------+- - -+- - - - - - - - - +- - -+-------- - - * | Header | Pad | Payload | Pad | Header * +----------+- - -+- - - - - - - - - +- - -+-------- - - * <- nla_len(nla) -> ^ * nla_data(nla)----^ | * nla_next(nla)-----------------------------' * * Data Structures: * struct nlattr netlink attribute header * * Attribute Construction: * nla_reserve(skb, type, len) reserve room for an attribute * nla_reserve_nohdr(skb, len) reserve room for an attribute w/o hdr * nla_put(skb, type, len, data) add attribute to skb * nla_put_nohdr(skb, len, data) add attribute w/o hdr * nla_append(skb, len, data) append data to skb * * Attribute Construction for Basic Types: * nla_put_u8(skb, type, value) add u8 attribute to skb * nla_put_u16(skb, type, value) add u16 attribute to skb * nla_put_u32(skb, type, value) add u32 attribute to skb * nla_put_u64_64bit(skb, type, * value, padattr) add u64 attribute to skb * nla_put_s8(skb, type, value) add s8 attribute to skb * nla_put_s16(skb, type, value) add s16 attribute to skb * nla_put_s32(skb, type, value) add s32 attribute to skb * nla_put_s64(skb, type, value, * padattr) add s64 attribute to skb * nla_put_string(skb, type, str) add string attribute to skb * nla_put_flag(skb, type) add flag attribute to skb * nla_put_msecs(skb, type, jiffies, * padattr) add msecs attribute to skb * nla_put_in_addr(skb, type, addr) add IPv4 address attribute to skb * nla_put_in6_addr(skb, type, addr) add IPv6 address attribute to skb * * Nested Attributes Construction: * nla_nest_start(skb, type) start a nested attribute * nla_nest_end(skb, nla) finalize a nested attribute * nla_nest_cancel(skb, nla) cancel nested attribute construction * nla_put_empty_nest(skb, type) create an empty nest * * Attribute Length Calculations: * nla_attr_size(payload) length of attribute w/o padding * nla_total_size(payload) length of attribute w/ padding * nla_padlen(payload) length of padding * * Attribute Payload Access: * nla_data(nla) head of attribute payload * nla_len(nla) length of attribute payload * * Attribute Payload Access for Basic Types: * nla_get_uint(nla) get payload for a uint attribute * nla_get_sint(nla) get payload for a sint attribute * nla_get_u8(nla) get payload for a u8 attribute * nla_get_u16(nla) get payload for a u16 attribute * nla_get_u32(nla) get payload for a u32 attribute * nla_get_u64(nla) get payload for a u64 attribute * nla_get_s8(nla) get payload for a s8 attribute * nla_get_s16(nla) get payload for a s16 attribute * nla_get_s32(nla) get payload for a s32 attribute * nla_get_s64(nla) get payload for a s64 attribute * nla_get_flag(nla) return 1 if flag is true * nla_get_msecs(nla) get payload for a msecs attribute * * The same functions also exist with _default(). * * Attribute Misc: * nla_memcpy(dest, nla, count) copy attribute into memory * nla_memcmp(nla, data, size) compare attribute with memory area * nla_strscpy(dst, nla, size) copy attribute to a sized string * nla_strcmp(nla, str) compare attribute with string * * Attribute Parsing: * nla_ok(nla, remaining) does nla fit into remaining bytes? * nla_next(nla, remaining) get next netlink attribute * nla_validate() validate a stream of attributes * nla_validate_nested() validate a stream of nested attributes * nla_find() find attribute in stream of attributes * nla_find_nested() find attribute in nested attributes * nla_parse() parse and validate stream of attrs * nla_parse_nested() parse nested attributes * nla_for_each_attr() loop over all attributes * nla_for_each_attr_type() loop over all attributes with the * given type * nla_for_each_nested() loop over the nested attributes * nla_for_each_nested_type() loop over the nested attributes with * the given type *========================================================================= */ /** * Standard attribute types to specify validation policy */ enum { NLA_UNSPEC, NLA_U8, NLA_U16, NLA_U32, NLA_U64, NLA_STRING, NLA_FLAG, NLA_MSECS, NLA_NESTED, NLA_NESTED_ARRAY, NLA_NUL_STRING, NLA_BINARY, NLA_S8, NLA_S16, NLA_S32, NLA_S64, NLA_BITFIELD32, NLA_REJECT, NLA_BE16, NLA_BE32, NLA_SINT, NLA_UINT, __NLA_TYPE_MAX, }; #define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1) struct netlink_range_validation { u64 min, max; }; struct netlink_range_validation_signed { s64 min, max; }; enum nla_policy_validation { NLA_VALIDATE_NONE, NLA_VALIDATE_RANGE, NLA_VALIDATE_RANGE_WARN_TOO_LONG, NLA_VALIDATE_MIN, NLA_VALIDATE_MAX, NLA_VALIDATE_MASK, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, }; /** * struct nla_policy - attribute validation policy * @type: Type of attribute or NLA_UNSPEC * @validation_type: type of attribute validation done in addition to * type-specific validation (e.g. range, function call), see * &enum nla_policy_validation * @len: Type specific length of payload * * Policies are defined as arrays of this struct, the array must be * accessible by attribute type up to the highest identifier to be expected. * * Meaning of `len' field: * NLA_STRING Maximum length of string * NLA_NUL_STRING Maximum length of string (excluding NUL) * NLA_FLAG Unused * NLA_BINARY Maximum length of attribute payload * (but see also below with the validation type) * NLA_NESTED, * NLA_NESTED_ARRAY Length verification is done by checking len of * nested header (or empty); len field is used if * nested_policy is also used, for the max attr * number in the nested policy. * NLA_SINT, NLA_UINT, * NLA_U8, NLA_U16, * NLA_U32, NLA_U64, * NLA_S8, NLA_S16, * NLA_S32, NLA_S64, * NLA_BE16, NLA_BE32, * NLA_MSECS Leaving the length field zero will verify the * given type fits, using it verifies minimum length * just like "All other" * NLA_BITFIELD32 Unused * NLA_REJECT Unused * All other Minimum length of attribute payload * * Meaning of validation union: * NLA_BITFIELD32 This is a 32-bit bitmap/bitselector attribute and * `bitfield32_valid' is the u32 value of valid flags * NLA_REJECT This attribute is always rejected and `reject_message' * may point to a string to report as the error instead * of the generic one in extended ACK. * NLA_NESTED `nested_policy' to a nested policy to validate, must * also set `len' to the max attribute number. Use the * provided NLA_POLICY_NESTED() macro. * Note that nla_parse() will validate, but of course not * parse, the nested sub-policies. * NLA_NESTED_ARRAY `nested_policy' points to a nested policy to validate, * must also set `len' to the max attribute number. Use * the provided NLA_POLICY_NESTED_ARRAY() macro. * The difference to NLA_NESTED is the structure: * NLA_NESTED has the nested attributes directly inside * while an array has the nested attributes at another * level down and the attribute types directly in the * nesting don't matter. * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64, * NLA_BE16, * NLA_BE32, * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, * NLA_S64 The `min' and `max' fields are used depending on the * validation_type field, if that is min/max/range then * the min, max or both are used (respectively) to check * the value of the integer attribute. * Note that in the interest of code simplicity and * struct size both limits are s16, so you cannot * enforce a range that doesn't fall within the range * of s16 - do that using the NLA_POLICY_FULL_RANGE() * or NLA_POLICY_FULL_RANGE_SIGNED() macros instead. * Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and * NLA_POLICY_RANGE() macros. * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64 If the validation_type field instead is set to * NLA_VALIDATE_RANGE_PTR, `range' must be a pointer * to a struct netlink_range_validation that indicates * the min/max values. * Use NLA_POLICY_FULL_RANGE(). * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, * NLA_S64 If the validation_type field instead is set to * NLA_VALIDATE_RANGE_PTR, `range_signed' must be a * pointer to a struct netlink_range_validation_signed * that indicates the min/max values. * Use NLA_POLICY_FULL_RANGE_SIGNED(). * * NLA_BINARY If the validation type is like the ones for integers * above, then the min/max length (not value like for * integers) of the attribute is enforced. * * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: * NLA_BINARY Validation function called for the attribute. * All other Unused - but note that it's a union * * Example: * * static const u32 myvalidflags = 0xff231023; * * static const struct nla_policy my_policy[ATTR_MAX+1] = { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, * [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)), * [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags), * }; */ struct nla_policy { u8 type; u8 validation_type; u16 len; union { /** * @strict_start_type: first attribute to validate strictly * * This entry is special, and used for the attribute at index 0 * only, and specifies special data about the policy, namely it * specifies the "boundary type" where strict length validation * starts for any attribute types >= this value, also, strict * nesting validation starts here. * * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT * for any types >= this, so need to use NLA_POLICY_MIN_LEN() to * get the previous pure { .len = xyz } behaviour. The advantage * of this is that types not specified in the policy will be * rejected. * * For completely new families it should be set to 1 so that the * validation is enforced for all attributes. For existing ones * it should be set at least when new attributes are added to * the enum used by the policy, and be set to the new value that * was added to enforce strict validation from thereon. */ u16 strict_start_type; /* private: use NLA_POLICY_*() to set */ const u32 bitfield32_valid; const u32 mask; const char *reject_message; const struct nla_policy *nested_policy; const struct netlink_range_validation *range; const struct netlink_range_validation_signed *range_signed; struct { s16 min, max; }; int (*validate)(const struct nlattr *attr, struct netlink_ext_ack *extack); }; }; #define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN) #define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN) #define _NLA_POLICY_NESTED(maxattr, policy) \ { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr } #define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \ { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr } #define NLA_POLICY_NESTED(policy) \ _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_NESTED_ARRAY(policy) \ _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_BITFIELD32(valid) \ { .type = NLA_BITFIELD32, .bitfield32_valid = valid } #define __NLA_IS_UINT_TYPE(tp) \ (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || \ tp == NLA_U64 || tp == NLA_UINT || \ tp == NLA_BE16 || tp == NLA_BE32) #define __NLA_IS_SINT_TYPE(tp) \ (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64 || \ tp == NLA_SINT) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + tp) #define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_SINT_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_SINT_TYPE(tp)) + tp) #define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ __NLA_IS_SINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_NO_VALIDATION_PTR(tp) \ (__NLA_ENSURE(tp != NLA_BITFIELD32 && \ tp != NLA_REJECT && \ tp != NLA_NESTED && \ tp != NLA_NESTED_ARRAY) + tp) #define NLA_POLICY_RANGE(tp, _min, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE, \ .min = _min, \ .max = _max \ } #define NLA_POLICY_FULL_RANGE(tp, _range) { \ .type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE_PTR, \ .range = _range, \ } #define NLA_POLICY_FULL_RANGE_SIGNED(tp, _range) { \ .type = NLA_ENSURE_SINT_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE_PTR, \ .range_signed = _range, \ } #define NLA_POLICY_MIN(tp, _min) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MIN, \ .min = _min, \ } #define NLA_POLICY_MAX(tp, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MAX, \ .max = _max, \ } #define NLA_POLICY_MASK(tp, _mask) { \ .type = NLA_ENSURE_UINT_TYPE(tp), \ .validation_type = NLA_VALIDATE_MASK, \ .mask = _mask, \ } #define NLA_POLICY_VALIDATE_FN(tp, fn, ...) { \ .type = NLA_ENSURE_NO_VALIDATION_PTR(tp), \ .validation_type = NLA_VALIDATE_FUNCTION, \ .validate = fn, \ .len = __VA_ARGS__ + 0, \ } #define NLA_POLICY_EXACT_LEN(_len) NLA_POLICY_RANGE(NLA_BINARY, _len, _len) #define NLA_POLICY_EXACT_LEN_WARN(_len) { \ .type = NLA_BINARY, \ .validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG, \ .min = _len, \ .max = _len \ } #define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len) #define NLA_POLICY_MAX_LEN(_len) NLA_POLICY_MAX(NLA_BINARY, _len) /** * struct nl_info - netlink source information * @nlh: Netlink message header of original request * @nl_net: Network namespace * @portid: Netlink PORTID of requesting application * @skip_notify: Skip netlink notifications to user space * @skip_notify_kernel: Skip selected in-kernel notifications */ struct nl_info { struct nlmsghdr *nlh; struct net *nl_net; u32 portid; u8 skip_notify:1, skip_notify_kernel:1; }; /** * enum netlink_validation - netlink message/attribute validation levels * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about * extra data at the end of the message, attributes being longer than * they should be, or unknown attributes being present. * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing. * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING * this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict(). * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy. * This can safely be set by the kernel when the given policy has no * NLA_UNSPEC anymore, and can thus be used to ensure policy entries * are enforced going forward. * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g. * U8, U16, U32 must have exact size, etc.) * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY) * and unset for other policies. */ enum netlink_validation { NL_VALIDATE_LIBERAL = 0, NL_VALIDATE_TRAILING = BIT(0), NL_VALIDATE_MAXTYPE = BIT(1), NL_VALIDATE_UNSPEC = BIT(2), NL_VALIDATE_STRICT_ATTRS = BIT(3), NL_VALIDATE_NESTED = BIT(4), }; #define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\ NL_VALIDATE_MAXTYPE) #define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\ NL_VALIDATE_MAXTYPE |\ NL_VALIDATE_UNSPEC |\ NL_VALIDATE_STRICT_ATTRS |\ NL_VALIDATE_NESTED) int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)); int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags); int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack); int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack); int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize); char *nla_strdup(const struct nlattr *nla, gfp_t flags); int nla_memcpy(void *dest, const struct nlattr *src, int count); int nla_memcmp(const struct nlattr *nla, const void *data, size_t size); int nla_strcmp(const struct nlattr *nla, const char *str); struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen); struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr); void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen); struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen); struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr); void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen); void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data); void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr); void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data); int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data); int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr); int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data); int nla_append(struct sk_buff *skb, int attrlen, const void *data); /************************************************************************** * Netlink Messages **************************************************************************/ /** * nlmsg_msg_size - length of netlink message not including padding * @payload: length of message payload */ static inline int nlmsg_msg_size(int payload) { return NLMSG_HDRLEN + payload; } /** * nlmsg_total_size - length of netlink message including padding * @payload: length of message payload */ static inline int nlmsg_total_size(int payload) { return NLMSG_ALIGN(nlmsg_msg_size(payload)); } /** * nlmsg_padlen - length of padding at the message's tail * @payload: length of message payload */ static inline int nlmsg_padlen(int payload) { return nlmsg_total_size(payload) - nlmsg_msg_size(payload); } /** * nlmsg_data - head of message payload * @nlh: netlink message header */ static inline void *nlmsg_data(const struct nlmsghdr *nlh) { return (unsigned char *) nlh + NLMSG_HDRLEN; } /** * nlmsg_len - length of message payload * @nlh: netlink message header */ static inline int nlmsg_len(const struct nlmsghdr *nlh) { return nlh->nlmsg_len - NLMSG_HDRLEN; } /** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header */ static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh, int hdrlen) { unsigned char *data = nlmsg_data(nlh); return (struct nlattr *) (data + NLMSG_ALIGN(hdrlen)); } /** * nlmsg_attrlen - length of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header */ static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen) { return nlmsg_len(nlh) - NLMSG_ALIGN(hdrlen); } /** * nlmsg_ok - check if the netlink message fits into the remaining bytes * @nlh: netlink message header * @remaining: number of bytes remaining in message stream */ static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining) { return (remaining >= (int) sizeof(struct nlmsghdr) && nlh->nlmsg_len >= sizeof(struct nlmsghdr) && nlh->nlmsg_len <= remaining); } /** * nlmsg_next - next netlink message in message stream * @nlh: netlink message header * @remaining: number of bytes remaining in message stream * * Returns: the next netlink message in the message stream and * decrements remaining by the size of the current message. */ static inline struct nlmsghdr * nlmsg_next(const struct nlmsghdr *nlh, int *remaining) { int totlen = NLMSG_ALIGN(nlh->nlmsg_len); *remaining -= totlen; return (struct nlmsghdr *) ((unsigned char *) nlh + totlen); } /** * nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be rejected, policy must be specified, attributes * will be validated in the strictest way possible. * * Returns: 0 on success or a negative error code. */ static inline int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_STRICT, extack); } /** * nla_parse_deprecated - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be ignored and attributes from the policy are not * always strictly validated (only for new attributes). * * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be rejected as well as trailing data, but the * policy is not completely strictly validated (only for new attributes). * * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_DEPRECATED_STRICT, extack); } /** * __nlmsg_parse - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * See nla_parse() */ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) { NL_SET_ERR_MSG(extack, "Invalid header length"); return -EINVAL; } return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy, validate, extack); } /** * nlmsg_parse - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse() */ static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * nlmsg_parse_deprecated - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated() */ static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nlmsg_parse_deprecated_strict - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated_strict() */ static inline int nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_DEPRECATED_STRICT, extack); } /** * nlmsg_find_attr - find a specific attribute in a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @attrtype: type of attribute to look for * * Returns: the first attribute which matches the specified type. */ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, int hdrlen, int attrtype) { return nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), attrtype); } /** * nla_validate_deprecated - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in liberal mode. * See documentation of struct nla_policy for more details. * * Returns: 0 on success or a negative error code. */ static inline int nla_validate_deprecated(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in strict mode. * See documentation of struct nla_policy for more details. * * Returns: 0 on success or a negative error code. */ static inline int nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * nlmsg_validate_deprecated - validate a netlink message including attributes * @nlh: netlinket message header * @hdrlen: length of family specific header * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh, int hdrlen, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; return __nla_validate(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nlmsg_report - need to report back to application? * @nlh: netlink message header * * Returns: 1 if a report back to the application is requested. */ static inline int nlmsg_report(const struct nlmsghdr *nlh) { return nlh ? !!(nlh->nlmsg_flags & NLM_F_ECHO) : 0; } /** * nlmsg_seq - return the seq number of netlink message * @nlh: netlink message header * * Returns: 0 if netlink message is NULL */ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) { return nlh ? nlh->nlmsg_seq : 0; } /** * nlmsg_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @nlh: netlink message header * @hdrlen: length of family specific header * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \ nlmsg_attrlen(nlh, hdrlen), rem) /** * nlmsg_put - Add a new netlink message to an skb * @skb: socket buffer to store message in * @portid: netlink PORTID of requesting application * @seq: sequence number of message * @type: message type * @payload: length of message payload * @flags: message flags * * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int payload, int flags) { if (unlikely(skb_tailroom(skb) < nlmsg_total_size(payload))) return NULL; return __nlmsg_put(skb, portid, seq, type, payload, flags); } /** * nlmsg_append - Add more data to a nlmsg in a skb * @skb: socket buffer to store message in * @size: length of message payload * * Append data to an existing nlmsg, used when constructing a message * with multiple fixed-format headers (which is rare). * Returns: NULL if the tailroom of the skb is insufficient to store * the extra payload. */ static inline void *nlmsg_append(struct sk_buff *skb, u32 size) { if (unlikely(skb_tailroom(skb) < NLMSG_ALIGN(size))) return NULL; if (NLMSG_ALIGN(size) - size) memset(skb_tail_pointer(skb) + size, 0, NLMSG_ALIGN(size) - size); return __skb_put(skb, NLMSG_ALIGN(size)); } /** * nlmsg_put_answer - Add a new callback based netlink message to an skb * @skb: socket buffer to store message in * @cb: netlink callback * @type: message type * @payload: length of message payload * @flags: message flags * * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb, struct netlink_callback *cb, int type, int payload, int flags) { return nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, type, payload, flags); } /** * nlmsg_new - Allocate a new netlink message * @payload: size of the message payload * @flags: the type of memory to allocate. * * Use NLMSG_DEFAULT_SIZE if the size of the payload isn't known * and a good default is needed. */ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) { return alloc_skb(nlmsg_total_size(payload), flags); } /** * nlmsg_new_large - Allocate a new netlink message with non-contiguous * physical memory * @payload: size of the message payload * * The allocated skb is unable to have frag page for shinfo->frags*, * as the NULL setting for skb->head in netlink_skb_destructor() will * bypass most of the handling in skb_release_data() */ static inline struct sk_buff *nlmsg_new_large(size_t payload) { return netlink_alloc_large_skb(nlmsg_total_size(payload), 0); } /** * nlmsg_end - Finalize a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header * * Corrects the netlink message header to include the appended * attributes. Only necessary if attributes have been added to * the message. */ static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) { nlh->nlmsg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; } /** * nlmsg_get_pos - return current position in netlink message * @skb: socket buffer the message is stored in * * Returns: a pointer to the current tail of the message. */ static inline void *nlmsg_get_pos(struct sk_buff *skb) { return skb_tail_pointer(skb); } /** * nlmsg_trim - Trim message to a mark * @skb: socket buffer the message is stored in * @mark: mark to trim to * * Trims the message to the provided mark. */ static inline void nlmsg_trim(struct sk_buff *skb, const void *mark) { if (mark) { WARN_ON((unsigned char *) mark < skb->data); skb_trim(skb, (unsigned char *) mark - skb->data); } } /** * nlmsg_cancel - Cancel construction of a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header * * Removes the complete netlink message including all * attributes from the socket buffer again. */ static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh) { nlmsg_trim(skb, nlh); } /** * nlmsg_free - drop a netlink message * @skb: socket buffer of netlink message */ static inline void nlmsg_free(struct sk_buff *skb) { kfree_skb(skb); } /** * nlmsg_consume - free a netlink message * @skb: socket buffer of netlink message */ static inline void nlmsg_consume(struct sk_buff *skb) { consume_skb(skb); } /** * nlmsg_multicast_filtered - multicast a netlink message with filter function * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags * @filter: filter function * @filter_data: filter function private data * * Return: 0 on success, negative error code for failure. */ static inline int nlmsg_multicast_filtered(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags, netlink_filter_fn filter, void *filter_data) { int err; NETLINK_CB(skb).dst_group = group; err = netlink_broadcast_filtered(sk, skb, portid, group, flags, filter, filter_data); if (err > 0) err = 0; return err; } /** * nlmsg_multicast - multicast a netlink message * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags */ static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { return nlmsg_multicast_filtered(sk, skb, portid, group, flags, NULL, NULL); } /** * nlmsg_unicast - unicast a netlink message * @sk: netlink socket to spread message to * @skb: netlink message as socket buffer * @portid: netlink portid of the destination socket */ static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 portid) { int err; err = netlink_unicast(sk, skb, portid, MSG_DONTWAIT); if (err > 0) err = 0; return err; } /** * nlmsg_for_each_msg - iterate over a stream of messages * @pos: loop counter, set to current message * @head: head of message stream * @len: length of message stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_msg(pos, head, len, rem) \ for (pos = head, rem = len; \ nlmsg_ok(pos, rem); \ pos = nlmsg_next(pos, &(rem))) /** * nl_dump_check_consistent - check if sequence is consistent and advertise if not * @cb: netlink callback structure that stores the sequence number * @nlh: netlink message header to write the flag to * * This function checks if the sequence (generation) number changed during dump * and if it did, advertises it in the netlink message header. * * The correct way to use it is to set cb->seq to the generation counter when * all locks for dumping have been acquired, and then call this function for * each message that is generated. * * Note that due to initialisation concerns, 0 is an invalid sequence number * and must not be used by code that uses this functionality. */ static inline void nl_dump_check_consistent(struct netlink_callback *cb, struct nlmsghdr *nlh) { if (cb->prev_seq && cb->seq != cb->prev_seq) nlh->nlmsg_flags |= NLM_F_DUMP_INTR; cb->prev_seq = cb->seq; } /************************************************************************** * Netlink Attributes **************************************************************************/ /** * nla_attr_size - length of attribute not including padding * @payload: length of payload */ static inline int nla_attr_size(int payload) { return NLA_HDRLEN + payload; } /** * nla_total_size - total length of attribute including padding * @payload: length of payload */ static inline int nla_total_size(int payload) { return NLA_ALIGN(nla_attr_size(payload)); } /** * nla_padlen - length of padding at the tail of attribute * @payload: length of payload */ static inline int nla_padlen(int payload) { return nla_total_size(payload) - nla_attr_size(payload); } /** * nla_type - attribute type * @nla: netlink attribute */ static inline int nla_type(const struct nlattr *nla) { return nla->nla_type & NLA_TYPE_MASK; } /** * nla_data - head of payload * @nla: netlink attribute */ static inline void *nla_data(const struct nlattr *nla) { return (char *) nla + NLA_HDRLEN; } /** * nla_len - length of payload * @nla: netlink attribute */ static inline u16 nla_len(const struct nlattr *nla) { return nla->nla_len - NLA_HDRLEN; } /** * nla_ok - check if the netlink attribute fits into the remaining bytes * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream */ static inline int nla_ok(const struct nlattr *nla, int remaining) { return remaining >= (int) sizeof(*nla) && nla->nla_len >= sizeof(*nla) && nla->nla_len <= remaining; } /** * nla_next - next netlink attribute in attribute stream * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream * * Returns: the next netlink attribute in the attribute stream and * decrements remaining by the size of the current attribute. */ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) { unsigned int totlen = NLA_ALIGN(nla->nla_len); *remaining -= totlen; return (struct nlattr *) ((char *) nla + totlen); } /** * nla_find_nested - find attribute in a set of nested attributes * @nla: attribute containing the nested attributes * @attrtype: type of attribute to look for * * Returns: the first attribute which matches the specified type. */ static inline struct nlattr * nla_find_nested(const struct nlattr *nla, int attrtype) { return nla_find(nla_data(nla), nla_len(nla), attrtype); } /** * nla_parse_nested - parse nested attributes * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @nla: attribute containing the nested attributes * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse() */ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, const struct nlattr *nla, const struct nla_policy *policy, struct netlink_ext_ack *extack) { if (!(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing"); return -EINVAL; } return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, NL_VALIDATE_STRICT, extack); } /** * nla_parse_nested_deprecated - parse nested attributes * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @nla: attribute containing the nested attributes * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated() */ static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype, const struct nlattr *nla, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_put_u8 - Add a u8 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value) { /* temporary variables to work around GCC PR81715 with asan-stack=1 */ u8 tmp = value; return nla_put(skb, attrtype, sizeof(u8), &tmp); } /** * nla_put_u16 - Add a u16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value) { u16 tmp = value; return nla_put(skb, attrtype, sizeof(u16), &tmp); } /** * nla_put_be16 - Add a __be16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value) { __be16 tmp = value; return nla_put(skb, attrtype, sizeof(__be16), &tmp); } /** * nla_put_net16 - Add 16-bit network byte order netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value) { __be16 tmp = value; return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** * nla_put_le16 - Add a __le16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value) { __le16 tmp = value; return nla_put(skb, attrtype, sizeof(__le16), &tmp); } /** * nla_put_u32 - Add a u32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) { u32 tmp = value; return nla_put(skb, attrtype, sizeof(u32), &tmp); } /** * nla_put_uint - Add a variable-size unsigned int to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_uint(struct sk_buff *skb, int attrtype, u64 value) { u64 tmp64 = value; u32 tmp32 = value; if (tmp64 == tmp32) return nla_put_u32(skb, attrtype, tmp32); return nla_put(skb, attrtype, sizeof(u64), &tmp64); } /** * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value) { __be32 tmp = value; return nla_put(skb, attrtype, sizeof(__be32), &tmp); } /** * nla_put_net32 - Add 32-bit network byte order netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value) { __be32 tmp = value; return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** * nla_put_le32 - Add a __le32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value) { __le32 tmp = value; return nla_put(skb, attrtype, sizeof(__le32), &tmp); } /** * nla_put_u64_64bit - Add a u64 netlink attribute to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype, u64 value, int padattr) { u64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** * nla_put_be64 - Add a __be64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { __be64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(__be64), &tmp, padattr); } /** * nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { __be64 tmp = value; return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, tmp, padattr); } /** * nla_put_le64 - Add a __le64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value, int padattr) { __le64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(__le64), &tmp, padattr); } /** * nla_put_s8 - Add a s8 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value) { s8 tmp = value; return nla_put(skb, attrtype, sizeof(s8), &tmp); } /** * nla_put_s16 - Add a s16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value) { s16 tmp = value; return nla_put(skb, attrtype, sizeof(s16), &tmp); } /** * nla_put_s32 - Add a s32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value) { s32 tmp = value; return nla_put(skb, attrtype, sizeof(s32), &tmp); } /** * nla_put_s64 - Add a s64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value, int padattr) { s64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(s64), &tmp, padattr); } /** * nla_put_sint - Add a variable-size signed int to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_sint(struct sk_buff *skb, int attrtype, s64 value) { s64 tmp64 = value; s32 tmp32 = value; if (tmp64 == tmp32) return nla_put_s32(skb, attrtype, tmp32); return nla_put(skb, attrtype, sizeof(s64), &tmp64); } /** * nla_put_string - Add a string netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @str: NUL terminated string */ static inline int nla_put_string(struct sk_buff *skb, int attrtype, const char *str) { return nla_put(skb, attrtype, strlen(str) + 1, str); } /** * nla_put_flag - Add a flag netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type */ static inline int nla_put_flag(struct sk_buff *skb, int attrtype) { return nla_put(skb, attrtype, 0, NULL); } /** * nla_put_msecs - Add a msecs netlink attribute to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @njiffies: number of jiffies to convert to msecs * @padattr: attribute type for the padding */ static inline int nla_put_msecs(struct sk_buff *skb, int attrtype, unsigned long njiffies, int padattr) { u64 tmp = jiffies_to_msecs(njiffies); return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** * nla_put_in_addr - Add an IPv4 address netlink attribute to a socket * buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @addr: IPv4 address */ static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype, __be32 addr) { __be32 tmp = addr; return nla_put_be32(skb, attrtype, tmp); } /** * nla_put_in6_addr - Add an IPv6 address netlink attribute to a socket * buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @addr: IPv6 address */ static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype, const struct in6_addr *addr) { return nla_put(skb, attrtype, sizeof(*addr), addr); } /** * nla_put_bitfield32 - Add a bitfield32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: value carrying bits * @selector: selector of valid bits */ static inline int nla_put_bitfield32(struct sk_buff *skb, int attrtype, __u32 value, __u32 selector) { struct nla_bitfield32 tmp = { value, selector, }; return nla_put(skb, attrtype, sizeof(tmp), &tmp); } /** * nla_get_u32 - return payload of u32 attribute * @nla: u32 netlink attribute */ static inline u32 nla_get_u32(const struct nlattr *nla) { return *(u32 *) nla_data(nla); } /** * nla_get_u32_default - return payload of u32 attribute or default * @nla: u32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u32 nla_get_u32_default(const struct nlattr *nla, u32 defvalue) { if (!nla) return defvalue; return nla_get_u32(nla); } /** * nla_get_be32 - return payload of __be32 attribute * @nla: __be32 netlink attribute */ static inline __be32 nla_get_be32(const struct nlattr *nla) { return *(__be32 *) nla_data(nla); } /** * nla_get_be32_default - return payload of be32 attribute or default * @nla: __be32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be32 nla_get_be32_default(const struct nlattr *nla, __be32 defvalue) { if (!nla) return defvalue; return nla_get_be32(nla); } /** * nla_get_le32 - return payload of __le32 attribute * @nla: __le32 netlink attribute */ static inline __le32 nla_get_le32(const struct nlattr *nla) { return *(__le32 *) nla_data(nla); } /** * nla_get_le32_default - return payload of le32 attribute or default * @nla: __le32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __le32 nla_get_le32_default(const struct nlattr *nla, __le32 defvalue) { if (!nla) return defvalue; return nla_get_le32(nla); } /** * nla_get_u16 - return payload of u16 attribute * @nla: u16 netlink attribute */ static inline u16 nla_get_u16(const struct nlattr *nla) { return *(u16 *) nla_data(nla); } /** * nla_get_u16_default - return payload of u16 attribute or default * @nla: u16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u16 nla_get_u16_default(const struct nlattr *nla, u16 defvalue) { if (!nla) return defvalue; return nla_get_u16(nla); } /** * nla_get_be16 - return payload of __be16 attribute * @nla: __be16 netlink attribute */ static inline __be16 nla_get_be16(const struct nlattr *nla) { return *(__be16 *) nla_data(nla); } /** * nla_get_be16_default - return payload of be16 attribute or default * @nla: __be16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be16 nla_get_be16_default(const struct nlattr *nla, __be16 defvalue) { if (!nla) return defvalue; return nla_get_be16(nla); } /** * nla_get_le16 - return payload of __le16 attribute * @nla: __le16 netlink attribute */ static inline __le16 nla_get_le16(const struct nlattr *nla) { return *(__le16 *) nla_data(nla); } /** * nla_get_le16_default - return payload of le16 attribute or default * @nla: __le16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __le16 nla_get_le16_default(const struct nlattr *nla, __le16 defvalue) { if (!nla) return defvalue; return nla_get_le16(nla); } /** * nla_get_u8 - return payload of u8 attribute * @nla: u8 netlink attribute */ static inline u8 nla_get_u8(const struct nlattr *nla) { return *(u8 *) nla_data(nla); } /** * nla_get_u8_default - return payload of u8 attribute or default * @nla: u8 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u8 nla_get_u8_default(const struct nlattr *nla, u8 defvalue) { if (!nla) return defvalue; return nla_get_u8(nla); } /** * nla_get_u64 - return payload of u64 attribute * @nla: u64 netlink attribute */ static inline u64 nla_get_u64(const struct nlattr *nla) { u64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_u64_default - return payload of u64 attribute or default * @nla: u64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u64 nla_get_u64_default(const struct nlattr *nla, u64 defvalue) { if (!nla) return defvalue; return nla_get_u64(nla); } /** * nla_get_uint - return payload of uint attribute * @nla: uint netlink attribute */ static inline u64 nla_get_uint(const struct nlattr *nla) { if (nla_len(nla) == sizeof(u32)) return nla_get_u32(nla); return nla_get_u64(nla); } /** * nla_get_uint_default - return payload of uint attribute or default * @nla: uint netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u64 nla_get_uint_default(const struct nlattr *nla, u64 defvalue) { if (!nla) return defvalue; return nla_get_uint(nla); } /** * nla_get_be64 - return payload of __be64 attribute * @nla: __be64 netlink attribute */ static inline __be64 nla_get_be64(const struct nlattr *nla) { __be64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_be64_default - return payload of be64 attribute or default * @nla: __be64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be64 nla_get_be64_default(const struct nlattr *nla, __be64 defvalue) { if (!nla) return defvalue; return nla_get_be64(nla); } /** * nla_get_le64 - return payload of __le64 attribute * @nla: __le64 netlink attribute */ static inline __le64 nla_get_le64(const struct nlattr *nla) { return *(__le64 *) nla_data(nla); } /** * nla_get_le64_default - return payload of le64 attribute or default * @nla: __le64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __le64 nla_get_le64_default(const struct nlattr *nla, __le64 defvalue) { if (!nla) return defvalue; return nla_get_le64(nla); } /** * nla_get_s32 - return payload of s32 attribute * @nla: s32 netlink attribute */ static inline s32 nla_get_s32(const struct nlattr *nla) { return *(s32 *) nla_data(nla); } /** * nla_get_s32_default - return payload of s32 attribute or default * @nla: s32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s32 nla_get_s32_default(const struct nlattr *nla, s32 defvalue) { if (!nla) return defvalue; return nla_get_s32(nla); } /** * nla_get_s16 - return payload of s16 attribute * @nla: s16 netlink attribute */ static inline s16 nla_get_s16(const struct nlattr *nla) { return *(s16 *) nla_data(nla); } /** * nla_get_s16_default - return payload of s16 attribute or default * @nla: s16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s16 nla_get_s16_default(const struct nlattr *nla, s16 defvalue) { if (!nla) return defvalue; return nla_get_s16(nla); } /** * nla_get_s8 - return payload of s8 attribute * @nla: s8 netlink attribute */ static inline s8 nla_get_s8(const struct nlattr *nla) { return *(s8 *) nla_data(nla); } /** * nla_get_s8_default - return payload of s8 attribute or default * @nla: s8 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s8 nla_get_s8_default(const struct nlattr *nla, s8 defvalue) { if (!nla) return defvalue; return nla_get_s8(nla); } /** * nla_get_s64 - return payload of s64 attribute * @nla: s64 netlink attribute */ static inline s64 nla_get_s64(const struct nlattr *nla) { s64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_s64_default - return payload of s64 attribute or default * @nla: s64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s64 nla_get_s64_default(const struct nlattr *nla, s64 defvalue) { if (!nla) return defvalue; return nla_get_s64(nla); } /** * nla_get_sint - return payload of uint attribute * @nla: uint netlink attribute */ static inline s64 nla_get_sint(const struct nlattr *nla) { if (nla_len(nla) == sizeof(s32)) return nla_get_s32(nla); return nla_get_s64(nla); } /** * nla_get_sint_default - return payload of sint attribute or default * @nla: sint netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s64 nla_get_sint_default(const struct nlattr *nla, s64 defvalue) { if (!nla) return defvalue; return nla_get_sint(nla); } /** * nla_get_flag - return payload of flag attribute * @nla: flag netlink attribute */ static inline int nla_get_flag(const struct nlattr *nla) { return !!nla; } /** * nla_get_msecs - return payload of msecs attribute * @nla: msecs netlink attribute * * Returns: the number of milliseconds in jiffies. */ static inline unsigned long nla_get_msecs(const struct nlattr *nla) { u64 msecs = nla_get_u64(nla); return msecs_to_jiffies((unsigned long) msecs); } /** * nla_get_msecs_default - return payload of msecs attribute or default * @nla: msecs netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline unsigned long nla_get_msecs_default(const struct nlattr *nla, unsigned long defvalue) { if (!nla) return defvalue; return nla_get_msecs(nla); } /** * nla_get_in_addr - return payload of IPv4 address attribute * @nla: IPv4 address netlink attribute */ static inline __be32 nla_get_in_addr(const struct nlattr *nla) { return *(__be32 *) nla_data(nla); } /** * nla_get_in_addr_default - return payload of be32 attribute or default * @nla: IPv4 address netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be32 nla_get_in_addr_default(const struct nlattr *nla, __be32 defvalue) { if (!nla) return defvalue; return nla_get_in_addr(nla); } /** * nla_get_in6_addr - return payload of IPv6 address attribute * @nla: IPv6 address netlink attribute */ static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla) { struct in6_addr tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_bitfield32 - return payload of 32 bitfield attribute * @nla: nla_bitfield32 attribute */ static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla) { struct nla_bitfield32 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_memdup - duplicate attribute memory (kmemdup) * @src: netlink attribute to duplicate from * @gfp: GFP mask */ static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp) { return kmemdup_noprof(nla_data(src), nla_len(src), gfp); } #define nla_memdup(...) alloc_hooks(nla_memdup_noprof(__VA_ARGS__)) /** * nla_nest_start_noflag - Start a new level of nested attributes * @skb: socket buffer to add attributes to * @attrtype: attribute type of container * * This function exists for backward compatibility to use in APIs which never * marked their nest attributes with NLA_F_NESTED flag. New APIs should use * nla_nest_start() which sets the flag. * * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, int attrtype) { struct nlattr *start = (struct nlattr *)skb_tail_pointer(skb); if (nla_put(skb, attrtype, 0, NULL) < 0) return NULL; return start; } /** * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED * @skb: socket buffer to add attributes to * @attrtype: attribute type of container * * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED * flag. This is the preferred function to use in new code. * * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) { return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED); } /** * nla_nest_end - Finalize nesting of attributes * @skb: socket buffer the attributes are stored in * @start: container attribute * * Corrects the container attribute header to include the all * appended attributes. * * Returns: the total data length of the skb. */ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start) { start->nla_len = skb_tail_pointer(skb) - (unsigned char *)start; return skb->len; } /** * nla_nest_cancel - Cancel nesting of attributes * @skb: socket buffer the message is stored in * @start: container attribute * * Removes the container attribute and including all nested * attributes. Returns -EMSGSIZE */ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) { nlmsg_trim(skb, start); } /** * nla_put_empty_nest - Create an empty nest * @skb: socket buffer the message is stored in * @attrtype: attribute type of the container * * This function is a helper for creating empty nests. * * Returns: 0 when successful or -EMSGSIZE on failure. */ static inline int nla_put_empty_nest(struct sk_buff *skb, int attrtype) { return nla_nest_start(skb, attrtype) ? 0 : -EMSGSIZE; } /** * __nla_validate_nested - Validate a stream of nested attributes * @start: container attribute * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the nested attribute stream against the * specified policy. Attributes with a type exceeding maxtype will be * ignored. See documentation of struct nla_policy for more details. * * Returns: 0 on success or a negative error code. */ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate(nla_data(start), nla_len(start), maxtype, policy, validate, extack); } static inline int nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate_nested(start, maxtype, policy, NL_VALIDATE_STRICT, extack); } static inline int nla_validate_nested_deprecated(const struct nlattr *start, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate_nested(start, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute * @skb: socket buffer the message is stored in * * Return: true if padding is needed to align the next attribute (nla_data()) to * a 64-bit aligned area. */ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) { #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS /* The nlattr header is 4 bytes in size, that's why we test * if the skb->data _is_ aligned. A NOP attribute, plus * nlattr header for next attribute, will make nla_data() * 8-byte aligned. */ if (IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8)) return true; #endif return false; } /** * nla_align_64bit - 64-bit align the nla_data() of next attribute * @skb: socket buffer the message is stored in * @padattr: attribute type for the padding * * Conditionally emit a padding netlink attribute in order to make * the next attribute we emit have a 64-bit aligned nla_data() area. * This will only be done in architectures which do not have * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined. * * Returns: zero on success or a negative error code. */ static inline int nla_align_64bit(struct sk_buff *skb, int padattr) { if (nla_need_padding_for_64bit(skb) && !nla_reserve(skb, padattr, 0)) return -EMSGSIZE; return 0; } /** * nla_total_size_64bit - total length of attribute including padding * @payload: length of payload */ static inline int nla_total_size_64bit(int payload) { return NLA_ALIGN(nla_attr_size(payload)) #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + NLA_ALIGN(nla_attr_size(0)) #endif ; } /** * nla_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @head: head of attribute stream * @len: length of attribute stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_attr(pos, head, len, rem) \ for (pos = head, rem = len; \ nla_ok(pos, rem); \ pos = nla_next(pos, &(rem))) /** * nla_for_each_attr_type - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @type: required attribute type for @pos * @head: head of attribute stream * @len: length of attribute stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_attr_type(pos, type, head, len, rem) \ nla_for_each_attr(pos, head, len, rem) \ if (nla_type(pos) == type) /** * nla_for_each_nested - iterate over nested attributes * @pos: loop counter, set to current attribute * @nla: attribute containing the nested attributes * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_nested(pos, nla, rem) \ nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem) /** * nla_for_each_nested_type - iterate over nested attributes * @pos: loop counter, set to current attribute * @type: required attribute type for @pos * @nla: attribute containing the nested attributes * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_nested_type(pos, type, nla, rem) \ nla_for_each_nested(pos, nla, rem) \ if (nla_type(pos) == type) /** * nla_is_last - Test if attribute is last in stream * @nla: attribute to test * @rem: bytes remaining in stream */ static inline bool nla_is_last(const struct nlattr *nla, int rem) { return nla->nla_len == rem; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range); void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range); struct netlink_policy_dump_state; int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate, const struct nla_policy *policy, unsigned int maxtype); int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state, const struct nla_policy *policy, unsigned int maxtype); bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state); int netlink_policy_dump_write(struct sk_buff *skb, struct netlink_policy_dump_state *state); int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt); int netlink_policy_dump_write_attr(struct sk_buff *skb, const struct nla_policy *pt, int nestattr); void netlink_policy_dump_free(struct netlink_policy_dump_state *state); #endif |
19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich */ #ifndef _NET_BATMAN_ADV_BLA_H_ #define _NET_BATMAN_ADV_BLA_H_ #include "main.h" #include <linux/compiler.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> /** * batadv_bla_is_loopdetect_mac() - check if the mac address is from a loop * detect frame sent by bridge loop avoidance * @mac: mac address to check * * Return: true if the it looks like a loop detect frame * (mac starts with BA:BE), false otherwise */ static inline bool batadv_bla_is_loopdetect_mac(const uint8_t *mac) { if (mac[0] == 0xba && mac[1] == 0xbe) return true; return false; } #ifdef CONFIG_BATMAN_ADV_BLA bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int packet_type); bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid); bool batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size); int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb); bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid); bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb); void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct batadv_hard_iface *oldif); void batadv_bla_status_update(struct net_device *net_dev); int batadv_bla_init(struct batadv_priv *bat_priv); void batadv_bla_free(struct batadv_priv *bat_priv); #ifdef CONFIG_BATMAN_ADV_DAT bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); #endif #define BATADV_BLA_CRC_INIT 0 #else /* ifdef CONFIG_BATMAN_ADV_BLA */ static inline bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int packet_type) { return false; } static inline bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { return false; } static inline bool batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size) { return false; } static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid) { return false; } static inline bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { return false; } static inline void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct batadv_hard_iface *oldif) { } static inline int batadv_bla_init(struct batadv_priv *bat_priv) { return 1; } static inline void batadv_bla_free(struct batadv_priv *bat_priv) { } static inline int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) { return -EOPNOTSUPP; } static inline int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) { return -EOPNOTSUPP; } static inline bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid) { return true; } #endif /* ifdef CONFIG_BATMAN_ADV_BLA */ #endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */ |
163 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tlb #if !defined(_TRACE_TLB_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TLB_H #include <linux/mm_types.h> #include <linux/tracepoint.h> #define TLB_FLUSH_REASON \ EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \ EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \ EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \ EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \ EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" ) /* * First define the enums in TLB_FLUSH_REASON to be exported to userspace * via TRACE_DEFINE_ENUM(). */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); TLB_FLUSH_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } TRACE_EVENT(tlb_flush, TP_PROTO(int reason, unsigned long pages), TP_ARGS(reason, pages), TP_STRUCT__entry( __field( int, reason) __field(unsigned long, pages) ), TP_fast_assign( __entry->reason = reason; __entry->pages = pages; ), TP_printk("pages:%ld reason:%s (%d)", __entry->pages, __print_symbolic(__entry->reason, TLB_FLUSH_REASON), __entry->reason) ); #endif /* _TRACE_TLB_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
155 156 155 158 158 157 9 46 4 5 19 17 1 455 458 456 456 36 4 29 27 455 455 458 11 5 6 11 1 3 7 7 98 100 100 100 14 11 99 28 27 28 1 28 28 34 1 2 2 1 30 30 30 29 1 30 2 8 15 5 12 8 19 1 28 6 65 7 34 22 19 150 1 1 2 84 9 9 61 107 21 29 18 118 117 113 5 118 118 2 10 2 8 129 44 33 13 81 6 129 129 39 90 116 99 34 64 13 4 5 72 118 18 146 5 11 16 11 12 13 2 10 1 11 10 15 1 1 6 3 4 7 7 5 2 6 1 9 5 1 1 1 1 2 1 2 279 273 5 1 1 4 40 37 6 22 1 2 2 18 4 132 132 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * RAW - implementation of IP "raw" sockets. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Alan Cox : verify_area() fixed up * Alan Cox : ICMP error handling * Alan Cox : EMSGSIZE if you send too big a packet * Alan Cox : Now uses generic datagrams and shared * skbuff library. No more peek crashes, * no more backlogs * Alan Cox : Checks sk->broadcast. * Alan Cox : Uses skb_free_datagram/skb_copy_datagram * Alan Cox : Raw passes ip options too * Alan Cox : Setsocketopt added * Alan Cox : Fixed error return for broadcasts * Alan Cox : Removed wake_up calls * Alan Cox : Use ttl/tos * Alan Cox : Cleaned up old debugging * Alan Cox : Use new kernel side addresses * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. * Alan Cox : BSD style RAW socket demultiplexing. * Alan Cox : Beginnings of mrouted support. * Alan Cox : Added IP_HDRINCL option. * Alan Cox : Skip broadcast check if BSDism set. * David S. Miller : New socket lookup architecture. */ #include <linux/types.h> #include <linux/atomic.h> #include <asm/byteorder.h> #include <asm/current.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/sockios.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/mroute.h> #include <linux/netdevice.h> #include <linux/in_route.h> #include <linux/route.h> #include <linux/skbuff.h> #include <linux/igmp.h> #include <net/net_namespace.h> #include <net/dst.h> #include <net/sock.h> #include <linux/ip.h> #include <linux/net.h> #include <net/ip.h> #include <net/icmp.h> #include <net/udp.h> #include <net/raw.h> #include <net/snmp.h> #include <net/tcp_states.h> #include <net/inet_common.h> #include <net/checksum.h> #include <net/xfrm.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/uio.h> struct raw_frag_vec { struct msghdr *msg; union { struct icmphdr icmph; char c[1]; } hdr; int hlen; }; struct raw_hashinfo raw_v4_hashinfo; EXPORT_SYMBOL_GPL(raw_v4_hashinfo); int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *hlist; hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)]; spin_lock(&h->lock); sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); spin_unlock(&h->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } EXPORT_SYMBOL_GPL(raw_hash_sk); void raw_unhash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; spin_lock(&h->lock); if (sk_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_unhash_sk); bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num, __be32 raddr, __be32 laddr, int dif, int sdif) { const struct inet_sock *inet = inet_sk(sk); if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return true; return false; } EXPORT_SYMBOL_GPL(raw_v4_match); /* * 0 - deliver * 1 - block */ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) { struct icmphdr _hdr; const struct icmphdr *hdr; hdr = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_hdr), &_hdr); if (!hdr) return 1; if (hdr->type < 32) { __u32 data = raw_sk(sk)->filter.data; return ((1U << hdr->type) & data) != 0; } /* Do not block unknown ICMP types */ return 0; } /* IP input processing comes here for RAW socket delivery. * Caller owns SKB, so we must make clones. * * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ static int raw_v4_input(struct net *net, struct sk_buff *skb, const struct iphdr *iph, int hash) { int sdif = inet_sdif(skb); struct hlist_head *hlist; int dif = inet_iif(skb); int delivered = 0; struct sock *sk; hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { if (!raw_v4_match(net, sk, iph->protocol, iph->saddr, iph->daddr, dif, sdif)) continue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); continue; } delivered = 1; if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && ip_mc_sf_allow(sk, iph->daddr, iph->saddr, skb->dev->ifindex, sdif)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ if (clone) raw_rcv(sk, clone); } } rcu_read_unlock(); return delivered; } int raw_local_deliver(struct sk_buff *skb, int protocol) { struct net *net = dev_net(skb->dev); return raw_v4_input(net, skb, ip_hdr(skb), raw_hashfunc(net, protocol)); } static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) { struct inet_sock *inet = inet_sk(sk); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; int harderr = 0; bool recverr; int err = 0; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_sk_update_pmtu(skb, sk, info); else if (type == ICMP_REDIRECT) { ipv4_sk_redirect(skb, sk); return; } /* Report error on raw socket, if: 1. User requested ip_recverr. 2. Socket is connected (otherwise the error indication is useless without ip_recverr and error is hard. */ recverr = inet_test_bit(RECVERR, sk); if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: return; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: err = EHOSTUNREACH; if (code > NR_ICMP_UNREACH) break; if (code == ICMP_FRAG_NEEDED) { harderr = READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT; err = EMSGSIZE; } else { err = icmp_err_convert[code].errno; harderr = icmp_err_convert[code].fatal; } } if (recverr) { const struct iphdr *iph = (const struct iphdr *)skb->data; u8 *payload = skb->data + (iph->ihl << 2); if (inet_test_bit(HDRINCL, sk)) payload = skb->data; ip_icmp_error(sk, skb, err, 0, info, payload); } if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } } void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) { struct net *net = dev_net(skb->dev); int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); struct hlist_head *hlist; const struct iphdr *iph; struct sock *sk; int hash; hash = raw_hashfunc(net, protocol); hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { iph = (const struct iphdr *)skb->data; if (!raw_v4_match(net, sk, iph->protocol, iph->daddr, iph->saddr, dif, sdif)) continue; raw_err(sk, skb, info); } rcu_read_unlock(); } static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; /* Charge it to the socket. */ ipv4_pktinfo_prepare(sk, skb, true); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { sk_skb_reason_drop(sk, skb, reason); return NET_RX_DROP; } return NET_RX_SUCCESS; } int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); skb_push(skb, -skb_network_offset(skb)); raw_rcv_skb(sk, skb); return 0; } static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, struct msghdr *msg, size_t length, struct rtable **rtp, unsigned int flags, const struct sockcm_cookie *sockc) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct iphdr *iph; struct sk_buff *skb; unsigned int iphlen; int err; struct rtable *rt = *rtp; int hlen, tlen; if (length > rt->dst.dev->mtu) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, rt->dst.dev->mtu); return -EMSGSIZE; } if (length < sizeof(struct iphdr)) return -EINVAL; if (flags&MSG_PROBE) goto out; hlen = LL_RESERVED_SPACE(rt->dst.dev); tlen = rt->dst.dev->needed_tailroom; skb = sock_alloc_send_skb(sk, length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); if (!skb) goto error; skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IP); skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_dst_set(skb, &rt->dst); *rtp = NULL; skb_reset_network_header(skb); iph = ip_hdr(skb); skb_put(skb, length); skb->ip_summed = CHECKSUM_NONE; skb_setup_tx_timestamp(skb, sockc); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); skb->transport_header = skb->network_header; err = -EFAULT; if (memcpy_from_msg(iph, msg, length)) goto error_free; iphlen = iph->ihl * 4; /* * We don't want to modify the ip header, but we do need to * be sure that it won't cause problems later along the network * stack. Specifically we want to make sure that iph->ihl is a * sane value. If ihl points beyond the length of the buffer passed * in, reject the frame as invalid */ err = -EINVAL; if (iphlen > length) goto error_free; if (iphlen >= sizeof(*iph)) { if (!iph->saddr) iph->saddr = fl4->saddr; iph->check = 0; iph->tot_len = htons(length); if (!iph->id) ip_select_ident(net, skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); skb->transport_header += iphlen; if (iph->protocol == IPPROTO_ICMP && length >= iphlen + sizeof(struct icmphdr)) icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); } err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) goto error; out: return 0; error_free: kfree_skb(skb); error: IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; return err; } static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4) { int err; if (fl4->flowi4_proto != IPPROTO_ICMP) return 0; /* We only need the first two bytes. */ rfv->hlen = 2; err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen); if (err) return err; fl4->fl4_icmp_type = rfv->hdr.icmph.type; fl4->fl4_icmp_code = rfv->hdr.icmph.code; return 0; } static int raw_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct raw_frag_vec *rfv = from; if (offset < rfv->hlen) { int copy = min(rfv->hlen - offset, len); if (skb->ip_summed == CHECKSUM_PARTIAL) memcpy(to, rfv->hdr.c + offset, copy); else skb->csum = csum_block_add( skb->csum, csum_partial_copy_nocheck(rfv->hdr.c + offset, to, copy), odd); odd = 0; offset += copy; to += copy; len -= copy; if (!len) return 0; } offset -= rfv->hlen; return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; u8 scope; int free = 0; __be32 daddr; __be32 saddr; int uc_index, err; struct ip_options_data opt_copy; struct raw_frag_vec rfv; int hdrincl; err = -EMSGSIZE; if (len > 0xFFFF) goto out; hdrincl = inet_test_bit(HDRINCL, sk); /* * Check the flags. */ err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ goto out; /* compatibility */ /* * Get and verify the address. */ if (msg->msg_namelen) { DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); err = -EINVAL; if (msg->msg_namelen < sizeof(*usin)) goto out; if (usin->sin_family != AF_INET) { pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n", __func__, current->comm); err = -EAFNOSUPPORT; if (usin->sin_family) goto out; } daddr = usin->sin_addr.s_addr; /* ANK: I did not forget to get protocol from port field. * I just do not know, who uses this weirdness. * IP_HDRINCL is much more convenient. */ } else { err = -EDESTADDRREQ; if (sk->sk_state != TCP_ESTABLISHED) goto out; daddr = inet->inet_daddr; } ipcm_init_sk(&ipc, inet); /* Keep backward compat */ if (hdrincl) ipc.protocol = IPPROTO_RAW; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); goto out; } if (ipc.opt) free = 1; } saddr = ipc.addr; ipc.addr = daddr; if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(&opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = &opt_copy.opt; } rcu_read_unlock(); } if (ipc.opt) { err = -EINVAL; /* Linux does not mangle headers on raw sockets, * so that IP options + IP_HDRINCL is non-sense. */ if (hdrincl) goto done; if (ipc.opt->opt.srr) { if (!daddr) goto done; daddr = ipc.opt->opt.faddr; } } scope = ip_sendmsg_scope(inet, &ipc, msg); uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); } else if (!ipc.oif) { ipc.oif = uc_index; } else if (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast * and uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. */ if (ipc.oif != uc_index && ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), uc_index)) { ipc.oif = uc_index; } } flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); fl4.fl4_icmp_type = 0; fl4.fl4_icmp_code = 0; if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; err = raw_probe_proto_opt(&rfv, &fl4); if (err) goto done; } security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto done; } err = -EACCES; if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) goto done; if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: if (hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, &rt, msg->msg_flags, &ipc.sockc); else { if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); err = ip_append_data(sk, &fl4, raw_getfrag, &rfv, len, 0, &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) { err = ip_push_pending_frames(sk, &fl4); if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) err = 0; } release_sock(sk); } done: if (free) kfree(ipc.opt); ip_rt_put(rt); out: if (err < 0) return err; return len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static void raw_close(struct sock *sk, long timeout) { /* * Raw sockets may have direct kernel references. Kill them. */ ip_ra_control(sk, 0, NULL); sk_common_release(sk); } static void raw_destroy(struct sock *sk) { lock_sock(sk); ip_flush_pending_frames(sk); release_sock(sk); } /* This gets rid of all the nasties in af_inet. -DaveM */ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; struct net *net = sock_net(sk); u32 tb_id = RT_TABLE_LOCAL; int ret = -EINVAL; int chk_addr_ret; lock_sock(sk); if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; if (sk->sk_bound_dev_if) tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); ret = -EADDRNOTAVAIL; if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, chk_addr_ret)) goto out; inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); ret = 0; out: release_sock(sk); return ret; } /* * This should be easy, if there is something there * we return it, otherwise we block. */ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) { err = ip_recv_error(sk, msg, len, addr_len); goto out; } skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_cmsgs(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; } static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen) { if (optlen > sizeof(struct icmp_filter)) optlen = sizeof(struct icmp_filter); if (copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; } static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) { int len, ret = -EFAULT; if (get_user(len, optlen)) goto out; ret = -EINVAL; if (len < 0) goto out; if (len > sizeof(struct icmp_filter)) len = sizeof(struct icmp_filter); ret = -EFAULT; if (put_user(len, optlen) || copy_to_user(optval, &raw_sk(sk)->filter, len)) goto out; ret = 0; out: return ret; } static int do_raw_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_seticmpfilter(sk, optval, optlen); } return -ENOPROTOOPT; } static int raw_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level != SOL_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); return do_raw_setsockopt(sk, optname, optval, optlen); } static int do_raw_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { if (optname == ICMP_FILTER) { if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_geticmpfilter(sk, optval, optlen); } return -ENOPROTOOPT; } static int raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level != SOL_RAW) return ip_getsockopt(sk, level, optname, optval, optlen); return do_raw_getsockopt(sk, optname, optval, optlen); } static int raw_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { struct sk_buff *skb; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) *karg = skb->len; else *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; } default: #ifdef CONFIG_IP_MROUTE return ipmr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif } } #ifdef CONFIG_COMPAT static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { switch (cmd) { case SIOCOUTQ: case SIOCINQ: return -ENOIOCTLCMD; default: #ifdef CONFIG_IP_MROUTE return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg)); #else return -ENOIOCTLCMD; #endif } } #endif int raw_abort(struct sock *sk, int err) { lock_sock(sk); sk->sk_err = err; sk_error_report(sk); __udp_disconnect(sk, 0); release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(raw_abort); struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, .close = raw_close, .destroy = raw_destroy, .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .ioctl = raw_ioctl, .init = raw_sk_init, .setsockopt = raw_setsockopt, .getsockopt = raw_getsockopt, .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .bind = raw_bind, .backlog_rcv = raw_rcv_skb, .release_cb = ip4_datagram_release_cb, .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw_sock), .useroffset = offsetof(struct raw_sock, filter), .usersize = sizeof_field(struct raw_sock, filter), .h.raw_hash = &raw_v4_hashinfo, #ifdef CONFIG_COMPAT .compat_ioctl = compat_raw_ioctl, #endif .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS static struct sock *raw_get_first(struct seq_file *seq, int bucket) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); struct hlist_head *hlist; struct sock *sk; for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE; ++state->bucket) { hlist = &h->ht[state->bucket]; sk_for_each(sk, hlist) { if (sock_net(sk) == seq_file_net(seq)) return sk; } } return NULL; } static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) { struct raw_iter_state *state = raw_seq_private(seq); do { sk = sk_next(sk); } while (sk && sock_net(sk) != seq_file_net(seq)); if (!sk) return raw_get_first(seq, state->bucket + 1); return sk; } static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = raw_get_first(seq, 0); if (sk) while (pos && (sk = raw_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *raw_seq_start(struct seq_file *seq, loff_t *pos) __acquires(&h->lock) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); spin_lock(&h->lock); return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } EXPORT_SYMBOL_GPL(raw_seq_start); void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = raw_get_first(seq, 0); else sk = raw_get_next(seq, v); ++*pos; return sk; } EXPORT_SYMBOL_GPL(raw_seq_next); void raw_seq_stop(struct seq_file *seq, void *v) __releases(&h->lock) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_seq_stop); static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr, src = inet->inet_rcv_saddr; __u16 destp = 0, srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", i, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } static int raw_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops\n"); else raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); return 0; } static const struct seq_operations raw_seq_ops = { .start = raw_seq_start, .next = raw_seq_next, .stop = raw_seq_stop, .show = raw_seq_show, }; static __net_init int raw_init_net(struct net *net) { if (!proc_create_net_data("raw", 0444, net->proc_net, &raw_seq_ops, sizeof(struct raw_iter_state), &raw_v4_hashinfo)) return -ENOMEM; return 0; } static __net_exit void raw_exit_net(struct net *net) { remove_proc_entry("raw", net->proc_net); } static __net_initdata struct pernet_operations raw_net_ops = { .init = raw_init_net, .exit = raw_exit_net, }; int __init raw_proc_init(void) { return register_pernet_subsys(&raw_net_ops); } void __init raw_proc_exit(void) { unregister_pernet_subsys(&raw_net_ops); } #endif /* CONFIG_PROC_FS */ static void raw_sysctl_init_net(struct net *net) { #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_raw_l3mdev_accept = 1; #endif } static int __net_init raw_sysctl_init(struct net *net) { raw_sysctl_init_net(net); return 0; } static struct pernet_operations __net_initdata raw_sysctl_ops = { .init = raw_sysctl_init, }; void __init raw_init(void) { raw_sysctl_init_net(&init_net); if (register_pernet_subsys(&raw_sysctl_ops)) panic("RAW: failed to init sysctl parameters.\n"); } |
1 6 1 2 2 1 46 46 45 45 45 30 44 38 2 3 34 11 1 3 14 4 6 2 1 2 1 1 1 4 2 1 6 1 1 2 1 1 1 3 1 4 3 1 1 3 2 2 2 1 2 1 1 1 3 1 1 2 2 8 4 2 1 2 11 35 46 46 46 45 45 46 46 46 46 45 46 46 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH * Copyright (C) 2018-2024 Intel Corporation * * element parsing for mac80211 */ #include <net/mac80211.h> #include <linux/netdevice.h> #include <linux/export.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/bitmap.h> #include <linux/crc32.h> #include <net/net_namespace.h> #include <net/cfg80211.h> #include <net/rtnetlink.h> #include <kunit/visibility.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "mesh.h" #include "wme.h" #include "led.h" #include "wep.h" struct ieee80211_elems_parse { /* must be first for kfree to work */ struct ieee802_11_elems elems; /* The basic Multi-Link element in the original elements */ const struct element *ml_basic_elem; /* The reconfiguration Multi-Link element in the original elements */ const struct element *ml_reconf_elem; /* The EPCS Multi-Link element in the original elements */ const struct element *ml_epcs_elem; /* * scratch buffer that can be used for various element parsing related * tasks, e.g., element de-fragmentation etc. */ size_t scratch_len; u8 *scratch_pos; u8 scratch[] __counted_by(scratch_len); }; static void ieee80211_parse_extension_element(u32 *crc, const struct element *elem, struct ieee80211_elems_parse *elems_parse, struct ieee80211_elems_parse_params *params) { struct ieee802_11_elems *elems = &elems_parse->elems; const void *data = elem->data + 1; bool calc_crc = false; u8 len; if (!elem->datalen) return; len = elem->datalen - 1; switch (elem->data[0]) { case WLAN_EID_EXT_HE_MU_EDCA: if (params->mode < IEEE80211_CONN_MODE_HE) break; calc_crc = true; if (len >= sizeof(*elems->mu_edca_param_set)) elems->mu_edca_param_set = data; break; case WLAN_EID_EXT_HE_CAPABILITY: if (params->mode < IEEE80211_CONN_MODE_HE) break; if (ieee80211_he_capa_size_ok(data, len)) { elems->he_cap = data; elems->he_cap_len = len; } break; case WLAN_EID_EXT_HE_OPERATION: if (params->mode < IEEE80211_CONN_MODE_HE) break; calc_crc = true; if (len >= sizeof(*elems->he_operation) && len >= ieee80211_he_oper_size(data) - 1) elems->he_operation = data; break; case WLAN_EID_EXT_UORA: if (params->mode < IEEE80211_CONN_MODE_HE) break; if (len >= 1) elems->uora_element = data; break; case WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME: if (len == 3) elems->max_channel_switch_time = data; break; case WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION: if (len >= sizeof(*elems->mbssid_config_ie)) elems->mbssid_config_ie = data; break; case WLAN_EID_EXT_HE_SPR: if (params->mode < IEEE80211_CONN_MODE_HE) break; if (len >= sizeof(*elems->he_spr) && len >= ieee80211_he_spr_size(data) - 1) elems->he_spr = data; break; case WLAN_EID_EXT_HE_6GHZ_CAPA: if (params->mode < IEEE80211_CONN_MODE_HE) break; if (len >= sizeof(*elems->he_6ghz_capa)) elems->he_6ghz_capa = data; break; case WLAN_EID_EXT_EHT_CAPABILITY: if (params->mode < IEEE80211_CONN_MODE_EHT) break; if (ieee80211_eht_capa_size_ok(elems->he_cap, data, len, params->from_ap)) { elems->eht_cap = data; elems->eht_cap_len = len; } break; case WLAN_EID_EXT_EHT_OPERATION: if (params->mode < IEEE80211_CONN_MODE_EHT) break; if (ieee80211_eht_oper_size_ok(data, len)) elems->eht_operation = data; calc_crc = true; break; case WLAN_EID_EXT_EHT_MULTI_LINK: if (params->mode < IEEE80211_CONN_MODE_EHT) break; calc_crc = true; if (ieee80211_mle_size_ok(data, len)) { const struct ieee80211_multi_link_elem *mle = (void *)data; switch (le16_get_bits(mle->control, IEEE80211_ML_CONTROL_TYPE)) { case IEEE80211_ML_CONTROL_TYPE_BASIC: if (elems_parse->ml_basic_elem) { elems->parse_error |= IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC; break; } elems_parse->ml_basic_elem = elem; break; case IEEE80211_ML_CONTROL_TYPE_RECONF: elems_parse->ml_reconf_elem = elem; break; case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: elems_parse->ml_epcs_elem = elem; break; default: break; } } break; case WLAN_EID_EXT_BANDWIDTH_INDICATION: if (params->mode < IEEE80211_CONN_MODE_EHT) break; if (ieee80211_bandwidth_indication_size_ok(data, len)) elems->bandwidth_indication = data; calc_crc = true; break; case WLAN_EID_EXT_TID_TO_LINK_MAPPING: if (params->mode < IEEE80211_CONN_MODE_EHT) break; calc_crc = true; if (ieee80211_tid_to_link_map_size_ok(data, len) && elems->ttlm_num < ARRAY_SIZE(elems->ttlm)) { elems->ttlm[elems->ttlm_num] = (void *)data; elems->ttlm_num++; } break; } if (crc && calc_crc) *crc = crc32_be(*crc, (void *)elem, elem->datalen + 2); } static void ieee80211_parse_tpe(struct ieee80211_parsed_tpe *tpe, const u8 *data, u8 len) { const struct ieee80211_tx_pwr_env *env = (const void *)data; u8 count, interpret, category; u8 *out, N, *cnt_out = NULL, *N_out = NULL; if (!ieee80211_valid_tpe_element(data, len)) return; count = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_COUNT); interpret = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_INTERPRET); category = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_CATEGORY); switch (interpret) { case IEEE80211_TPE_LOCAL_EIRP: out = tpe->max_local[category].power; cnt_out = &tpe->max_local[category].count; tpe->max_local[category].valid = true; break; case IEEE80211_TPE_REG_CLIENT_EIRP: out = tpe->max_reg_client[category].power; cnt_out = &tpe->max_reg_client[category].count; tpe->max_reg_client[category].valid = true; break; case IEEE80211_TPE_LOCAL_EIRP_PSD: out = tpe->psd_local[category].power; cnt_out = &tpe->psd_local[category].count; N_out = &tpe->psd_local[category].n; tpe->psd_local[category].valid = true; break; case IEEE80211_TPE_REG_CLIENT_EIRP_PSD: out = tpe->psd_reg_client[category].power; cnt_out = &tpe->psd_reg_client[category].count; N_out = &tpe->psd_reg_client[category].n; tpe->psd_reg_client[category].valid = true; break; } switch (interpret) { case IEEE80211_TPE_LOCAL_EIRP: case IEEE80211_TPE_REG_CLIENT_EIRP: /* count was validated <= 3, plus 320 MHz */ BUILD_BUG_ON(IEEE80211_TPE_EIRP_ENTRIES_320MHZ < 5); memcpy(out, env->variable, count + 1); *cnt_out = count + 1; /* separately take 320 MHz if present */ if (count == 3 && len > sizeof(*env) + count + 1) { out[4] = env->variable[4]; *cnt_out = 5; } break; case IEEE80211_TPE_LOCAL_EIRP_PSD: case IEEE80211_TPE_REG_CLIENT_EIRP_PSD: if (!count) { memset(out, env->variable[0], IEEE80211_TPE_PSD_ENTRIES_320MHZ); *cnt_out = IEEE80211_TPE_PSD_ENTRIES_320MHZ; break; } N = 1 << (count - 1); memcpy(out, env->variable, N); *cnt_out = N; *N_out = N; if (len > sizeof(*env) + N) { int K = u8_get_bits(env->variable[N], IEEE80211_TX_PWR_ENV_EXT_COUNT); K = min(K, IEEE80211_TPE_PSD_ENTRIES_320MHZ - N); memcpy(out + N, env->variable + N + 1, K); (*cnt_out) += K; } break; } } static u32 _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, struct ieee80211_elems_parse *elems_parse, const struct element *check_inherit) { struct ieee802_11_elems *elems = &elems_parse->elems; const struct element *elem; bool calc_crc = params->filter != 0; DECLARE_BITMAP(seen_elems, 256); u32 crc = params->crc; bitmap_zero(seen_elems, 256); for_each_element(elem, params->start, params->len) { const struct element *subelem; u8 elem_parse_failed; u8 id = elem->id; u8 elen = elem->datalen; const u8 *pos = elem->data; if (check_inherit && !cfg80211_is_element_inherited(elem, check_inherit)) continue; switch (id) { case WLAN_EID_SSID: case WLAN_EID_SUPP_RATES: case WLAN_EID_FH_PARAMS: case WLAN_EID_DS_PARAMS: case WLAN_EID_CF_PARAMS: case WLAN_EID_TIM: case WLAN_EID_IBSS_PARAMS: case WLAN_EID_CHALLENGE: case WLAN_EID_RSN: case WLAN_EID_ERP_INFO: case WLAN_EID_EXT_SUPP_RATES: case WLAN_EID_HT_CAPABILITY: case WLAN_EID_HT_OPERATION: case WLAN_EID_VHT_CAPABILITY: case WLAN_EID_VHT_OPERATION: case WLAN_EID_MESH_ID: case WLAN_EID_MESH_CONFIG: case WLAN_EID_PEER_MGMT: case WLAN_EID_PREQ: case WLAN_EID_PREP: case WLAN_EID_PERR: case WLAN_EID_RANN: case WLAN_EID_CHANNEL_SWITCH: case WLAN_EID_EXT_CHANSWITCH_ANN: case WLAN_EID_COUNTRY: case WLAN_EID_PWR_CONSTRAINT: case WLAN_EID_TIMEOUT_INTERVAL: case WLAN_EID_SECONDARY_CHANNEL_OFFSET: case WLAN_EID_WIDE_BW_CHANNEL_SWITCH: case WLAN_EID_CHAN_SWITCH_PARAM: case WLAN_EID_EXT_CAPABILITY: case WLAN_EID_CHAN_SWITCH_TIMING: case WLAN_EID_LINK_ID: case WLAN_EID_BSS_MAX_IDLE_PERIOD: case WLAN_EID_RSNX: case WLAN_EID_S1G_BCN_COMPAT: case WLAN_EID_S1G_CAPABILITIES: case WLAN_EID_S1G_OPERATION: case WLAN_EID_AID_RESPONSE: case WLAN_EID_S1G_SHORT_BCN_INTERVAL: /* * not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible * that if the content gets bigger it might be needed more than once */ if (test_bit(id, seen_elems)) { elems->parse_error |= IEEE80211_PARSE_ERR_DUP_ELEM; continue; } break; } if (calc_crc && id < 64 && (params->filter & (1ULL << id))) crc = crc32_be(crc, pos - 2, elen + 2); elem_parse_failed = 0; switch (id) { case WLAN_EID_LINK_ID: if (elen + 2 < sizeof(struct ieee80211_tdls_lnkie)) { elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; } elems->lnk_id = (void *)(pos - 2); break; case WLAN_EID_CHAN_SWITCH_TIMING: if (elen < sizeof(struct ieee80211_ch_switch_timing)) { elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; } elems->ch_sw_timing = (void *)pos; break; case WLAN_EID_EXT_CAPABILITY: elems->ext_capab = pos; elems->ext_capab_len = elen; break; case WLAN_EID_SSID: elems->ssid = pos; elems->ssid_len = elen; break; case WLAN_EID_SUPP_RATES: elems->supp_rates = pos; elems->supp_rates_len = elen; break; case WLAN_EID_DS_PARAMS: if (elen >= 1) elems->ds_params = pos; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_TIM: if (elen >= sizeof(struct ieee80211_tim_ie)) { elems->tim = (void *)pos; elems->tim_len = elen; } else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_VENDOR_SPECIFIC: if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 && pos[2] == 0xf2) { /* Microsoft OUI (00:50:F2) */ if (calc_crc) crc = crc32_be(crc, pos - 2, elen + 2); if (elen >= 5 && pos[3] == 2) { /* OUI Type 2 - WMM IE */ if (pos[4] == 0) { elems->wmm_info = pos; elems->wmm_info_len = elen; } else if (pos[4] == 1) { elems->wmm_param = pos; elems->wmm_param_len = elen; } } } break; case WLAN_EID_RSN: elems->rsn = pos; elems->rsn_len = elen; break; case WLAN_EID_ERP_INFO: if (elen >= 1) elems->erp_info = pos; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_EXT_SUPP_RATES: elems->ext_supp_rates = pos; elems->ext_supp_rates_len = elen; break; case WLAN_EID_HT_CAPABILITY: if (params->mode < IEEE80211_CONN_MODE_HT) break; if (elen >= sizeof(struct ieee80211_ht_cap)) elems->ht_cap_elem = (void *)pos; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_HT_OPERATION: if (params->mode < IEEE80211_CONN_MODE_HT) break; if (elen >= sizeof(struct ieee80211_ht_operation)) elems->ht_operation = (void *)pos; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_VHT_CAPABILITY: if (params->mode < IEEE80211_CONN_MODE_VHT) break; if (elen >= sizeof(struct ieee80211_vht_cap)) elems->vht_cap_elem = (void *)pos; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_VHT_OPERATION: if (params->mode < IEEE80211_CONN_MODE_VHT) break; if (elen >= sizeof(struct ieee80211_vht_operation)) { elems->vht_operation = (void *)pos; if (calc_crc) crc = crc32_be(crc, pos - 2, elen + 2); break; } elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_OPMODE_NOTIF: if (params->mode < IEEE80211_CONN_MODE_VHT) break; if (elen > 0) { elems->opmode_notif = pos; if (calc_crc) crc = crc32_be(crc, pos - 2, elen + 2); break; } elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_MESH_ID: elems->mesh_id = pos; elems->mesh_id_len = elen; break; case WLAN_EID_MESH_CONFIG: if (elen >= sizeof(struct ieee80211_meshconf_ie)) elems->mesh_config = (void *)pos; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_PEER_MGMT: elems->peering = pos; elems->peering_len = elen; break; case WLAN_EID_MESH_AWAKE_WINDOW: if (elen >= 2) elems->awake_window = (void *)pos; break; case WLAN_EID_PREQ: elems->preq = pos; elems->preq_len = elen; break; case WLAN_EID_PREP: elems->prep = pos; elems->prep_len = elen; break; case WLAN_EID_PERR: elems->perr = pos; elems->perr_len = elen; break; case WLAN_EID_RANN: if (elen >= sizeof(struct ieee80211_rann_ie)) elems->rann = (void *)pos; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_CHANNEL_SWITCH: if (elen != sizeof(struct ieee80211_channel_sw_ie)) { elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; } elems->ch_switch_ie = (void *)pos; break; case WLAN_EID_EXT_CHANSWITCH_ANN: if (elen != sizeof(struct ieee80211_ext_chansw_ie)) { elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; } elems->ext_chansw_ie = (void *)pos; break; case WLAN_EID_SECONDARY_CHANNEL_OFFSET: if (params->mode < IEEE80211_CONN_MODE_HT) break; if (elen != sizeof(struct ieee80211_sec_chan_offs_ie)) { elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; } elems->sec_chan_offs = (void *)pos; break; case WLAN_EID_CHAN_SWITCH_PARAM: if (elen < sizeof(*elems->mesh_chansw_params_ie)) { elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; } elems->mesh_chansw_params_ie = (void *)pos; break; case WLAN_EID_WIDE_BW_CHANNEL_SWITCH: if (params->mode < IEEE80211_CONN_MODE_VHT) break; if (!params->action) { elem_parse_failed = IEEE80211_PARSE_ERR_UNEXPECTED_ELEM; break; } if (elen < sizeof(*elems->wide_bw_chansw_ie)) { elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; } elems->wide_bw_chansw_ie = (void *)pos; break; case WLAN_EID_CHANNEL_SWITCH_WRAPPER: if (params->mode < IEEE80211_CONN_MODE_VHT) break; if (params->action) { elem_parse_failed = IEEE80211_PARSE_ERR_UNEXPECTED_ELEM; break; } /* * This is a bit tricky, but as we only care about * a few elements, parse them out manually. */ subelem = cfg80211_find_elem(WLAN_EID_WIDE_BW_CHANNEL_SWITCH, pos, elen); if (subelem) { if (subelem->datalen >= sizeof(*elems->wide_bw_chansw_ie)) elems->wide_bw_chansw_ie = (void *)subelem->data; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; } if (params->mode < IEEE80211_CONN_MODE_EHT) break; subelem = cfg80211_find_ext_elem(WLAN_EID_EXT_BANDWIDTH_INDICATION, pos, elen); if (subelem) { const void *edata = subelem->data + 1; u8 edatalen = subelem->datalen - 1; if (ieee80211_bandwidth_indication_size_ok(edata, edatalen)) elems->bandwidth_indication = edata; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; } subelem = cfg80211_find_ext_elem(WLAN_EID_TX_POWER_ENVELOPE, pos, elen); if (subelem) ieee80211_parse_tpe(&elems->csa_tpe, subelem->data + 1, subelem->datalen - 1); break; case WLAN_EID_COUNTRY: elems->country_elem = pos; elems->country_elem_len = elen; break; case WLAN_EID_PWR_CONSTRAINT: if (elen != 1) { elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; } elems->pwr_constr_elem = pos; break; case WLAN_EID_CISCO_VENDOR_SPECIFIC: /* Lots of different options exist, but we only care * about the Dynamic Transmit Power Control element. * First check for the Cisco OUI, then for the DTPC * tag (0x00). */ if (elen < 4) { elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; } if (pos[0] != 0x00 || pos[1] != 0x40 || pos[2] != 0x96 || pos[3] != 0x00) break; if (elen != 6) { elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; } if (calc_crc) crc = crc32_be(crc, pos - 2, elen + 2); elems->cisco_dtpc_elem = pos; break; case WLAN_EID_ADDBA_EXT: if (elen < sizeof(struct ieee80211_addba_ext_ie)) { elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; } elems->addba_ext_ie = (void *)pos; break; case WLAN_EID_TIMEOUT_INTERVAL: if (elen >= sizeof(struct ieee80211_timeout_interval_ie)) elems->timeout_int = (void *)pos; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_BSS_MAX_IDLE_PERIOD: if (elen >= sizeof(*elems->max_idle_period_ie)) elems->max_idle_period_ie = (void *)pos; break; case WLAN_EID_RSNX: elems->rsnx = pos; elems->rsnx_len = elen; break; case WLAN_EID_TX_POWER_ENVELOPE: if (params->mode < IEEE80211_CONN_MODE_HE) break; ieee80211_parse_tpe(&elems->tpe, pos, elen); break; case WLAN_EID_EXTENSION: ieee80211_parse_extension_element(calc_crc ? &crc : NULL, elem, elems_parse, params); break; case WLAN_EID_S1G_CAPABILITIES: if (params->mode != IEEE80211_CONN_MODE_S1G) break; if (elen >= sizeof(*elems->s1g_capab)) elems->s1g_capab = (void *)pos; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_S1G_OPERATION: if (params->mode != IEEE80211_CONN_MODE_S1G) break; if (elen == sizeof(*elems->s1g_oper)) elems->s1g_oper = (void *)pos; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_S1G_BCN_COMPAT: if (params->mode != IEEE80211_CONN_MODE_S1G) break; if (elen == sizeof(*elems->s1g_bcn_compat)) elems->s1g_bcn_compat = (void *)pos; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_AID_RESPONSE: if (params->mode != IEEE80211_CONN_MODE_S1G) break; if (elen == sizeof(struct ieee80211_aid_response_ie)) elems->aid_resp = (void *)pos; else elem_parse_failed = IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; default: break; } if (elem_parse_failed) elems->parse_error |= elem_parse_failed; else __set_bit(id, seen_elems); } if (!for_each_element_completed(elem, params->start, params->len)) elems->parse_error |= IEEE80211_PARSE_ERR_INVALID_END; return crc; } static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, struct ieee802_11_elems *elems, struct cfg80211_bss *bss, u8 *nontransmitted_profile) { const struct element *elem, *sub; size_t profile_len = 0; bool found = false; if (!bss || !bss->transmitted_bss) return profile_len; for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, start, len) { if (elem->datalen < 2) continue; if (elem->data[0] < 1 || elem->data[0] > 8) continue; for_each_element(sub, elem->data + 1, elem->datalen - 1) { u8 new_bssid[ETH_ALEN]; const u8 *index; if (sub->id != 0 || sub->datalen < 4) { /* not a valid BSS profile */ continue; } if (sub->data[0] != WLAN_EID_NON_TX_BSSID_CAP || sub->data[1] != 2) { /* The first element of the * Nontransmitted BSSID Profile is not * the Nontransmitted BSSID Capability * element. */ continue; } memset(nontransmitted_profile, 0, len); profile_len = cfg80211_merge_profile(start, len, elem, sub, nontransmitted_profile, len); /* found a Nontransmitted BSSID Profile */ index = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, nontransmitted_profile, profile_len); if (!index || index[1] < 1 || index[2] == 0) { /* Invalid MBSSID Index element */ continue; } cfg80211_gen_new_bssid(bss->transmitted_bss->bssid, elem->data[0], index[2], new_bssid); if (ether_addr_equal(new_bssid, bss->bssid)) { found = true; elems->bssid_index_len = index[1]; elems->bssid_index = (void *)&index[2]; break; } } } return found ? profile_len : 0; } static void ieee80211_mle_get_sta_prof(struct ieee80211_elems_parse *elems_parse, u8 link_id) { struct ieee802_11_elems *elems = &elems_parse->elems; const struct ieee80211_multi_link_elem *ml = elems->ml_basic; ssize_t ml_len = elems->ml_basic_len; const struct element *sub; for_each_mle_subelement(sub, (u8 *)ml, ml_len) { struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; ssize_t sta_prof_len; u16 control; if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE) continue; if (!ieee80211_mle_basic_sta_prof_size_ok(sub->data, sub->datalen)) return; control = le16_to_cpu(prof->control); if (link_id != u16_get_bits(control, IEEE80211_MLE_STA_CONTROL_LINK_ID)) continue; if (!(control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE)) return; /* the sub element can be fragmented */ sta_prof_len = cfg80211_defragment_element(sub, (u8 *)ml, ml_len, elems_parse->scratch_pos, elems_parse->scratch + elems_parse->scratch_len - elems_parse->scratch_pos, IEEE80211_MLE_SUBELEM_FRAGMENT); if (sta_prof_len < 0) return; elems->prof = (void *)elems_parse->scratch_pos; elems->sta_prof_len = sta_prof_len; elems_parse->scratch_pos += sta_prof_len; return; } } static void ieee80211_mle_parse_link(struct ieee80211_elems_parse *elems_parse, struct ieee80211_elems_parse_params *params) { struct ieee802_11_elems *elems = &elems_parse->elems; struct ieee80211_mle_per_sta_profile *prof; struct ieee80211_elems_parse_params sub = { .mode = params->mode, .action = params->action, .from_ap = params->from_ap, .link_id = -1, }; ssize_t ml_len = elems->ml_basic_len; const struct element *non_inherit = NULL; const u8 *end; ml_len = cfg80211_defragment_element(elems_parse->ml_basic_elem, elems->ie_start, elems->total_len, elems_parse->scratch_pos, elems_parse->scratch + elems_parse->scratch_len - elems_parse->scratch_pos, WLAN_EID_FRAGMENT); if (ml_len < 0) return; elems->ml_basic = (const void *)elems_parse->scratch_pos; elems->ml_basic_len = ml_len; elems_parse->scratch_pos += ml_len; if (params->link_id == -1) return; ieee80211_mle_get_sta_prof(elems_parse, params->link_id); prof = elems->prof; if (!prof) return; /* check if we have the 4 bytes for the fixed part in assoc response */ if (elems->sta_prof_len < sizeof(*prof) + prof->sta_info_len - 1 + 4) { elems->prof = NULL; elems->sta_prof_len = 0; return; } /* * Skip the capability information and the status code that are expected * as part of the station profile in association response frames. Note * the -1 is because the 'sta_info_len' is accounted to as part of the * per-STA profile, but not part of the 'u8 variable[]' portion. */ sub.start = prof->variable + prof->sta_info_len - 1 + 4; end = (const u8 *)prof + elems->sta_prof_len; sub.len = end - sub.start; non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, sub.start, sub.len); _ieee802_11_parse_elems_full(&sub, elems_parse, non_inherit); } static void ieee80211_mle_defrag_reconf(struct ieee80211_elems_parse *elems_parse) { struct ieee802_11_elems *elems = &elems_parse->elems; ssize_t ml_len; ml_len = cfg80211_defragment_element(elems_parse->ml_reconf_elem, elems->ie_start, elems->total_len, elems_parse->scratch_pos, elems_parse->scratch + elems_parse->scratch_len - elems_parse->scratch_pos, WLAN_EID_FRAGMENT); if (ml_len < 0) return; elems->ml_reconf = (void *)elems_parse->scratch_pos; elems->ml_reconf_len = ml_len; elems_parse->scratch_pos += ml_len; } static void ieee80211_mle_defrag_epcs(struct ieee80211_elems_parse *elems_parse) { struct ieee802_11_elems *elems = &elems_parse->elems; ssize_t ml_len; ml_len = cfg80211_defragment_element(elems_parse->ml_epcs_elem, elems->ie_start, elems->total_len, elems_parse->scratch_pos, elems_parse->scratch + elems_parse->scratch_len - elems_parse->scratch_pos, WLAN_EID_FRAGMENT); if (ml_len < 0) return; elems->ml_epcs = (void *)elems_parse->scratch_pos; elems->ml_epcs_len = ml_len; elems_parse->scratch_pos += ml_len; } struct ieee802_11_elems * ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) { struct ieee80211_elems_parse *elems_parse; struct ieee802_11_elems *elems; const struct element *non_inherit = NULL; u8 *nontransmitted_profile; int nontransmitted_profile_len = 0; size_t scratch_len = 3 * params->len; BUILD_BUG_ON(offsetof(typeof(*elems_parse), elems) != 0); elems_parse = kzalloc(struct_size(elems_parse, scratch, scratch_len), GFP_ATOMIC); if (!elems_parse) return NULL; elems_parse->scratch_len = scratch_len; elems_parse->scratch_pos = elems_parse->scratch; elems = &elems_parse->elems; elems->ie_start = params->start; elems->total_len = params->len; /* set all TPE entries to unlimited (but invalid) */ ieee80211_clear_tpe(&elems->tpe); ieee80211_clear_tpe(&elems->csa_tpe); nontransmitted_profile = elems_parse->scratch_pos; nontransmitted_profile_len = ieee802_11_find_bssid_profile(params->start, params->len, elems, params->bss, nontransmitted_profile); elems_parse->scratch_pos += nontransmitted_profile_len; non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, nontransmitted_profile, nontransmitted_profile_len); elems->crc = _ieee802_11_parse_elems_full(params, elems_parse, non_inherit); /* Override with nontransmitted profile, if found */ if (nontransmitted_profile_len) { struct ieee80211_elems_parse_params sub = { .mode = params->mode, .start = nontransmitted_profile, .len = nontransmitted_profile_len, .action = params->action, .link_id = params->link_id, }; _ieee802_11_parse_elems_full(&sub, elems_parse, NULL); } ieee80211_mle_parse_link(elems_parse, params); ieee80211_mle_defrag_reconf(elems_parse); ieee80211_mle_defrag_epcs(elems_parse); if (elems->tim && !elems->parse_error) { const struct ieee80211_tim_ie *tim_ie = elems->tim; elems->dtim_period = tim_ie->dtim_period; elems->dtim_count = tim_ie->dtim_count; } /* Override DTIM period and count if needed */ if (elems->bssid_index && elems->bssid_index_len >= offsetofend(struct ieee80211_bssid_index, dtim_period)) elems->dtim_period = elems->bssid_index->dtim_period; if (elems->bssid_index && elems->bssid_index_len >= offsetofend(struct ieee80211_bssid_index, dtim_count)) elems->dtim_count = elems->bssid_index->dtim_count; return elems; } EXPORT_SYMBOL_IF_KUNIT(ieee802_11_parse_elems_full); int ieee80211_parse_bitrates(enum nl80211_chan_width width, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates) { u32 rate_flags = ieee80211_chanwidth_rate_flags(width); struct ieee80211_rate *br; int brate, rate, i, j, count = 0; *rates = 0; for (i = 0; i < srates_len; i++) { rate = srates[i] & 0x7f; for (j = 0; j < sband->n_bitrates; j++) { br = &sband->bitrates[j]; if ((rate_flags & br->flags) != rate_flags) continue; brate = DIV_ROUND_UP(br->bitrate, 5); if (brate == rate) { *rates |= BIT(j); count++; break; } } } return count; } |
2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * DES & Triple DES EDE Cipher Algorithms. * * Copyright (c) 2005 Dag Arne Osvik <da@osvik.no> */ #include <asm/byteorder.h> #include <crypto/algapi.h> #include <linux/bitops.h> #include <linux/init.h> #include <linux/module.h> #include <linux/errno.h> #include <crypto/internal/des.h> static int des_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct des_ctx *dctx = crypto_tfm_ctx(tfm); int err; err = des_expand_key(dctx, key, keylen); if (err == -ENOKEY) { if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS) err = -EINVAL; else err = 0; } if (err) memset(dctx, 0, sizeof(*dctx)); return err; } static void crypto_des_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { const struct des_ctx *dctx = crypto_tfm_ctx(tfm); des_encrypt(dctx, dst, src); } static void crypto_des_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { const struct des_ctx *dctx = crypto_tfm_ctx(tfm); des_decrypt(dctx, dst, src); } static int des3_ede_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm); int err; err = des3_ede_expand_key(dctx, key, keylen); if (err == -ENOKEY) { if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS) err = -EINVAL; else err = 0; } if (err) memset(dctx, 0, sizeof(*dctx)); return err; } static void crypto_des3_ede_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { const struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm); des3_ede_encrypt(dctx, dst, src); } static void crypto_des3_ede_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { const struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm); des3_ede_decrypt(dctx, dst, src); } static struct crypto_alg des_algs[2] = { { .cra_name = "des", .cra_driver_name = "des-generic", .cra_priority = 100, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = DES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct des_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = DES_KEY_SIZE, .cia_max_keysize = DES_KEY_SIZE, .cia_setkey = des_setkey, .cia_encrypt = crypto_des_encrypt, .cia_decrypt = crypto_des_decrypt } } }, { .cra_name = "des3_ede", .cra_driver_name = "des3_ede-generic", .cra_priority = 100, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = DES3_EDE_BLOCK_SIZE, .cra_ctxsize = sizeof(struct des3_ede_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = DES3_EDE_KEY_SIZE, .cia_max_keysize = DES3_EDE_KEY_SIZE, .cia_setkey = des3_ede_setkey, .cia_encrypt = crypto_des3_ede_encrypt, .cia_decrypt = crypto_des3_ede_decrypt } } } }; static int __init des_generic_mod_init(void) { return crypto_register_algs(des_algs, ARRAY_SIZE(des_algs)); } static void __exit des_generic_mod_fini(void) { crypto_unregister_algs(des_algs, ARRAY_SIZE(des_algs)); } subsys_initcall(des_generic_mod_init); module_exit(des_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms"); MODULE_AUTHOR("Dag Arne Osvik <da@osvik.no>"); MODULE_ALIAS_CRYPTO("des"); MODULE_ALIAS_CRYPTO("des-generic"); MODULE_ALIAS_CRYPTO("des3_ede"); MODULE_ALIAS_CRYPTO("des3_ede-generic"); |
2046 2 14 1 13 14 14 14 11 7 4 14 13 1 14 14 13 2 2 18 18 2 40 40 2025 2031 2022 2026 1738 1733 1738 1738 1732 1687 719 690 313 2 58 310 313 58 314 343 705 705 705 704 10 10 10 6 4 1952 1 2044 1928 1927 704 679 612 703 125 9 118 1710 1712 1718 1715 1710 1714 1710 1712 1716 7 1713 65 1658 1860 1860 1833 1815 1836 6 5 1 1668 1669 5 6 6 1650 1650 93 29 92 91 92 102 102 62 60 59 59 1 58 58 1967 1906 763 1971 1971 1968 134 134 1712 1719 1717 1711 1715 68 1721 705 702 703 704 703 703 705 705 704 705 703 703 703 652 654 129 129 128 57 79 786 641 702 40 40 39 2 38 40 40 2 38 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/dir.c - kernfs directory implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/sched.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/hash.h> #include "kernfs-internal.h" static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to * call pr_cont() while holding rename_lock. Because sometimes pr_cont() * will perform wakeups when releasing console_sem. Holding rename_lock * will introduce deadlock if the scheduler reads the kernfs_name in the * wakeup path. */ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) static bool __kernfs_active(struct kernfs_node *kn) { return atomic_read(&kn->active) >= 0; } static bool kernfs_active(struct kernfs_node *kn) { lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); return __kernfs_active(kn); } static bool kernfs_lockdep(struct kernfs_node *kn) { #ifdef CONFIG_DEBUG_LOCK_ALLOC return kn->flags & KERNFS_LOCKDEP; #else return false; #endif } static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) { if (!kn) return strscpy(buf, "(null)", buflen); return strscpy(buf, kn->parent ? kn->name : "/", buflen); } /* kernfs_node_depth - compute depth from @from to @to */ static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to) { size_t depth = 0; while (to->parent && to != from) { depth++; to = to->parent; } return depth; } static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, struct kernfs_node *b) { size_t da, db; struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b); if (ra != rb) return NULL; da = kernfs_depth(ra->kn, a); db = kernfs_depth(rb->kn, b); while (da > db) { a = a->parent; da--; } while (db > da) { b = b->parent; db--; } /* worst case b and a will be the same at root */ while (b != a) { b = b->parent; a = a->parent; } return a; } /** * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to, * where kn_from is treated as root of the path. * @kn_from: kernfs node which should be treated as root for the path * @kn_to: kernfs node to which path is needed * @buf: buffer to copy the path into * @buflen: size of @buf * * We need to handle couple of scenarios here: * [1] when @kn_from is an ancestor of @kn_to at some level * kn_from: /n1/n2/n3 * kn_to: /n1/n2/n3/n4/n5 * result: /n4/n5 * * [2] when @kn_from is on a different hierarchy and we need to find common * ancestor between @kn_from and @kn_to. * kn_from: /n1/n2/n3/n4 * kn_to: /n1/n2/n5 * result: /../../n5 * OR * kn_from: /n1/n2/n3/n4/n5 [depth=5] * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * * [3] when @kn_to is %NULL result will be "(null)" * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, char *buf, size_t buflen) { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; size_t depth_from, depth_to, len = 0; ssize_t copied; int i, j; if (!kn_to) return strscpy(buf, "(null)", buflen); if (!kn_from) kn_from = kernfs_root(kn_to)->kn; if (kn_from == kn_to) return strscpy(buf, "/", buflen); common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); buf[0] = '\0'; for (i = 0; i < depth_from; i++) { copied = strscpy(buf + len, parent_str, buflen - len); if (copied < 0) return copied; len += copied; } /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { for (kn = kn_to, j = 0; j < i; j++) kn = kn->parent; len += scnprintf(buf + len, buflen - len, "/%s", kn->name); } return len; } /** * kernfs_name - obtain the name of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * Copies the name of @kn into @buf of @buflen bytes. The behavior is * similar to strscpy(). * * Fills buffer with "(null)" if @kn is %NULL. * * Return: the resulting length of @buf. If @buf isn't long enough, * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG. * * This function can be called from any context. */ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { unsigned long flags; int ret; read_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_name_locked(kn, buf, buflen); read_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } /** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest * @buf: buffer to copy @to's path into * @buflen: size of @buf * * Builds @to's path relative to @from in @buf. @from and @to must * be on the same kernfs-root. If @from is not parent of @to, then a relative * path (which includes '..'s) as needed to reach from @from to @to is * returned. * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) { unsigned long flags; int ret; read_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_path_from_node_locked(to, from, buf, buflen); read_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_name(struct kernfs_node *kn) { unsigned long flags; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); pr_cont("%s", kernfs_pr_cont_buf); spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * pr_cont_kernfs_path - pr_cont path of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_path(struct kernfs_node *kn) { unsigned long flags; int sz; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); if (sz < 0) { if (sz == -E2BIG) pr_cont("(name too long)"); else pr_cont("(error)"); goto out; } pr_cont("%s", kernfs_pr_cont_buf); out: spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * kernfs_get_parent - determine the parent node and pin it * @kn: kernfs_node of interest * * Determines @kn's parent, pins and returns it. This function can be * called from any context. * * Return: parent node of @kn */ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { struct kernfs_node *parent; unsigned long flags; read_lock_irqsave(&kernfs_rename_lock, flags); parent = kn->parent; kernfs_get(parent); read_unlock_irqrestore(&kernfs_rename_lock, flags); return parent; } /** * kernfs_name_hash - calculate hash of @ns + @name * @name: Null terminated string to hash * @ns: Namespace tag to hash * * Return: 31-bit hash of ns + name (so it fits in an off_t) */ static unsigned int kernfs_name_hash(const char *name, const void *ns) { unsigned long hash = init_name_hash(ns); unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); hash = end_name_hash(hash); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 2) hash += 2; if (hash >= INT_MAX) hash = INT_MAX - 1; return hash; } static int kernfs_name_compare(unsigned int hash, const char *name, const void *ns, const struct kernfs_node *kn) { if (hash < kn->hash) return -1; if (hash > kn->hash) return 1; if (ns < kn->ns) return -1; if (ns > kn->ns) return 1; return strcmp(name, kn->name); } static int kernfs_sd_compare(const struct kernfs_node *left, const struct kernfs_node *right) { return kernfs_name_compare(left->hash, left->name, left->ns, right); } /** * kernfs_link_sibling - link kernfs_node into sibling rbtree * @kn: kernfs_node of interest * * Link @kn into its sibling rbtree which starts from * @kn->parent->dir.children. * * Locking: * kernfs_rwsem held exclusive * * Return: * %0 on success, -EEXIST on failure. */ static int kernfs_link_sibling(struct kernfs_node *kn) { struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; while (*node) { struct kernfs_node *pos; int result; pos = rb_to_kn(*node); parent = *node; result = kernfs_sd_compare(kn, pos); if (result < 0) node = &pos->rb.rb_left; else if (result > 0) node = &pos->rb.rb_right; else return -EEXIST; } /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn->parent->dir.children); /* successfully added, account subdir number */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs++; kernfs_inc_rev(kn->parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); return 0; } /** * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree * @kn: kernfs_node of interest * * Try to unlink @kn from its sibling rbtree which starts from * kn->parent->dir.children. * * Return: %true if @kn was actually removed, * %false if @kn wasn't on the rbtree. * * Locking: * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { if (RB_EMPTY_NODE(&kn->rb)) return false; down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; kernfs_inc_rev(kn->parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); rb_erase(&kn->rb, &kn->parent->dir.children); RB_CLEAR_NODE(&kn->rb); return true; } /** * kernfs_get_active - get an active reference to kernfs_node * @kn: kernfs_node to get an active reference to * * Get an active reference of @kn. This function is noop if @kn * is %NULL. * * Return: * Pointer to @kn on success, %NULL on failure. */ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) { if (unlikely(!kn)) return NULL; if (!atomic_inc_unless_negative(&kn->active)) return NULL; if (kernfs_lockdep(kn)) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } /** * kernfs_put_active - put an active reference to kernfs_node * @kn: kernfs_node to put an active reference to * * Put an active reference to @kn. This function is noop if @kn * is %NULL. */ void kernfs_put_active(struct kernfs_node *kn) { int v; if (unlikely(!kn)) return; if (kernfs_lockdep(kn)) rwsem_release(&kn->dep_map, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; wake_up_all(&kernfs_root(kn)->deactivate_waitq); } /** * kernfs_drain - drain kernfs_node * @kn: kernfs_node to drain * * Drain existing usages and nuke all existing mmaps of @kn. Multiple * removers may invoke this function concurrently on @kn and all will * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn) __releases(&kernfs_root(kn)->kernfs_rwsem) __acquires(&kernfs_root(kn)->kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); /* * Skip draining if already fully drained. This avoids draining and its * lockdep annotations for nodes which have never been activated * allowing embedding kernfs_remove() in create error paths without * worrying about draining. */ if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS && !kernfs_should_drain_open_files(kn)) return; up_write(&root->kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) lock_contended(&kn->dep_map, _RET_IP_); } wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, _RET_IP_); } if (kernfs_should_drain_open_files(kn)) kernfs_drain_open_files(kn); down_write(&root->kernfs_rwsem); } /** * kernfs_get - get a reference count on a kernfs_node * @kn: the target kernfs_node */ void kernfs_get(struct kernfs_node *kn) { if (kn) { WARN_ON(!atomic_read(&kn->count)); atomic_inc(&kn->count); } } EXPORT_SYMBOL_GPL(kernfs_get); static void kernfs_free_rcu(struct rcu_head *rcu) { struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); kfree_const(kn->name); if (kn->iattr) { simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } kmem_cache_free(kernfs_node_cache, kn); } /** * kernfs_put - put a reference count on a kernfs_node * @kn: the target kernfs_node * * Put a reference count of @kn and destroy it if it reached zero. */ void kernfs_put(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); repeat: /* * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ parent = kn->parent; WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, "kernfs_put: %s/%s: released with incorrect active_ref %d\n", parent ? parent->name : "", kn->name, atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); call_rcu(&kn->rcu, kernfs_free_rcu); kn = parent; if (kn) { if (atomic_dec_and_test(&kn->count)) goto repeat; } else { /* just released the root kn, free @root too */ idr_destroy(&root->ino_idr); kfree_rcu(root, rcu); } } EXPORT_SYMBOL_GPL(kernfs_put); /** * kernfs_node_from_dentry - determine kernfs_node associated with a dentry * @dentry: the dentry in question * * Return: the kernfs_node associated with @dentry. If @dentry is not a * kernfs one, %NULL is returned. * * While the returned kernfs_node will stay accessible as long as @dentry * is accessible, the returned node can be in any state and the caller is * fully responsible for determining what's accessible. */ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { if (dentry->d_sb->s_op == &kernfs_sops) return kernfs_dentry_node(dentry); return NULL; } static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); if (!name) return NULL; kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); if (!kn) goto err_out1; idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = root->id_highbits; root->last_id_lowbits = ret; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; kn->id = (u64)id_highbits << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); kn->name = name; kn->mode = mode; kn->flags = flags; if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) { struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = uid, .ia_gid = gid, }; ret = __kernfs_setattr(kn, &iattr); if (ret < 0) goto err_out3; } if (parent) { ret = security_kernfs_init_security(parent, kn); if (ret) goto err_out3; } return kn; err_out3: spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: kfree_const(name); return NULL; } struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; if (parent->mode & S_ISGID) { /* this code block imitates inode_init_owner() for * kernfs */ if (parent->iattr) gid = parent->iattr->ia_gid; if (flags & KERNFS_DIR) mode |= S_ISGID; } kn = __kernfs_new_node(kernfs_root(parent), parent, name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); kn->parent = parent; } return kn; } /* * kernfs_find_and_get_node_by_id - get kernfs_node from node id * @root: the kernfs root * @id: the target node id * * @id's lower 32bits encode ino and upper gen. If the gen portion is * zero, all generations are matched. * * Return: %NULL on failure, * otherwise a kernfs node with reference counter incremented. */ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id) { struct kernfs_node *kn; ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); rcu_read_lock(); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) goto err_unlock; if (sizeof(ino_t) >= sizeof(u64)) { /* we looked up with the low 32bits, compare the whole */ if (kernfs_ino(kn) != ino) goto err_unlock; } else { /* 0 matches all generations */ if (unlikely(gen && kernfs_gen(kn) != gen)) goto err_unlock; } /* * We should fail if @kn has never been activated and guarantee success * if the caller knows that @kn is active. Both can be achieved by * __kernfs_active() which tests @kn->active without kernfs_rwsem. */ if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; rcu_read_unlock(); return kn; err_unlock: rcu_read_unlock(); return NULL; } /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added * * The caller must already have initialized @kn->parent. This * function increments nlink of the parent's inode if @kn is a * directory and link into the children list of the parent. * * Return: * %0 on success, -EEXIST if entry with the given name already * exists. */ int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_node *parent = kn->parent; struct kernfs_root *root = kernfs_root(parent); struct kernfs_iattrs *ps_iattr; bool has_ns; int ret; down_write(&root->kernfs_rwsem); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", parent->name, kn->name)) goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) goto out_unlock; ret = -ENOENT; if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) goto out_unlock; kn->hash = kernfs_name_hash(kn->name, kn->ns); ret = kernfs_link_sibling(kn); if (ret) goto out_unlock; /* Update timestamps on the parent */ down_write(&root->kernfs_iattr_rwsem); ps_iattr = parent->iattr; if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&root->kernfs_iattr_rwsem); up_write(&root->kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. * If not activated here, the kernfs user is responsible for * activating the node with kernfs_activate(). A node which hasn't * been activated is not visible to userland and its removal won't * trigger deactivation. */ if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return 0; out_unlock: up_write(&root->kernfs_rwsem); return ret; } /** * kernfs_find_ns - find kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, const unsigned char *name, const void *ns) { struct rb_node *node = parent->dir.children.rb_node; bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", parent->name, name); return NULL; } hash = kernfs_name_hash(name, ns); while (node) { struct kernfs_node *kn; int result; kn = rb_to_kn(node); result = kernfs_name_compare(hash, name, ns, kn); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return kn; } return NULL; } static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const void *ns) { ssize_t len; char *p, *name; lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); spin_lock_irq(&kernfs_pr_cont_lock); len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); if (len < 0) { spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } p = kernfs_pr_cont_buf; while ((name = strsep(&p, "/")) && parent) { if (*name == '\0') continue; parent = kernfs_find_ns(parent, name, ns); } spin_unlock_irq(&kernfs_pr_cont_lock); return parent; } /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_walk_and_get_ns - find and get kernfs_node with the given path * @parent: kernfs_node to search under * @path: path to look for * @ns: the namespace tag to use * * Look for kernfs_node with path @path under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } /** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * * Return: the root of the new hierarchy on success, ERR_PTR() value on * failure. */ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { struct kernfs_root *root; struct kernfs_node *kn; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); idr_init(&root->ino_idr); init_rwsem(&root->kernfs_rwsem); init_rwsem(&root->kernfs_iattr_rwsem); init_rwsem(&root->kernfs_supers_rwsem); INIT_LIST_HEAD(&root->supers); /* * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. * High bits generation. The starting value for both ino and * genenration is 1. Initialize upper 32bit allocation * accordingly. */ if (sizeof(ino_t) >= sizeof(u64)) root->id_highbits = 0; else root->id_highbits = 1; kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) { idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } kn->priv = priv; kn->dir.root = root; root->syscall_ops = scops; root->flags = flags; root->kn = kn; init_waitqueue_head(&root->deactivate_waitq); if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return root; } /** * kernfs_destroy_root - destroy a kernfs hierarchy * @root: root of the hierarchy to destroy * * Destroy the hierarchy anchored at @root by removing all existing * directories and destroying @root. */ void kernfs_destroy_root(struct kernfs_root *root) { /* * kernfs_remove holds kernfs_rwsem from the root so the root * shouldn't be freed during the operation. */ kernfs_get(root->kn); kernfs_remove(root->kn); kernfs_put(root->kn); /* will also free @root */ } /** * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root * @root: root to use to lookup * * Return: @root's kernfs_node */ struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) { return root->kn; } /** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory * @uid: uid of the new directory * @gid: gid of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, mode | S_IFDIR, uid, gid, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->dir.root = parent->dir.root; kn->ns = ns; kn->priv = priv; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } /** * kernfs_create_empty_dir - create an always empty directory * @parent: parent in which to create a new directory * @name: name of the new directory * * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->flags |= KERNFS_EMPTY_DIR; kn->dir.root = parent->dir.root; kn->ns = NULL; kn->priv = NULL; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn; struct kernfs_root *root; if (flags & LOOKUP_RCU) return -ECHILD; /* Negative hashed dentry? */ if (d_really_is_negative(dentry)) { struct kernfs_node *parent; /* If the kernfs parent node has changed discard and * proceed to ->lookup. * * There's nothing special needed here when getting the * dentry parent, even if a concurrent rename is in * progress. That's because the dentry is negative so * it can only be the target of the rename and it will * be doing a d_move() not a replace. Consequently the * dentry d_parent won't change over the d_move(). * * Also kernfs negative dentries transitioning from * negative to positive during revalidate won't happen * because they are invalidated on containing directory * changes and the lookup re-done so that a new positive * dentry can be properly created. */ root = kernfs_root_from_sb(dentry->d_sb); down_read(&root->kernfs_rwsem); parent = kernfs_dentry_node(dentry->d_parent); if (parent) { if (kernfs_dir_changed(parent, dentry)) { up_read(&root->kernfs_rwsem); return 0; } } up_read(&root->kernfs_rwsem); /* The kernfs parent node hasn't changed, leave the * dentry negative and return success. */ return 1; } kn = kernfs_dentry_node(dentry); root = kernfs_root(kn); down_read(&root->kernfs_rwsem); /* The kernfs node has been deactivated */ if (!kernfs_active(kn)) goto out_bad; /* The kernfs node has been moved? */ if (kernfs_dentry_node(dentry->d_parent) != kn->parent) goto out_bad; /* The kernfs node has been renamed */ if (strcmp(dentry->d_name.name, kn->name) != 0) goto out_bad; /* The kernfs node has been moved to a different namespace */ if (kn->parent && kernfs_ns_enabled(kn->parent) && kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; up_read(&root->kernfs_rwsem); return 1; out_bad: up_read(&root->kernfs_rwsem); return 0; } const struct dentry_operations kernfs_dops = { .d_revalidate = kernfs_dop_revalidate, }; static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; struct kernfs_root *root; struct inode *inode = NULL; const void *ns = NULL; root = kernfs_root(parent); down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* attach dentry and inode */ if (kn) { /* Inactive nodes are invisible to the VFS so don't * create a negative. */ if (!kernfs_active(kn)) { up_read(&root->kernfs_rwsem); return NULL; } inode = kernfs_get_inode(dir->i_sb, kn); if (!inode) inode = ERR_PTR(-ENOMEM); } /* * Needed for negative dentry validation. * The negative dentry can be created in kernfs_iop_lookup() * or transforms from positive dentry in dentry_unlink_inode() * called from vfs_rmdir(). */ if (!IS_ERR(inode)) kernfs_set_rev(parent, dentry); up_read(&root->kernfs_rwsem); /* instantiate and hash (possibly negative) dentry */ return d_splice_alias(inode, dentry); } static int kernfs_iop_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct kernfs_node *parent = dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; int ret; if (!scops || !scops->mkdir) return -EPERM; if (!kernfs_get_active(parent)) return -ENODEV; ret = scops->mkdir(parent, dentry->d_name.name, mode); kernfs_put_active(parent); return ret; } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (!scops || !scops->rmdir) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; ret = scops->rmdir(kn); kernfs_put_active(kn); return ret; } static int kernfs_iop_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct kernfs_node *kn = kernfs_dentry_node(old_dentry); struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (flags) return -EINVAL; if (!scops || !scops->rename) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; if (!kernfs_get_active(new_parent)) { kernfs_put_active(kn); return -ENODEV; } ret = scops->rename(kn, new_parent, new_dentry->d_name.name); kernfs_put_active(new_parent); kernfs_put_active(kn); return ret; } const struct inode_operations kernfs_dir_iops = { .lookup = kernfs_iop_lookup, .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .listxattr = kernfs_iop_listxattr, .mkdir = kernfs_iop_mkdir, .rmdir = kernfs_iop_rmdir, .rename = kernfs_iop_rename, }; static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) { struct kernfs_node *last; while (true) { struct rb_node *rbn; last = pos; if (kernfs_type(pos) != KERNFS_DIR) break; rbn = rb_first(&pos->dir.children); if (!rbn) break; pos = rb_to_kn(rbn); } return last; } /** * kernfs_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: kernfs_node whose descendants to walk * * Find the next descendant to visit for post-order traversal of @root's * descendants. @root is included in the iteration and the last node to be * visited. * * Return: the next descendant to visit or %NULL when done. */ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, struct kernfs_node *root) { struct rb_node *rbn; lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) return kernfs_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ rbn = rb_next(&pos->rb); if (rbn) return kernfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ return pos->parent; } static void kernfs_activate_one(struct kernfs_node *kn) { lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); kn->flags |= KERNFS_ACTIVATED; if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) return; WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb)); WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); } /** * kernfs_activate - activate a node which started deactivated * @kn: kernfs_node whose subtree is to be activated * * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node * needs to be explicitly activated. A node which hasn't been activated * isn't visible to userland and deactivation is skipped during its * removal. This is useful to construct atomic init sequences where * creation of multiple nodes should either succeed or fail atomically. * * The caller is responsible for ensuring that this function is not called * after kernfs_remove*() is invoked on @kn. */ void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) kernfs_activate_one(pos); up_write(&root->kernfs_rwsem); } /** * kernfs_show - show or hide a node * @kn: kernfs_node to show or hide * @show: whether to show or hide * * If @show is %false, @kn is marked hidden and deactivated. A hidden node is * ignored in future activaitons. If %true, the mark is removed and activation * state is restored. This function won't implicitly activate a new node in a * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet. * * To avoid recursion complexities, directories aren't supported for now. */ void kernfs_show(struct kernfs_node *kn, bool show) { struct kernfs_root *root = kernfs_root(kn); if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR)) return; down_write(&root->kernfs_rwsem); if (show) { kn->flags &= ~KERNFS_HIDDEN; if (kn->flags & KERNFS_ACTIVATED) kernfs_activate_one(kn); } else { kn->flags |= KERNFS_HIDDEN; if (kernfs_active(kn)) atomic_add(KN_DEACTIVATED_BIAS, &kn->active); kernfs_drain(kn); } up_write(&root->kernfs_rwsem); } static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos; /* Short-circuit if non-root @kn has already finished removal. */ if (!kn) return; lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); /* * This is for kernfs_remove_self() which plays with active ref * after removal. */ if (kn->parent && RB_EMPTY_NODE(&kn->rb)) return; pr_debug("kernfs %s: removing\n", kn->name); /* prevent new usage by marking all nodes removing and deactivating */ pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { pos->flags |= KERNFS_REMOVING; if (kernfs_active(pos)) atomic_add(KN_DEACTIVATED_BIAS, &pos->active); } /* deactivate and unlink the subtree node-by-node */ do { pos = kernfs_leftmost_descendant(kn); /* * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. */ kernfs_get(pos); kernfs_drain(pos); /* * kernfs_unlink_sibling() succeeds once per node. Use it * to decide who's responsible for cleanups. */ if (!pos->parent || kernfs_unlink_sibling(pos)) { struct kernfs_iattrs *ps_iattr = pos->parent ? pos->parent->iattr : NULL; /* update timestamps on the parent */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); kernfs_put(pos); } kernfs_put(pos); } while (pos != kn); } /** * kernfs_remove - remove a kernfs_node recursively * @kn: the kernfs_node to remove * * Remove @kn along with all its subdirectories and files. */ void kernfs_remove(struct kernfs_node *kn) { struct kernfs_root *root; if (!kn) return; root = kernfs_root(kn); down_write(&root->kernfs_rwsem); __kernfs_remove(kn); up_write(&root->kernfs_rwsem); } /** * kernfs_break_active_protection - break out of active protection * @kn: the self kernfs_node * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. Each invocation of * this function must also be matched with an invocation of * kernfs_unbreak_active_protection(). * * This function releases the active reference of @kn the caller is * holding. Once this function is called, @kn may be removed at any point * and the caller is solely responsible for ensuring that the objects it * dereferences are accessible. */ void kernfs_break_active_protection(struct kernfs_node *kn) { /* * Take out ourself out of the active ref dependency chain. If * we're called without an active ref, lockdep will complain. */ kernfs_put_active(kn); } /** * kernfs_unbreak_active_protection - undo kernfs_break_active_protection() * @kn: the self kernfs_node * * If kernfs_break_active_protection() was called, this function must be * invoked before finishing the kernfs operation. Note that while this * function restores the active reference, it doesn't and can't actually * restore the active protection - @kn may already or be in the process of * being removed. Once kernfs_break_active_protection() is invoked, that * protection is irreversibly gone for the kernfs operation instance. * * While this function may be called at any point after * kernfs_break_active_protection() is invoked, its most useful location * would be right before the enclosing kernfs operation returns. */ void kernfs_unbreak_active_protection(struct kernfs_node *kn) { /* * @kn->active could be in any state; however, the increment we do * here will be undone as soon as the enclosing kernfs operation * finishes and this temporary bump can't break anything. If @kn * is alive, nothing changes. If @kn is being deactivated, the * soon-to-follow put will either finish deactivation or restore * deactivated state. If @kn is already removed, the temporary * bump is guaranteed to be gone before @kn is released. */ atomic_inc(&kn->active); if (kernfs_lockdep(kn)) rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); } /** * kernfs_remove_self - remove a kernfs_node from its own method * @kn: the self kernfs_node to remove * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. This can be used to * implement a file operation which deletes itself. * * For example, the "delete" file for a sysfs device directory can be * implemented by invoking kernfs_remove_self() on the "delete" file * itself. This function breaks the circular dependency of trying to * deactivate self while holding an active ref itself. It isn't necessary * to modify the usual removal path to use kernfs_remove_self(). The * "delete" implementation can simply invoke kernfs_remove_self() on self * before proceeding with the usual removal path. kernfs will ignore later * kernfs_remove() on self. * * kernfs_remove_self() can be called multiple times concurrently on the * same kernfs_node. Only the first one actually performs removal and * returns %true. All others will wait until the kernfs operation which * won self-removal finishes and return %false. Note that the losers wait * for the completion of not only the winning kernfs_remove_self() but also * the whole kernfs_ops which won the arbitration. This can be used to * guarantee, for example, all concurrent writes to a "delete" file to * finish only after the whole operation is complete. * * Return: %true if @kn is removed by this call, otherwise %false. */ bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_rwsem); kernfs_break_active_protection(kn); /* * SUICIDAL is used to arbitrate among competing invocations. Only * the first one will actually perform removal. When the removal * is complete, SUICIDED is set and the active ref is restored * while kernfs_rwsem for held exclusive. The ones which lost * arbitration waits for SUICIDED && drained which can happen only * after the enclosing kernfs operation which executed the winning * instance of kernfs_remove_self() finished. */ if (!(kn->flags & KERNFS_SUICIDAL)) { kn->flags |= KERNFS_SUICIDAL; __kernfs_remove(kn); kn->flags |= KERNFS_SUICIDED; ret = true; } else { wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; DEFINE_WAIT(wait); while (true) { prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); if ((kn->flags & KERNFS_SUICIDED) && atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; up_write(&root->kernfs_rwsem); schedule(); down_write(&root->kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); ret = false; } /* * This must be done while kernfs_rwsem held exclusive; otherwise, * waiting for SUICIDED && deactivated could finish prematurely. */ kernfs_unbreak_active_protection(kn); up_write(&root->kernfs_rwsem); return ret; } /** * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it * @parent: parent of the target * @name: name of the kernfs_node to remove * @ns: namespace tag of the kernfs_node to remove * * Look for the kernfs_node with @name and @ns under @parent and remove it. * * Return: %0 on success, -ENOENT if such entry doesn't exist. */ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root; if (!parent) { WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", name); return -ENOENT; } root = kernfs_root(parent); down_write(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); if (kn) { kernfs_get(kn); __kernfs_remove(kn); kernfs_put(kn); } up_write(&root->kernfs_rwsem); if (kn) return 0; else return -ENOENT; } /** * kernfs_rename_ns - move and rename a kernfs_node * @kn: target node * @new_parent: new parent to put @sd under * @new_name: new name * @new_ns: new namespace tag * * Return: %0 on success, -errno on failure. */ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { struct kernfs_node *old_parent; struct kernfs_root *root; const char *old_name = NULL; int error; /* can't move or rename root */ if (!kn->parent) return -EINVAL; root = kernfs_root(kn); down_write(&root->kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || (new_parent->flags & KERNFS_EMPTY_DIR)) goto out; error = 0; if ((kn->parent == new_parent) && (kn->ns == new_ns) && (strcmp(kn->name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; if (kernfs_find_ns(new_parent, new_name, new_ns)) goto out; /* rename kernfs_node */ if (strcmp(kn->name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup_const(new_name, GFP_KERNEL); if (!new_name) goto out; } else { new_name = NULL; } /* * Move to the appropriate place in the appropriate directories rbtree. */ kernfs_unlink_sibling(kn); kernfs_get(new_parent); /* rename_lock protects ->parent and ->name accessors */ write_lock_irq(&kernfs_rename_lock); old_parent = kn->parent; kn->parent = new_parent; kn->ns = new_ns; if (new_name) { old_name = kn->name; kn->name = new_name; } write_unlock_irq(&kernfs_rename_lock); kn->hash = kernfs_name_hash(kn->name, kn->ns); kernfs_link_sibling(kn); kernfs_put(old_parent); kfree_const(old_name); error = 0; out: up_write(&root->kernfs_rwsem); return error; } static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) { kernfs_put(filp->private_data); return 0; } static struct kernfs_node *kernfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { int valid = kernfs_active(pos) && pos->parent == parent && hash == pos->hash; kernfs_put(pos); if (!valid) pos = NULL; } if (!pos && (hash > 1) && (hash < INT_MAX)) { struct rb_node *node = parent->dir.children.rb_node; while (node) { pos = rb_to_kn(node); if (hash < pos->hash) node = node->rb_left; else if (hash > pos->hash) node = node->rb_right; else break; } } /* Skip over entries which are dying/dead or in the wrong namespace */ while (pos && (!kernfs_active(pos) || pos->ns != ns)) { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } return pos; } static struct kernfs_node *kernfs_dir_next_pos(const void *ns, struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) { pos = kernfs_dir_pos(ns, parent, ino, pos); if (pos) { do { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } while (pos && (!kernfs_active(pos) || pos->ns != ns)); } return pos; } static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; struct kernfs_root *root; const void *ns = NULL; if (!dir_emit_dots(file, ctx)) return 0; root = kernfs_root(parent); down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); pos; pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { const char *name = pos->name; unsigned int type = fs_umode_to_dtype(pos->mode); int len = strlen(name); ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; kernfs_get(pos); up_read(&root->kernfs_rwsem); if (!dir_emit(ctx, name, len, ino, type)) return 0; down_read(&root->kernfs_rwsem); } up_read(&root->kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; } const struct file_operations kernfs_dir_fops = { .read = generic_read_dir, .iterate_shared = kernfs_fop_readdir, .release = kernfs_dir_fop_release, .llseek = generic_file_llseek, }; |
40 40 26 33 34 34 34 7 25 17 9 6 4 10 10 29 29 8 21 7 28 37 37 37 37 1 26 14 115 115 111 4 4 4 8 5 2 1 48 21 11 57 57 57 4 54 4 56 1 49 18 13 4 1 33 10 12 45 29 29 29 31 4 4 1 3 3 1 2 5 1 1 3 1 2 1 10 10 10 4 7 2 4 2 71 1 1 69 11 34 35 2 7 33 7 30 2 5 4 2 4 4 3 3 4 11 7 21 20 84 1 84 1 5 10 71 132 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ip6_flowlabel.c IPv6 flowlabel manager. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/pid_namespace.h> #include <linux/jump_label_ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/rawv6.h> #include <net/transp_v6.h> #include <linux/uaccess.h> #define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified in old IPv6 RFC. Well, it was reasonable value. */ #define FL_MAX_LINGER 150 /* Maximal linger timeout */ /* FL hash table */ #define FL_MAX_PER_SOCK 32 #define FL_MAX_SIZE 4096 #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) static atomic_t fl_size = ATOMIC_INIT(0); static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(struct timer_list *unused); static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc); /* FL hash table lock: it protects only of GC */ static DEFINE_SPINLOCK(ip6_fl_lock); /* Big socket sock */ static DEFINE_SPINLOCK(ip6_sk_fl_lock); DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ); EXPORT_SYMBOL(ipv6_flowlabel_exclusive); #define for_each_fl_rcu(hash, fl) \ for (fl = rcu_dereference(fl_ht[(hash)]); \ fl != NULL; \ fl = rcu_dereference(fl->next)) #define for_each_fl_continue_rcu(fl) \ for (fl = rcu_dereference(fl->next); \ fl != NULL; \ fl = rcu_dereference(fl->next)) #define for_each_sk_fl_rcu(np, sfl) \ for (sfl = rcu_dereference(np->ipv6_fl_list); \ sfl != NULL; \ sfl = rcu_dereference(sfl->next)) static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; for_each_fl_rcu(FL_HASH(label), fl) { if (fl->label == label && net_eq(fl->fl_net, net)) return fl; } return NULL; } static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; rcu_read_lock(); fl = __fl_lookup(net, label); if (fl && !atomic_inc_not_zero(&fl->users)) fl = NULL; rcu_read_unlock(); return fl; } static bool fl_shared_exclusive(struct ip6_flowlabel *fl) { return fl->share == IPV6_FL_S_EXCL || fl->share == IPV6_FL_S_PROCESS || fl->share == IPV6_FL_S_USER; } static void fl_free_rcu(struct rcu_head *head) { struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); if (fl->share == IPV6_FL_S_PROCESS) put_pid(fl->owner.pid); kfree(fl->opt); kfree(fl); } static void fl_free(struct ip6_flowlabel *fl) { if (!fl) return; if (fl_shared_exclusive(fl) || fl->opt) static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive); call_rcu(&fl->rcu, fl_free_rcu); } static void fl_release(struct ip6_flowlabel *fl) { spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (atomic_dec_and_test(&fl->users)) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; if (fl->opt && fl->share == IPV6_FL_S_EXCL) { struct ipv6_txoptions *opt = fl->opt; fl->opt = NULL; kfree(opt); } if (!timer_pending(&ip6_fl_gc_timer) || time_after(ip6_fl_gc_timer.expires, ttd)) mod_timer(&ip6_fl_gc_timer, ttd); } spin_unlock_bh(&ip6_fl_lock); } static void ip6_fl_gc(struct timer_list *unused) { int i; unsigned long now = jiffies; unsigned long sched = 0; spin_lock(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; flp = &fl_ht[i]; while ((fl = rcu_dereference_protected(*flp, lockdep_is_held(&ip6_fl_lock))) != NULL) { if (atomic_read(&fl->users) == 0) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; if (time_after_eq(now, ttd)) { *flp = fl->next; fl_free(fl); atomic_dec(&fl_size); continue; } if (!sched || time_before(ttd, sched)) sched = ttd; } flp = &fl->next; } } if (!sched && atomic_read(&fl_size)) sched = now + FL_MAX_LINGER; if (sched) { mod_timer(&ip6_fl_gc_timer, sched); } spin_unlock(&ip6_fl_lock); } static void __net_exit ip6_fl_purge(struct net *net) { int i; spin_lock_bh(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; flp = &fl_ht[i]; while ((fl = rcu_dereference_protected(*flp, lockdep_is_held(&ip6_fl_lock))) != NULL) { if (net_eq(fl->fl_net, net) && atomic_read(&fl->users) == 0) { *flp = fl->next; fl_free(fl); atomic_dec(&fl_size); continue; } flp = &fl->next; } } spin_unlock_bh(&ip6_fl_lock); } static struct ip6_flowlabel *fl_intern(struct net *net, struct ip6_flowlabel *fl, __be32 label) { struct ip6_flowlabel *lfl; fl->label = label & IPV6_FLOWLABEL_MASK; rcu_read_lock(); spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK; if (fl->label) { lfl = __fl_lookup(net, fl->label); if (!lfl) break; } } } else { /* * we dropper the ip6_fl_lock, so this entry could reappear * and we need to recheck with it. * * OTOH no need to search the active socket first, like it is * done in ipv6_flowlabel_opt - sock is locked, so new entry * with the same label can only appear on another sock */ lfl = __fl_lookup(net, fl->label); if (lfl) { atomic_inc(&lfl->users); spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return lfl; } } fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); atomic_inc(&fl_size); spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return NULL; } /* Socket flowlabel lists */ struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; struct ipv6_pinfo *np = inet6_sk(sk); label &= IPV6_FLOWLABEL_MASK; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label && atomic_inc_not_zero(&fl->users)) { fl->lastuse = jiffies; rcu_read_unlock(); return fl; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; if (!rcu_access_pointer(np->ipv6_fl_list)) return; spin_lock_bh(&ip6_sk_fl_lock); while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { np->ipv6_fl_list = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); spin_lock_bh(&ip6_sk_fl_lock); } spin_unlock_bh(&ip6_sk_fl_lock); } /* Service routines */ /* It is the only difficult place. flowlabel enforces equal headers before and including routing header, however user may supply options following rthdr. */ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt) { struct ipv6_txoptions *fl_opt = fl->opt; if (!fopt || fopt->opt_flen == 0) return fl_opt; if (fl_opt) { opt_space->hopopt = fl_opt->hopopt; opt_space->dst0opt = fl_opt->dst0opt; opt_space->srcrt = fl_opt->srcrt; opt_space->opt_nflen = fl_opt->opt_nflen; } else { if (fopt->opt_nflen == 0) return fopt; opt_space->hopopt = NULL; opt_space->dst0opt = NULL; opt_space->srcrt = NULL; opt_space->opt_nflen = 0; } opt_space->dst1opt = fopt->dst1opt; opt_space->opt_flen = fopt->opt_flen; opt_space->tot_len = fopt->tot_len; return opt_space; } EXPORT_SYMBOL_GPL(fl6_merge_options); static unsigned long check_linger(unsigned long ttl) { if (ttl < FL_MIN_LINGER) return FL_MIN_LINGER*HZ; if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN)) return 0; return ttl*HZ; } static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires) { linger = check_linger(linger); if (!linger) return -EPERM; expires = check_linger(expires); if (!expires) return -EPERM; spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (time_before(fl->linger, linger)) fl->linger = linger; if (time_before(expires, fl->linger)) expires = fl->linger; if (time_before(fl->expires, fl->lastuse + expires)) fl->expires = fl->lastuse + expires; spin_unlock_bh(&ip6_fl_lock); return 0; } static struct ip6_flowlabel * fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, sockptr_t optval, int optlen, int *err_p) { struct ip6_flowlabel *fl = NULL; int olen; int addr_type; int err; olen = optlen - CMSG_ALIGN(sizeof(*freq)); err = -EINVAL; if (olen > 64 * 1024) goto done; err = -ENOMEM; fl = kzalloc(sizeof(*fl), GFP_KERNEL); if (!fl) goto done; if (olen > 0) { struct msghdr msg; struct flowi6 flowi6; struct ipcm6_cookie ipc6; err = -ENOMEM; fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); if (!fl->opt) goto done; memset(fl->opt, 0, sizeof(*fl->opt)); fl->opt->tot_len = sizeof(*fl->opt) + olen; err = -EFAULT; if (copy_from_sockptr_offset(fl->opt + 1, optval, CMSG_ALIGN(sizeof(*freq)), olen)) goto done; msg.msg_controllen = olen; msg.msg_control = (void *)(fl->opt+1); memset(&flowi6, 0, sizeof(flowi6)); ipc6.opt = fl->opt; err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6); if (err) goto done; err = -EINVAL; if (fl->opt->opt_flen) goto done; if (fl->opt->opt_nflen == 0) { kfree(fl->opt); fl->opt = NULL; } } fl->fl_net = net; fl->expires = jiffies; err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); if (err) goto done; fl->share = freq->flr_share; addr_type = ipv6_addr_type(&freq->flr_dst); if ((addr_type & IPV6_ADDR_MAPPED) || addr_type == IPV6_ADDR_ANY) { err = -EINVAL; goto done; } fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: break; case IPV6_FL_S_PROCESS: fl->owner.pid = get_task_pid(current, PIDTYPE_PID); break; case IPV6_FL_S_USER: fl->owner.uid = current_euid(); break; default: err = -EINVAL; goto done; } if (fl_shared_exclusive(fl) || fl->opt) { WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1); static_branch_deferred_inc(&ipv6_flowlabel_exclusive); } return fl; done: if (fl) { kfree(fl->opt); kfree(fl); } *err_p = err; return NULL; } static int mem_check(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; int room = FL_MAX_SIZE - atomic_read(&fl_size); int count = 0; if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) count++; rcu_read_unlock(); if (room <= 0 || ((count >= FL_MAX_PER_SOCK || (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && !capable(CAP_NET_ADMIN))) return -ENOBUFS; return 0; } static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, struct ip6_flowlabel *fl) { spin_lock_bh(&ip6_sk_fl_lock); sfl->fl = fl; sfl->next = np->ipv6_fl_list; rcu_assign_pointer(np->ipv6_fl_list, sfl); spin_unlock_bh(&ip6_sk_fl_lock); } int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; if (flags & IPV6_FL_F_REMOTE) { freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK; return 0; } if (inet6_test_bit(REPFLOW, sk)) { freq->flr_label = np->flow_label; return 0; } rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) { spin_lock_bh(&ip6_fl_lock); freq->flr_label = sfl->fl->label; freq->flr_dst = sfl->fl->dst; freq->flr_share = sfl->fl->share; freq->flr_expires = (sfl->fl->expires - jiffies) / HZ; freq->flr_linger = sfl->fl->linger / HZ; spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return 0; } } rcu_read_unlock(); return -ENOENT; } #define socklist_dereference(__sflp) \ rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock)) static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist __rcu **sflp; struct ipv6_fl_socklist *sfl; if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (!inet6_test_bit(REPFLOW, sk)) return -ESRCH; np->flow_label = 0; inet6_clear_bit(REPFLOW, sk); return 0; } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; (sfl = socklist_dereference(*sflp)) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq->flr_label) goto found; } spin_unlock_bh(&ip6_sk_fl_lock); return -ESRCH; found: if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; *sflp = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); return 0; } static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6_fl_socklist *sfl; int err; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq->flr_label) { err = fl6_renew(sfl->fl, freq->flr_linger, freq->flr_expires); rcu_read_unlock(); return err; } } rcu_read_unlock(); if (freq->flr_share == IPV6_FL_S_NONE && ns_capable(net->user_ns, CAP_NET_ADMIN)) { struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label); if (fl) { err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); fl_release(fl); return err; } } return -ESRCH; } static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, sockptr_t optval, int optlen) { struct ipv6_fl_socklist *sfl, *sfl1 = NULL; struct ip6_flowlabel *fl, *fl1 = NULL; struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int err; if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (net->ipv6.sysctl.flowlabel_consistency) { net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); return -EPERM; } if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; inet6_set_bit(REPFLOW, sk); return 0; } if (freq->flr_label & ~IPV6_FLOWLABEL_MASK) return -EINVAL; if (net->ipv6.sysctl.flowlabel_state_ranges && (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) return -ERANGE; fl = fl_create(net, sk, freq, optval, optlen, &err); if (!fl) return err; sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); if (freq->flr_label) { err = -EEXIST; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq->flr_label) { if (freq->flr_flags & IPV6_FL_F_EXCL) { rcu_read_unlock(); goto done; } fl1 = sfl->fl; if (!atomic_inc_not_zero(&fl1->users)) fl1 = NULL; break; } } rcu_read_unlock(); if (!fl1) fl1 = fl_lookup(net, freq->flr_label); if (fl1) { recheck: err = -EEXIST; if (freq->flr_flags&IPV6_FL_F_EXCL) goto release; err = -EPERM; if (fl1->share == IPV6_FL_S_EXCL || fl1->share != fl->share || ((fl1->share == IPV6_FL_S_PROCESS) && (fl1->owner.pid != fl->owner.pid)) || ((fl1->share == IPV6_FL_S_USER) && !uid_eq(fl1->owner.uid, fl->owner.uid))) goto release; err = -ENOMEM; if (!sfl1) goto release; if (fl->linger > fl1->linger) fl1->linger = fl->linger; if ((long)(fl->expires - fl1->expires) > 0) fl1->expires = fl->expires; fl_link(np, sfl1, fl1); fl_free(fl); return 0; release: fl_release(fl1); goto done; } } err = -ENOENT; if (!(freq->flr_flags & IPV6_FL_F_CREATE)) goto done; err = -ENOMEM; if (!sfl1) goto done; err = mem_check(sk); if (err != 0) goto done; fl1 = fl_intern(net, fl, freq->flr_label); if (fl1) goto recheck; if (!freq->flr_label) { size_t offset = offsetof(struct in6_flowlabel_req, flr_label); if (copy_to_sockptr_offset(optval, offset, &fl->label, sizeof(fl->label))) { /* Intentionally ignore fault. */ } } fl_link(np, sfl1, fl); return 0; done: fl_free(fl); kfree(sfl1); return err; } int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen) { struct in6_flowlabel_req freq; if (optlen < sizeof(freq)) return -EINVAL; if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; switch (freq.flr_action) { case IPV6_FL_A_PUT: return ipv6_flowlabel_put(sk, &freq); case IPV6_FL_A_RENEW: return ipv6_flowlabel_renew(sk, &freq); case IPV6_FL_A_GET: return ipv6_flowlabel_get(sk, &freq, optval, optlen); default: return -EINVAL; } } #ifdef CONFIG_PROC_FS struct ip6fl_iter_state { struct seq_net_private p; struct pid_namespace *pid_ns; int bucket; }; #define ip6fl_seq_private(seq) ((struct ip6fl_iter_state *)(seq)->private) static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) { struct ip6_flowlabel *fl = NULL; struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { for_each_fl_rcu(state->bucket, fl) { if (net_eq(fl->fl_net, net)) goto out; } } fl = NULL; out: return fl; } static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); for_each_fl_continue_rcu(fl) { if (net_eq(fl->fl_net, net)) goto out; } try_again: if (++state->bucket <= FL_HASH_MASK) { for_each_fl_rcu(state->bucket, fl) { if (net_eq(fl->fl_net, net)) goto out; } goto try_again; } fl = NULL; out: return fl; } static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) { struct ip6_flowlabel *fl = ip6fl_get_first(seq); if (fl) while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL) --pos; return pos ? NULL : fl; } static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb); rcu_read_lock(); return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip6_flowlabel *fl; if (v == SEQ_START_TOKEN) fl = ip6fl_get_first(seq); else fl = ip6fl_get_next(seq, v); ++*pos; return fl; } static void ip6fl_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip6fl_seq_show(struct seq_file *seq, void *v) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Label S Owner Users Linger Expires Dst Opt\n"); } else { struct ip6_flowlabel *fl = v; seq_printf(seq, "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n", (unsigned int)ntohl(fl->label), fl->share, ((fl->share == IPV6_FL_S_PROCESS) ? pid_nr_ns(fl->owner.pid, state->pid_ns) : ((fl->share == IPV6_FL_S_USER) ? from_kuid_munged(seq_user_ns(seq), fl->owner.uid) : 0)), atomic_read(&fl->users), fl->linger/HZ, (long)(fl->expires - jiffies)/HZ, &fl->dst, fl->opt ? fl->opt->opt_nflen : 0); } return 0; } static const struct seq_operations ip6fl_seq_ops = { .start = ip6fl_seq_start, .next = ip6fl_seq_next, .stop = ip6fl_seq_stop, .show = ip6fl_seq_show, }; static int __net_init ip6_flowlabel_proc_init(struct net *net) { if (!proc_create_net("ip6_flowlabel", 0444, net->proc_net, &ip6fl_seq_ops, sizeof(struct ip6fl_iter_state))) return -ENOMEM; return 0; } static void __net_exit ip6_flowlabel_proc_fini(struct net *net) { remove_proc_entry("ip6_flowlabel", net->proc_net); } #else static inline int ip6_flowlabel_proc_init(struct net *net) { return 0; } static inline void ip6_flowlabel_proc_fini(struct net *net) { } #endif static void __net_exit ip6_flowlabel_net_exit(struct net *net) { ip6_fl_purge(net); ip6_flowlabel_proc_fini(net); } static struct pernet_operations ip6_flowlabel_net_ops = { .init = ip6_flowlabel_proc_init, .exit = ip6_flowlabel_net_exit, }; int ip6_flowlabel_init(void) { return register_pernet_subsys(&ip6_flowlabel_net_ops); } void ip6_flowlabel_cleanup(void) { static_key_deferred_flush(&ipv6_flowlabel_exclusive); del_timer(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); } |
1 1 1 1 1 1 1 1 1 6 7 7 1 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Syncookies implementation for the Linux kernel * * Copyright (C) 1997 Andi Kleen * Based on ideas by D.J.Bernstein and Eric Schenk. */ #include <linux/tcp.h> #include <linux/siphash.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/secure_seq.h> #include <net/tcp.h> #include <net/route.h> static siphash_aligned_key_t syncookie_secret[2]; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) /* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK * stores TCP options: * * MSB LSB * | 31 ... 6 | 5 | 4 | 3 2 1 0 | * | Timestamp | ECN | SACK | WScale | * * When we receive a valid cookie-ACK, we look at the echoed tsval (if * any) to figure out which TCP options we should use for the rebuilt * connection. * * A WScale setting of '0xf' (which is an invalid scaling value) * means that original syn did not include the TCP window scaling option. */ #define TS_OPT_WSCALE_MASK 0xf #define TS_OPT_SACK BIT(4) #define TS_OPT_ECN BIT(5) /* There is no TS_OPT_TIMESTAMP: * if ACK contains timestamp option, we already know it was * requested/supported by the syn/synack exchange. */ #define TSBITS 6 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); return siphash_4u32((__force u32)saddr, (__force u32)daddr, (__force u32)sport << 16 | (__force u32)dport, count, &syncookie_secret[c]); } /* * when syncookies are in effect and tcp timestamps are enabled we encode * tcp options in the lower bits of the timestamp value that will be * sent in the syn-ack. * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. */ u64 cookie_init_timestamp(struct request_sock *req, u64 now) { const struct inet_request_sock *ireq = inet_rsk(req); u64 ts, ts_now = tcp_ns_to_ts(false, now); u32 options = 0; options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; if (ireq->sack_ok) options |= TS_OPT_SACK; if (ireq->ecn_ok) options |= TS_OPT_ECN; ts = (ts_now >> TSBITS) << TSBITS; ts |= options; if (ts > ts_now) ts -= (1UL << TSBITS); if (tcp_rsk(req)->req_usec_ts) return ts * NSEC_PER_USEC; return ts * NSEC_PER_MSEC; } static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, __u32 sseq, __u32 data) { /* * Compute the secure sequence number. * The output should be: * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24) * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24). * Where sseq is their sequence number and count increases every * minute by 1. * As an extra hack, we add a small "data" value that encodes the * MSS into the second hash value. */ u32 count = tcp_cookie_time(); return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK)); } /* * This retrieves the small "data" value from the syncookie. * If the syncookie is bad, the data returned will be out of * range. This must be checked by the caller. * * The count value used to generate the cookie must be less than * MAX_SYNCOOKIE_AGE minutes in the past. * The return value (__u32)-1 if this test fails. */ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, __be16 sport, __be16 dport, __u32 sseq) { u32 diff, count = tcp_cookie_time(); /* Strip away the layers from the cookie */ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; return (cookie - cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) & COOKIEMASK; /* Leaving the data behind */ } /* * MSS Values are chosen based on the 2011 paper * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson. * Values .. * .. lower than 536 are rare (< 0.2%) * .. between 537 and 1299 account for less than < 1.5% of observed values * .. in the 1300-1349 range account for about 15 to 20% of observed mss values * .. exceeding 1460 are very rare (< 0.04%) * * 1460 is the single most frequently announced mss value (30 to 46% depending * on monitor location). Table must be sorted. */ static __u16 const msstab[] = { 536, 1300, 1440, /* 1440, 1452: PPPoE */ 1460, }; /* * Generate a syncookie. mssp points to the mss, which is returned * rounded down to the value encoded in the cookie. */ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp) { int mssind; const __u16 mss = *mssp; for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), mssind); } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); return __cookie_v4_init_sequence(iph, th, mssp); } /* * Check if a ack sequence number is a valid syncookie. * Return the decoded mss if it is, or 0 if not. */ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th) { __u32 cookie = ntohl(th->ack_seq) - 1; __u32 seq = ntohl(th->seq) - 1; __u32 mssind; mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } EXPORT_SYMBOL_GPL(__cookie_v4_check); struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; bool own_req; child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, NULL, &own_req); if (child) { refcount_set(&req->rsk_refcnt, 1); sock_rps_save_rxhash(child, skb); if (rsk_drop_req(req)) { reqsk_put(req); return child; } if (inet_csk_reqsk_queue_add(sk, req, child)) return child; bh_unlock_sock(child); sock_put(child); } __reqsk_free(req); return NULL; } EXPORT_IPV6_MOD(tcp_get_cookie_sock); /* * when syncookies are in effect and tcp timestamps are enabled we stored * additional tcp options in the timestamp. * This extracts these options from the timestamp echo. * * return false if we decode a tcp option that is disabled * on the host. */ bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *tcp_opt) { /* echoed timestamp, lowest bits contain options */ u32 options = tcp_opt->rcv_tsecr; if (!tcp_opt->saw_tstamp) { tcp_clear_options(tcp_opt); return true; } if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) return false; tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0; if (tcp_opt->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) return false; if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) return true; /* no window scaling */ tcp_opt->wscale_ok = 1; tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0; } EXPORT_IPV6_MOD(cookie_timestamp_decode); static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); const struct tcphdr *th = tcp_hdr(skb); req->num_retrans = 0; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; ireq->ir_iif = inet_request_bound_dev_if(sk, skb); ireq->ir_mark = inet_request_mark(sk, skb); if (IS_ENABLED(CONFIG_SMC)) ireq->smc_ok = 0; treq->snt_synack = 0; treq->snt_tsval_first = 0; treq->tfo_listener = false; treq->txhash = net_tx_rndhash(); treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = ntohl(th->ack_seq) - 1; treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; treq->req_usec_ts = false; #if IS_ENABLED(CONFIG_MPTCP) treq->is_mptcp = sk_is_mptcp(sk); if (treq->is_mptcp) return mptcp_subflow_init_cookie_req(req, sk, skb); #endif return 0; } #if IS_ENABLED(CONFIG_BPF) struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb) { struct request_sock *req = inet_reqsk(skb->sk); skb->sk = NULL; skb->destructor = NULL; if (cookie_tcp_reqsk_init(sk, skb, req)) { reqsk_free(req); req = NULL; } return req; } EXPORT_IPV6_MOD_GPL(cookie_bpf_check); #endif struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb, struct tcp_options_received *tcp_opt, int mss, u32 tsoff) { struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct request_sock *req; if (sk_is_mptcp(sk)) req = mptcp_subflow_reqsk_alloc(ops, sk, false); else req = inet_reqsk_alloc(ops, sk, false); if (!req) return NULL; if (cookie_tcp_reqsk_init(sk, skb, req)) { reqsk_free(req); return NULL; } ireq = inet_rsk(req); treq = tcp_rsk(req); req->mss = mss; req->ts_recent = tcp_opt->saw_tstamp ? tcp_opt->rcv_tsval : 0; ireq->snd_wscale = tcp_opt->snd_wscale; ireq->tstamp_ok = tcp_opt->saw_tstamp; ireq->sack_ok = tcp_opt->sack_ok; ireq->wscale_ok = tcp_opt->wscale_ok; ireq->ecn_ok = !!(tcp_opt->rcv_tsecr & TS_OPT_ECN); treq->ts_off = tsoff; return req; } EXPORT_IPV6_MOD_GPL(cookie_tcp_reqsk_alloc); static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tcp_opt; u32 tsoff = 0; int mss; if (tcp_synq_no_recent_overflow(sk)) goto out; mss = __cookie_v4_check(ip_hdr(skb), tcp_hdr(skb)); if (!mss) { __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); goto out; } __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { tsoff = secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); tcp_opt.rcv_tsecr -= tsoff; } if (!cookie_timestamp_decode(net, &tcp_opt)) goto out; return cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb, &tcp_opt, mss, tsoff); out: return ERR_PTR(-EINVAL); } /* On input, sk is a listener. * Output is listener if incoming packet would not create a child * NULL if memory could not be allocated. */ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); struct request_sock *req; struct sock *ret = sk; struct flowi4 fl4; struct rtable *rt; __u8 rcv_wscale; int full_space; SKB_DR(reason); if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || !th->ack || th->rst) goto out; if (cookie_bpf_ok(skb)) { req = cookie_bpf_check(sk, skb); } else { req = cookie_tcp_check(net, sk, skb); if (IS_ERR(req)) goto out; } if (!req) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } ireq = inet_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); if (security_inet_conn_request(sk, skb, req)) { SKB_DR_SET(reason, SECURITY_HOOK); goto out_free; } tcp_ao_syncookie(sk, skb, req, AF_INET); /* * We need to lookup the route here to get at the correct * window size. We should better make sure that the window size * hasn't changed since we received the original syn, but I see * no easy way to do this. */ flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), IPPROTO_TCP, inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { SKB_DR_SET(reason, IP_OUTNOROUTES); goto out_free; } /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? : dst_metric(&rt->dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); /* req->syncookie is set true only if ACK is validated * by BPF kfunc, then, rcv_wscale is already configured. */ if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ if (!ret) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } inet_sk(ret)->cork.fl.u.ip4 = fl4; out: return ret; out_free: reqsk_free(req); out_drop: sk_skb_reason_drop(sk, skb, reason); return NULL; } |
3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <net/tcp_states.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> static void ax25_ds_timeout(struct timer_list *); /* * Add DAMA slave timeout timer to timer list. * Unlike the connection based timers the timeout function gets * triggered every second. Please note that NET_AX25_DAMA_SLAVE_TIMEOUT * (aka /proc/sys/net/ax25/{dev}/dama_slave_timeout) is still in * 1/10th of a second. */ void ax25_ds_setup_timer(ax25_dev *ax25_dev) { timer_setup(&ax25_dev->dama.slave_timer, ax25_ds_timeout, 0); } void ax25_ds_del_timer(ax25_dev *ax25_dev) { if (ax25_dev) del_timer(&ax25_dev->dama.slave_timer); } void ax25_ds_set_timer(ax25_dev *ax25_dev) { if (ax25_dev == NULL) /* paranoia */ return; ax25_dev->dama.slave_timeout = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_DS_TIMEOUT]) / 10; mod_timer(&ax25_dev->dama.slave_timer, jiffies + HZ); } /* * DAMA Slave Timeout * Silently discard all (slave) connections in case our master forgot us... */ static void ax25_ds_timeout(struct timer_list *t) { ax25_dev *ax25_dev = from_timer(ax25_dev, t, dama.slave_timer); ax25_cb *ax25; if (ax25_dev == NULL || !ax25_dev->dama.slave) return; /* Yikes! */ if (!ax25_dev->dama.slave_timeout || --ax25_dev->dama.slave_timeout) { ax25_ds_set_timer(ax25_dev); return; } spin_lock(&ax25_list_lock); ax25_for_each(ax25, &ax25_list) { if (ax25->ax25_dev != ax25_dev || !(ax25->condition & AX25_COND_DAMA_MODE)) continue; ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); ax25_disconnect(ax25, ETIMEDOUT); } spin_unlock(&ax25_list_lock); ax25_dev_dama_off(ax25_dev); } void ax25_ds_heartbeat_expiry(ax25_cb *ax25) { struct sock *sk=ax25->sk; if (sk) bh_lock_sock(sk); switch (ax25->state) { case AX25_STATE_0: case AX25_STATE_2: /* Magic here: If we listen() and a new link dies before it is accepted() it isn't 'dead' so doesn't get removed. */ if (!sk || sock_flag(sk, SOCK_DESTROY) || (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { if (sk) { sock_hold(sk); ax25_destroy_socket(ax25); bh_unlock_sock(sk); /* Ungrab socket and destroy it */ sock_put(sk); } else ax25_destroy_socket(ax25); return; } break; case AX25_STATE_3: /* * Check the state of the receive buffer. */ if (sk != NULL) { if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf >> 1) && (ax25->condition & AX25_COND_OWN_RX_BUSY)) { ax25->condition &= ~AX25_COND_OWN_RX_BUSY; ax25->condition &= ~AX25_COND_ACK_PENDING; break; } } break; } if (sk) bh_unlock_sock(sk); ax25_start_heartbeat(ax25); } /* dl1bke 960114: T3 works much like the IDLE timeout, but * gets reloaded with every frame for this * connection. */ void ax25_ds_t3timer_expiry(ax25_cb *ax25) { ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); ax25_dama_off(ax25); ax25_disconnect(ax25, ETIMEDOUT); } /* dl1bke 960228: close the connection when IDLE expires. * unlike T3 this timer gets reloaded only on * I frames. */ void ax25_ds_idletimer_expiry(ax25_cb *ax25) { ax25_clear_queues(ax25); ax25->n2count = 0; ax25->state = AX25_STATE_2; ax25_calculate_t1(ax25); ax25_start_t1timer(ax25); ax25_stop_t3timer(ax25); if (ax25->sk != NULL) { bh_lock_sock(ax25->sk); ax25->sk->sk_state = TCP_CLOSE; ax25->sk->sk_err = 0; ax25->sk->sk_shutdown |= SEND_SHUTDOWN; if (!sock_flag(ax25->sk, SOCK_DEAD)) { ax25->sk->sk_state_change(ax25->sk); sock_set_flag(ax25->sk, SOCK_DEAD); } bh_unlock_sock(ax25->sk); } } /* dl1bke 960114: The DAMA protocol requires to send data and SABM/DISC * within the poll of any connected channel. Remember * that we are not allowed to send anything unless we * get polled by the Master. * * Thus we'll have to do parts of our T1 handling in * ax25_enquiry_response(). */ void ax25_ds_t1_timeout(ax25_cb *ax25) { switch (ax25->state) { case AX25_STATE_1: if (ax25->n2count == ax25->n2) { if (ax25->modulus == AX25_MODULUS) { ax25_disconnect(ax25, ETIMEDOUT); return; } else { ax25->modulus = AX25_MODULUS; ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; ax25->n2count = 0; ax25_send_control(ax25, AX25_SABM, AX25_POLLOFF, AX25_COMMAND); } } else { ax25->n2count++; if (ax25->modulus == AX25_MODULUS) ax25_send_control(ax25, AX25_SABM, AX25_POLLOFF, AX25_COMMAND); else ax25_send_control(ax25, AX25_SABME, AX25_POLLOFF, AX25_COMMAND); } break; case AX25_STATE_2: if (ax25->n2count == ax25->n2) { ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); if (!sock_flag(ax25->sk, SOCK_DESTROY)) ax25_disconnect(ax25, ETIMEDOUT); return; } else { ax25->n2count++; } break; case AX25_STATE_3: if (ax25->n2count == ax25->n2) { ax25_send_control(ax25, AX25_DM, AX25_POLLON, AX25_RESPONSE); ax25_disconnect(ax25, ETIMEDOUT); return; } else { ax25->n2count++; } break; } ax25_calculate_t1(ax25); ax25_start_t1timer(ax25); } |
13 13 13 12 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | // SPDX-License-Identifier: GPL-2.0-or-later /* Key permission checking * * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/security.h> #include "internal.h" /** * key_task_permission - Check a key can be used * @key_ref: The key to check. * @cred: The credentials to use. * @need_perm: The permission required. * * Check to see whether permission is granted to use a key in the desired way, * but permit the security modules to override. * * The caller must hold either a ref on cred or must hold the RCU readlock. * * Returns 0 if successful, -EACCES if access is denied based on the * permissions bits or the LSM check. */ int key_task_permission(const key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { struct key *key; key_perm_t kperm, mask; int ret; switch (need_perm) { default: WARN_ON(1); return -EACCES; case KEY_NEED_UNLINK: case KEY_SYSADMIN_OVERRIDE: case KEY_AUTHTOKEN_OVERRIDE: case KEY_DEFER_PERM_CHECK: goto lsm; case KEY_NEED_VIEW: mask = KEY_OTH_VIEW; break; case KEY_NEED_READ: mask = KEY_OTH_READ; break; case KEY_NEED_WRITE: mask = KEY_OTH_WRITE; break; case KEY_NEED_SEARCH: mask = KEY_OTH_SEARCH; break; case KEY_NEED_LINK: mask = KEY_OTH_LINK; break; case KEY_NEED_SETATTR: mask = KEY_OTH_SETATTR; break; } key = key_ref_to_ptr(key_ref); /* use the second 8-bits of permissions for keys the caller owns */ if (uid_eq(key->uid, cred->fsuid)) { kperm = key->perm >> 16; goto use_these_perms; } /* use the third 8-bits of permissions for keys the caller has a group * membership in common with */ if (gid_valid(key->gid) && key->perm & KEY_GRP_ALL) { if (gid_eq(key->gid, cred->fsgid)) { kperm = key->perm >> 8; goto use_these_perms; } ret = groups_search(cred->group_info, key->gid); if (ret) { kperm = key->perm >> 8; goto use_these_perms; } } /* otherwise use the least-significant 8-bits */ kperm = key->perm; use_these_perms: /* use the top 8-bits of permissions for keys the caller possesses * - possessor permissions are additive with other permissions */ if (is_key_possessed(key_ref)) kperm |= key->perm >> 24; if ((kperm & mask) != mask) return -EACCES; /* let LSM be the final arbiter */ lsm: return security_key_permission(key_ref, cred, need_perm); } EXPORT_SYMBOL(key_task_permission); /** * key_validate - Validate a key. * @key: The key to be validated. * * Check that a key is valid, returning 0 if the key is okay, -ENOKEY if the * key is invalidated, -EKEYREVOKED if the key's type has been removed or if * the key has been revoked or -EKEYEXPIRED if the key has expired. */ int key_validate(const struct key *key) { unsigned long flags = READ_ONCE(key->flags); time64_t expiry = READ_ONCE(key->expiry); if (flags & (1 << KEY_FLAG_INVALIDATED)) return -ENOKEY; /* check it's still accessible */ if (flags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD))) return -EKEYREVOKED; /* check it hasn't expired */ if (expiry) { if (ktime_get_real_seconds() >= expiry) return -EKEYEXPIRED; } return 0; } EXPORT_SYMBOL(key_validate); |
4 4 4 2 4 4 4 4 2 6 6 2 1 4 4 1 2 2 2 2 2 7 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 | // SPDX-License-Identifier: GPL-2.0-or-later /* * drivers/net/team/team_mode_loadbalance.c - Load-balancing mode for team * Copyright (c) 2012 Jiri Pirko <jpirko@redhat.com> */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/filter.h> #include <linux/if_team.h> static rx_handler_result_t lb_receive(struct team *team, struct team_port *port, struct sk_buff *skb) { if (unlikely(skb->protocol == htons(ETH_P_SLOW))) { /* LACPDU packets should go to exact delivery */ const unsigned char *dest = eth_hdr(skb)->h_dest; if (is_link_local_ether_addr(dest) && dest[5] == 0x02) return RX_HANDLER_EXACT; } return RX_HANDLER_ANOTHER; } struct lb_priv; typedef struct team_port *lb_select_tx_port_func_t(struct team *, unsigned char); #define LB_TX_HASHTABLE_SIZE 256 /* hash is a char */ struct lb_stats { u64 tx_bytes; }; struct lb_pcpu_stats { struct lb_stats hash_stats[LB_TX_HASHTABLE_SIZE]; struct u64_stats_sync syncp; }; struct lb_stats_info { struct lb_stats stats; struct lb_stats last_stats; struct team_option_inst_info *opt_inst_info; }; struct lb_port_mapping { struct team_port __rcu *port; struct team_option_inst_info *opt_inst_info; }; struct lb_priv_ex { struct team *team; struct lb_port_mapping tx_hash_to_port_mapping[LB_TX_HASHTABLE_SIZE]; struct sock_fprog_kern *orig_fprog; struct { unsigned int refresh_interval; /* in tenths of second */ struct delayed_work refresh_dw; struct lb_stats_info info[LB_TX_HASHTABLE_SIZE]; } stats; }; struct lb_priv { struct bpf_prog __rcu *fp; lb_select_tx_port_func_t __rcu *select_tx_port_func; struct lb_pcpu_stats __percpu *pcpu_stats; struct lb_priv_ex *ex; /* priv extension */ }; static struct lb_priv *get_lb_priv(struct team *team) { return (struct lb_priv *) &team->mode_priv; } struct lb_port_priv { struct lb_stats __percpu *pcpu_stats; struct lb_stats_info stats_info; }; static struct lb_port_priv *get_lb_port_priv(struct team_port *port) { return (struct lb_port_priv *) &port->mode_priv; } #define LB_HTPM_PORT_BY_HASH(lp_priv, hash) \ (lb_priv)->ex->tx_hash_to_port_mapping[hash].port #define LB_HTPM_OPT_INST_INFO_BY_HASH(lp_priv, hash) \ (lb_priv)->ex->tx_hash_to_port_mapping[hash].opt_inst_info static void lb_tx_hash_to_port_mapping_null_port(struct team *team, struct team_port *port) { struct lb_priv *lb_priv = get_lb_priv(team); bool changed = false; int i; for (i = 0; i < LB_TX_HASHTABLE_SIZE; i++) { struct lb_port_mapping *pm; pm = &lb_priv->ex->tx_hash_to_port_mapping[i]; if (rcu_access_pointer(pm->port) == port) { RCU_INIT_POINTER(pm->port, NULL); team_option_inst_set_change(pm->opt_inst_info); changed = true; } } if (changed) team_options_change_check(team); } /* Basic tx selection based solely by hash */ static struct team_port *lb_hash_select_tx_port(struct team *team, unsigned char hash) { int port_index = team_num_to_port_index(team, hash); return team_get_port_by_index_rcu(team, port_index); } /* Hash to port mapping select tx port */ static struct team_port *lb_htpm_select_tx_port(struct team *team, unsigned char hash) { struct lb_priv *lb_priv = get_lb_priv(team); struct team_port *port; port = rcu_dereference_bh(LB_HTPM_PORT_BY_HASH(lb_priv, hash)); if (likely(port)) return port; /* If no valid port in the table, fall back to simple hash */ return lb_hash_select_tx_port(team, hash); } struct lb_select_tx_port { char *name; lb_select_tx_port_func_t *func; }; static const struct lb_select_tx_port lb_select_tx_port_list[] = { { .name = "hash", .func = lb_hash_select_tx_port, }, { .name = "hash_to_port_mapping", .func = lb_htpm_select_tx_port, }, }; #define LB_SELECT_TX_PORT_LIST_COUNT ARRAY_SIZE(lb_select_tx_port_list) static char *lb_select_tx_port_get_name(lb_select_tx_port_func_t *func) { int i; for (i = 0; i < LB_SELECT_TX_PORT_LIST_COUNT; i++) { const struct lb_select_tx_port *item; item = &lb_select_tx_port_list[i]; if (item->func == func) return item->name; } return NULL; } static lb_select_tx_port_func_t *lb_select_tx_port_get_func(const char *name) { int i; for (i = 0; i < LB_SELECT_TX_PORT_LIST_COUNT; i++) { const struct lb_select_tx_port *item; item = &lb_select_tx_port_list[i]; if (!strcmp(item->name, name)) return item->func; } return NULL; } static unsigned int lb_get_skb_hash(struct lb_priv *lb_priv, struct sk_buff *skb) { struct bpf_prog *fp; uint32_t lhash; unsigned char *c; fp = rcu_dereference_bh(lb_priv->fp); if (unlikely(!fp)) return 0; lhash = bpf_prog_run(fp, skb); c = (char *) &lhash; return c[0] ^ c[1] ^ c[2] ^ c[3]; } static void lb_update_tx_stats(unsigned int tx_bytes, struct lb_priv *lb_priv, struct lb_port_priv *lb_port_priv, unsigned char hash) { struct lb_pcpu_stats *pcpu_stats; struct lb_stats *port_stats; struct lb_stats *hash_stats; pcpu_stats = this_cpu_ptr(lb_priv->pcpu_stats); port_stats = this_cpu_ptr(lb_port_priv->pcpu_stats); hash_stats = &pcpu_stats->hash_stats[hash]; u64_stats_update_begin(&pcpu_stats->syncp); port_stats->tx_bytes += tx_bytes; hash_stats->tx_bytes += tx_bytes; u64_stats_update_end(&pcpu_stats->syncp); } static bool lb_transmit(struct team *team, struct sk_buff *skb) { struct lb_priv *lb_priv = get_lb_priv(team); lb_select_tx_port_func_t *select_tx_port_func; struct team_port *port; unsigned char hash; unsigned int tx_bytes = skb->len; hash = lb_get_skb_hash(lb_priv, skb); select_tx_port_func = rcu_dereference_bh(lb_priv->select_tx_port_func); port = select_tx_port_func(team, hash); if (unlikely(!port)) goto drop; if (team_dev_queue_xmit(team, port, skb)) return false; lb_update_tx_stats(tx_bytes, lb_priv, get_lb_port_priv(port), hash); return true; drop: dev_kfree_skb_any(skb); return false; } static void lb_bpf_func_get(struct team *team, struct team_gsetter_ctx *ctx) { struct lb_priv *lb_priv = get_lb_priv(team); if (!lb_priv->ex->orig_fprog) { ctx->data.bin_val.len = 0; ctx->data.bin_val.ptr = NULL; return; } ctx->data.bin_val.len = lb_priv->ex->orig_fprog->len * sizeof(struct sock_filter); ctx->data.bin_val.ptr = lb_priv->ex->orig_fprog->filter; } static int __fprog_create(struct sock_fprog_kern **pfprog, u32 data_len, const void *data) { struct sock_fprog_kern *fprog; struct sock_filter *filter = (struct sock_filter *) data; if (data_len % sizeof(struct sock_filter)) return -EINVAL; fprog = kmalloc(sizeof(*fprog), GFP_KERNEL); if (!fprog) return -ENOMEM; fprog->filter = kmemdup(filter, data_len, GFP_KERNEL); if (!fprog->filter) { kfree(fprog); return -ENOMEM; } fprog->len = data_len / sizeof(struct sock_filter); *pfprog = fprog; return 0; } static void __fprog_destroy(struct sock_fprog_kern *fprog) { kfree(fprog->filter); kfree(fprog); } static int lb_bpf_func_set(struct team *team, struct team_gsetter_ctx *ctx) { struct lb_priv *lb_priv = get_lb_priv(team); struct bpf_prog *fp = NULL; struct bpf_prog *orig_fp = NULL; struct sock_fprog_kern *fprog = NULL; int err; if (ctx->data.bin_val.len) { err = __fprog_create(&fprog, ctx->data.bin_val.len, ctx->data.bin_val.ptr); if (err) return err; err = bpf_prog_create(&fp, fprog); if (err) { __fprog_destroy(fprog); return err; } } if (lb_priv->ex->orig_fprog) { /* Clear old filter data */ __fprog_destroy(lb_priv->ex->orig_fprog); orig_fp = rcu_dereference_protected(lb_priv->fp, lockdep_is_held(&team->lock)); } rcu_assign_pointer(lb_priv->fp, fp); lb_priv->ex->orig_fprog = fprog; if (orig_fp) { synchronize_rcu(); bpf_prog_destroy(orig_fp); } return 0; } static void lb_bpf_func_free(struct team *team) { struct lb_priv *lb_priv = get_lb_priv(team); struct bpf_prog *fp; if (!lb_priv->ex->orig_fprog) return; __fprog_destroy(lb_priv->ex->orig_fprog); fp = rcu_dereference_protected(lb_priv->fp, lockdep_is_held(&team->lock)); bpf_prog_destroy(fp); } static void lb_tx_method_get(struct team *team, struct team_gsetter_ctx *ctx) { struct lb_priv *lb_priv = get_lb_priv(team); lb_select_tx_port_func_t *func; char *name; func = rcu_dereference_protected(lb_priv->select_tx_port_func, lockdep_is_held(&team->lock)); name = lb_select_tx_port_get_name(func); BUG_ON(!name); ctx->data.str_val = name; } static int lb_tx_method_set(struct team *team, struct team_gsetter_ctx *ctx) { struct lb_priv *lb_priv = get_lb_priv(team); lb_select_tx_port_func_t *func; func = lb_select_tx_port_get_func(ctx->data.str_val); if (!func) return -EINVAL; rcu_assign_pointer(lb_priv->select_tx_port_func, func); return 0; } static void lb_tx_hash_to_port_mapping_init(struct team *team, struct team_option_inst_info *info) { struct lb_priv *lb_priv = get_lb_priv(team); unsigned char hash = info->array_index; LB_HTPM_OPT_INST_INFO_BY_HASH(lb_priv, hash) = info; } static void lb_tx_hash_to_port_mapping_get(struct team *team, struct team_gsetter_ctx *ctx) { struct lb_priv *lb_priv = get_lb_priv(team); struct team_port *port; unsigned char hash = ctx->info->array_index; port = LB_HTPM_PORT_BY_HASH(lb_priv, hash); ctx->data.u32_val = port ? port->dev->ifindex : 0; } static int lb_tx_hash_to_port_mapping_set(struct team *team, struct team_gsetter_ctx *ctx) { struct lb_priv *lb_priv = get_lb_priv(team); struct team_port *port; unsigned char hash = ctx->info->array_index; list_for_each_entry(port, &team->port_list, list) { if (ctx->data.u32_val == port->dev->ifindex && team_port_enabled(port)) { rcu_assign_pointer(LB_HTPM_PORT_BY_HASH(lb_priv, hash), port); return 0; } } return -ENODEV; } static void lb_hash_stats_init(struct team *team, struct team_option_inst_info *info) { struct lb_priv *lb_priv = get_lb_priv(team); unsigned char hash = info->array_index; lb_priv->ex->stats.info[hash].opt_inst_info = info; } static void lb_hash_stats_get(struct team *team, struct team_gsetter_ctx *ctx) { struct lb_priv *lb_priv = get_lb_priv(team); unsigned char hash = ctx->info->array_index; ctx->data.bin_val.ptr = &lb_priv->ex->stats.info[hash].stats; ctx->data.bin_val.len = sizeof(struct lb_stats); } static void lb_port_stats_init(struct team *team, struct team_option_inst_info *info) { struct team_port *port = info->port; struct lb_port_priv *lb_port_priv = get_lb_port_priv(port); lb_port_priv->stats_info.opt_inst_info = info; } static void lb_port_stats_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; struct lb_port_priv *lb_port_priv = get_lb_port_priv(port); ctx->data.bin_val.ptr = &lb_port_priv->stats_info.stats; ctx->data.bin_val.len = sizeof(struct lb_stats); } static void __lb_stats_info_refresh_prepare(struct lb_stats_info *s_info) { memcpy(&s_info->last_stats, &s_info->stats, sizeof(struct lb_stats)); memset(&s_info->stats, 0, sizeof(struct lb_stats)); } static bool __lb_stats_info_refresh_check(struct lb_stats_info *s_info, struct team *team) { if (memcmp(&s_info->last_stats, &s_info->stats, sizeof(struct lb_stats))) { team_option_inst_set_change(s_info->opt_inst_info); return true; } return false; } static void __lb_one_cpu_stats_add(struct lb_stats *acc_stats, struct lb_stats *cpu_stats, struct u64_stats_sync *syncp) { unsigned int start; struct lb_stats tmp; do { start = u64_stats_fetch_begin(syncp); tmp.tx_bytes = cpu_stats->tx_bytes; } while (u64_stats_fetch_retry(syncp, start)); acc_stats->tx_bytes += tmp.tx_bytes; } static void lb_stats_refresh(struct work_struct *work) { struct team *team; struct lb_priv *lb_priv; struct lb_priv_ex *lb_priv_ex; struct lb_pcpu_stats *pcpu_stats; struct lb_stats *stats; struct lb_stats_info *s_info; struct team_port *port; bool changed = false; int i; int j; lb_priv_ex = container_of(work, struct lb_priv_ex, stats.refresh_dw.work); team = lb_priv_ex->team; lb_priv = get_lb_priv(team); if (!mutex_trylock(&team->lock)) { schedule_delayed_work(&lb_priv_ex->stats.refresh_dw, 0); return; } for (j = 0; j < LB_TX_HASHTABLE_SIZE; j++) { s_info = &lb_priv->ex->stats.info[j]; __lb_stats_info_refresh_prepare(s_info); for_each_possible_cpu(i) { pcpu_stats = per_cpu_ptr(lb_priv->pcpu_stats, i); stats = &pcpu_stats->hash_stats[j]; __lb_one_cpu_stats_add(&s_info->stats, stats, &pcpu_stats->syncp); } changed |= __lb_stats_info_refresh_check(s_info, team); } list_for_each_entry(port, &team->port_list, list) { struct lb_port_priv *lb_port_priv = get_lb_port_priv(port); s_info = &lb_port_priv->stats_info; __lb_stats_info_refresh_prepare(s_info); for_each_possible_cpu(i) { pcpu_stats = per_cpu_ptr(lb_priv->pcpu_stats, i); stats = per_cpu_ptr(lb_port_priv->pcpu_stats, i); __lb_one_cpu_stats_add(&s_info->stats, stats, &pcpu_stats->syncp); } changed |= __lb_stats_info_refresh_check(s_info, team); } if (changed) team_options_change_check(team); schedule_delayed_work(&lb_priv_ex->stats.refresh_dw, (lb_priv_ex->stats.refresh_interval * HZ) / 10); mutex_unlock(&team->lock); } static void lb_stats_refresh_interval_get(struct team *team, struct team_gsetter_ctx *ctx) { struct lb_priv *lb_priv = get_lb_priv(team); ctx->data.u32_val = lb_priv->ex->stats.refresh_interval; } static int lb_stats_refresh_interval_set(struct team *team, struct team_gsetter_ctx *ctx) { struct lb_priv *lb_priv = get_lb_priv(team); unsigned int interval; interval = ctx->data.u32_val; if (lb_priv->ex->stats.refresh_interval == interval) return 0; lb_priv->ex->stats.refresh_interval = interval; if (interval) schedule_delayed_work(&lb_priv->ex->stats.refresh_dw, 0); else cancel_delayed_work(&lb_priv->ex->stats.refresh_dw); return 0; } static const struct team_option lb_options[] = { { .name = "bpf_hash_func", .type = TEAM_OPTION_TYPE_BINARY, .getter = lb_bpf_func_get, .setter = lb_bpf_func_set, }, { .name = "lb_tx_method", .type = TEAM_OPTION_TYPE_STRING, .getter = lb_tx_method_get, .setter = lb_tx_method_set, }, { .name = "lb_tx_hash_to_port_mapping", .array_size = LB_TX_HASHTABLE_SIZE, .type = TEAM_OPTION_TYPE_U32, .init = lb_tx_hash_to_port_mapping_init, .getter = lb_tx_hash_to_port_mapping_get, .setter = lb_tx_hash_to_port_mapping_set, }, { .name = "lb_hash_stats", .array_size = LB_TX_HASHTABLE_SIZE, .type = TEAM_OPTION_TYPE_BINARY, .init = lb_hash_stats_init, .getter = lb_hash_stats_get, }, { .name = "lb_port_stats", .per_port = true, .type = TEAM_OPTION_TYPE_BINARY, .init = lb_port_stats_init, .getter = lb_port_stats_get, }, { .name = "lb_stats_refresh_interval", .type = TEAM_OPTION_TYPE_U32, .getter = lb_stats_refresh_interval_get, .setter = lb_stats_refresh_interval_set, }, }; static int lb_init(struct team *team) { struct lb_priv *lb_priv = get_lb_priv(team); lb_select_tx_port_func_t *func; int i, err; /* set default tx port selector */ func = lb_select_tx_port_get_func("hash"); BUG_ON(!func); rcu_assign_pointer(lb_priv->select_tx_port_func, func); lb_priv->ex = kzalloc(sizeof(*lb_priv->ex), GFP_KERNEL); if (!lb_priv->ex) return -ENOMEM; lb_priv->ex->team = team; lb_priv->pcpu_stats = alloc_percpu(struct lb_pcpu_stats); if (!lb_priv->pcpu_stats) { err = -ENOMEM; goto err_alloc_pcpu_stats; } for_each_possible_cpu(i) { struct lb_pcpu_stats *team_lb_stats; team_lb_stats = per_cpu_ptr(lb_priv->pcpu_stats, i); u64_stats_init(&team_lb_stats->syncp); } INIT_DELAYED_WORK(&lb_priv->ex->stats.refresh_dw, lb_stats_refresh); err = team_options_register(team, lb_options, ARRAY_SIZE(lb_options)); if (err) goto err_options_register; return 0; err_options_register: free_percpu(lb_priv->pcpu_stats); err_alloc_pcpu_stats: kfree(lb_priv->ex); return err; } static void lb_exit(struct team *team) { struct lb_priv *lb_priv = get_lb_priv(team); team_options_unregister(team, lb_options, ARRAY_SIZE(lb_options)); lb_bpf_func_free(team); cancel_delayed_work_sync(&lb_priv->ex->stats.refresh_dw); free_percpu(lb_priv->pcpu_stats); kfree(lb_priv->ex); } static int lb_port_enter(struct team *team, struct team_port *port) { struct lb_port_priv *lb_port_priv = get_lb_port_priv(port); lb_port_priv->pcpu_stats = alloc_percpu(struct lb_stats); if (!lb_port_priv->pcpu_stats) return -ENOMEM; return 0; } static void lb_port_leave(struct team *team, struct team_port *port) { struct lb_port_priv *lb_port_priv = get_lb_port_priv(port); free_percpu(lb_port_priv->pcpu_stats); } static void lb_port_disabled(struct team *team, struct team_port *port) { lb_tx_hash_to_port_mapping_null_port(team, port); } static const struct team_mode_ops lb_mode_ops = { .init = lb_init, .exit = lb_exit, .port_enter = lb_port_enter, .port_leave = lb_port_leave, .port_disabled = lb_port_disabled, .receive = lb_receive, .transmit = lb_transmit, }; static const struct team_mode lb_mode = { .kind = "loadbalance", .owner = THIS_MODULE, .priv_size = sizeof(struct lb_priv), .port_priv_size = sizeof(struct lb_port_priv), .ops = &lb_mode_ops, .lag_tx_type = NETDEV_LAG_TX_TYPE_HASH, }; static int __init lb_init_module(void) { return team_mode_register(&lb_mode); } static void __exit lb_cleanup_module(void) { team_mode_unregister(&lb_mode); } module_init(lb_init_module); module_exit(lb_cleanup_module); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Jiri Pirko <jpirko@redhat.com>"); MODULE_DESCRIPTION("Load-balancing mode for team"); MODULE_ALIAS_TEAM_MODE("loadbalance"); |
6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Force feedback support for Linux input subsystem * * Copyright (c) 2006 Anssi Hannula <anssi.hannula@gmail.com> * Copyright (c) 2006 Dmitry Torokhov <dtor@mail.ru> */ /* #define DEBUG */ #include <linux/input.h> #include <linux/limits.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/overflow.h> #include <linux/sched.h> #include <linux/slab.h> /* * Check that the effect_id is a valid effect and whether the user * is the owner */ static int check_effect_access(struct ff_device *ff, int effect_id, struct file *file) { if (effect_id < 0 || effect_id >= ff->max_effects || !ff->effect_owners[effect_id]) return -EINVAL; if (file && ff->effect_owners[effect_id] != file) return -EACCES; return 0; } /* * Checks whether 2 effects can be combined together */ static inline int check_effects_compatible(struct ff_effect *e1, struct ff_effect *e2) { return e1->type == e2->type && (e1->type != FF_PERIODIC || e1->u.periodic.waveform == e2->u.periodic.waveform); } /* * Convert an effect into compatible one */ static int compat_effect(struct ff_device *ff, struct ff_effect *effect) { int magnitude; switch (effect->type) { case FF_RUMBLE: if (!test_bit(FF_PERIODIC, ff->ffbit)) return -EINVAL; /* * calculate magnitude of sine wave as average of rumble's * 2/3 of strong magnitude and 1/3 of weak magnitude */ magnitude = effect->u.rumble.strong_magnitude / 3 + effect->u.rumble.weak_magnitude / 6; effect->type = FF_PERIODIC; effect->u.periodic.waveform = FF_SINE; effect->u.periodic.period = 50; effect->u.periodic.magnitude = magnitude; effect->u.periodic.offset = 0; effect->u.periodic.phase = 0; effect->u.periodic.envelope.attack_length = 0; effect->u.periodic.envelope.attack_level = 0; effect->u.periodic.envelope.fade_length = 0; effect->u.periodic.envelope.fade_level = 0; return 0; default: /* Let driver handle conversion */ return 0; } } /** * input_ff_upload() - upload effect into force-feedback device * @dev: input device * @effect: effect to be uploaded * @file: owner of the effect */ int input_ff_upload(struct input_dev *dev, struct ff_effect *effect, struct file *file) { struct ff_device *ff = dev->ff; struct ff_effect *old; int error; int id; if (!test_bit(EV_FF, dev->evbit)) return -ENOSYS; if (effect->type < FF_EFFECT_MIN || effect->type > FF_EFFECT_MAX || !test_bit(effect->type, dev->ffbit)) { dev_dbg(&dev->dev, "invalid or not supported effect type in upload\n"); return -EINVAL; } if (effect->type == FF_PERIODIC && (effect->u.periodic.waveform < FF_WAVEFORM_MIN || effect->u.periodic.waveform > FF_WAVEFORM_MAX || !test_bit(effect->u.periodic.waveform, dev->ffbit))) { dev_dbg(&dev->dev, "invalid or not supported wave form in upload\n"); return -EINVAL; } if (!test_bit(effect->type, ff->ffbit)) { error = compat_effect(ff, effect); if (error) return error; } guard(mutex)(&ff->mutex); if (effect->id == -1) { for (id = 0; id < ff->max_effects; id++) if (!ff->effect_owners[id]) break; if (id >= ff->max_effects) return -ENOSPC; effect->id = id; old = NULL; } else { id = effect->id; error = check_effect_access(ff, id, file); if (error) return error; old = &ff->effects[id]; if (!check_effects_compatible(effect, old)) return -EINVAL; } error = ff->upload(dev, effect, old); if (error) return error; scoped_guard(spinlock_irq, &dev->event_lock) { ff->effects[id] = *effect; ff->effect_owners[id] = file; } return 0; } EXPORT_SYMBOL_GPL(input_ff_upload); /* * Erases the effect if the requester is also the effect owner. The mutex * should already be locked before calling this function. */ static int erase_effect(struct input_dev *dev, int effect_id, struct file *file) { struct ff_device *ff = dev->ff; int error; error = check_effect_access(ff, effect_id, file); if (error) return error; scoped_guard(spinlock_irq, &dev->event_lock) { ff->playback(dev, effect_id, 0); ff->effect_owners[effect_id] = NULL; } if (ff->erase) { error = ff->erase(dev, effect_id); if (error) { scoped_guard(spinlock_irq, &dev->event_lock) ff->effect_owners[effect_id] = file; return error; } } return 0; } /** * input_ff_erase - erase a force-feedback effect from device * @dev: input device to erase effect from * @effect_id: id of the effect to be erased * @file: purported owner of the request * * This function erases a force-feedback effect from specified device. * The effect will only be erased if it was uploaded through the same * file handle that is requesting erase. */ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file) { struct ff_device *ff = dev->ff; if (!test_bit(EV_FF, dev->evbit)) return -ENOSYS; guard(mutex)(&ff->mutex); return erase_effect(dev, effect_id, file); } EXPORT_SYMBOL_GPL(input_ff_erase); /* * input_ff_flush - erase all effects owned by a file handle * @dev: input device to erase effect from * @file: purported owner of the effects * * This function erases all force-feedback effects associated with * the given owner from specified device. Note that @file may be %NULL, * in which case all effects will be erased. */ int input_ff_flush(struct input_dev *dev, struct file *file) { struct ff_device *ff = dev->ff; int i; dev_dbg(&dev->dev, "flushing now\n"); guard(mutex)(&ff->mutex); for (i = 0; i < ff->max_effects; i++) erase_effect(dev, i, file); return 0; } EXPORT_SYMBOL_GPL(input_ff_flush); /** * input_ff_event() - generic handler for force-feedback events * @dev: input device to send the effect to * @type: event type (anything but EV_FF is ignored) * @code: event code * @value: event value */ int input_ff_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { struct ff_device *ff = dev->ff; if (type != EV_FF) return 0; switch (code) { case FF_GAIN: if (!test_bit(FF_GAIN, dev->ffbit) || value > 0xffffU) break; ff->set_gain(dev, value); break; case FF_AUTOCENTER: if (!test_bit(FF_AUTOCENTER, dev->ffbit) || value > 0xffffU) break; ff->set_autocenter(dev, value); break; default: if (check_effect_access(ff, code, NULL) == 0) ff->playback(dev, code, value); break; } return 0; } EXPORT_SYMBOL_GPL(input_ff_event); /** * input_ff_create() - create force-feedback device * @dev: input device supporting force-feedback * @max_effects: maximum number of effects supported by the device * * This function allocates all necessary memory for a force feedback * portion of an input device and installs all default handlers. * @dev->ffbit should be already set up before calling this function. * Once ff device is created you need to setup its upload, erase, * playback and other handlers before registering input device */ int input_ff_create(struct input_dev *dev, unsigned int max_effects) { int i; if (!max_effects) { dev_err(&dev->dev, "cannot allocate device without any effects\n"); return -EINVAL; } if (max_effects > FF_MAX_EFFECTS) { dev_err(&dev->dev, "cannot allocate more than FF_MAX_EFFECTS effects\n"); return -EINVAL; } struct ff_device *ff __free(kfree) = kzalloc(struct_size(ff, effect_owners, max_effects), GFP_KERNEL); if (!ff) return -ENOMEM; ff->effects = kcalloc(max_effects, sizeof(*ff->effects), GFP_KERNEL); if (!ff->effects) return -ENOMEM; ff->max_effects = max_effects; mutex_init(&ff->mutex); dev->flush = input_ff_flush; dev->event = input_ff_event; __set_bit(EV_FF, dev->evbit); /* Copy "true" bits into ff device bitmap */ for_each_set_bit(i, dev->ffbit, FF_CNT) __set_bit(i, ff->ffbit); /* we can emulate RUMBLE with periodic effects */ if (test_bit(FF_PERIODIC, ff->ffbit)) __set_bit(FF_RUMBLE, dev->ffbit); dev->ff = no_free_ptr(ff); return 0; } EXPORT_SYMBOL_GPL(input_ff_create); /** * input_ff_destroy() - frees force feedback portion of input device * @dev: input device supporting force feedback * * This function is only needed in error path as input core will * automatically free force feedback structures when device is * destroyed. */ void input_ff_destroy(struct input_dev *dev) { struct ff_device *ff = dev->ff; __clear_bit(EV_FF, dev->evbit); if (ff) { if (ff->destroy) ff->destroy(ff); kfree(ff->private); kfree(ff->effects); kfree(ff); dev->ff = NULL; } } EXPORT_SYMBOL_GPL(input_ff_destroy); |
20 525 20 20 20 132 132 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | #include <linux/notifier.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/fib_notifier.h> #include <net/netns/ipv6.h> #include <net/ip6_fib.h> int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; return call_fib_notifier(nb, event_type, info); } int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; return call_fib_notifiers(net, event_type, info); } static unsigned int fib6_seq_read(const struct net *net) { return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } static int fib6_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; err = fib6_rules_dump(net, nb, extack); if (err) return err; return fib6_tables_dump(net, nb, extack); } static const struct fib_notifier_ops fib6_notifier_ops_template = { .family = AF_INET6, .fib_seq_read = fib6_seq_read, .fib_dump = fib6_dump, .owner = THIS_MODULE, }; int __net_init fib6_notifier_init(struct net *net) { struct fib_notifier_ops *ops; ops = fib_notifier_ops_register(&fib6_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv6.notifier_ops = ops; return 0; } void __net_exit fib6_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv6.notifier_ops); } |
875 876 879 6735 6725 600 6396 2 6731 6723 3864 3869 3874 6331 2 6333 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 | // SPDX-License-Identifier: GPL-2.0 /* * mm/pgtable-generic.c * * Generic pgtable methods declared in linux/pgtable.h * * Copyright (C) 2010 Linus Torvalds */ #include <linux/pagemap.h> #include <linux/hugetlb.h> #include <linux/pgtable.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mm_inline.h> #include <asm/pgalloc.h> #include <asm/tlb.h> /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but * very seldom) called out from the p?d_none_or_clear_bad macros. */ void pgd_clear_bad(pgd_t *pgd) { pgd_ERROR(*pgd); pgd_clear(pgd); } #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *p4d) { p4d_ERROR(*p4d); p4d_clear(p4d); } #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); pud_clear(pud); } #endif /* * Note that the pmd variant below can't be stub'ed out just as for p4d/pud * above. pmd folding is special and typically pmd_* macros refer to upper * level even when folded */ void pmd_clear_bad(pmd_t *pmd) { pmd_ERROR(*pmd); pmd_clear(pmd); } #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* * Only sets the access flags (dirty, accessed), as well as write * permission. Furthermore, we know it always gets set to a "more * permissive" setting, which allows most architectures to optimize * this. We return whether the PTE actually changed, which in turn * instructs the caller to do things like update__mmu_cache. This * used to be done in the caller, but sparc needs minor faults to * force that call on sun4c so we changed this macro slightly */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(ptep_get(ptep), entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); flush_tlb_fix_spurious_fault(vma, address, ptep); } return changed; } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { int young; young = ptep_test_and_clear_young(vma, address, ptep); if (young) flush_tlb_page(vma, address); return young; } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { struct mm_struct *mm = (vma)->vm_mm; pte_t pte; pte = ptep_get_and_clear(mm, address, ptep); if (pte_accessible(mm, pte)) flush_tlb_page(vma, address); return pte; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed) { set_pmd_at(vma->vm_mm, address, pmdp, entry); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } return changed; } #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { int young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp) { pud_t pud; VM_BUG_ON(address & ~HPAGE_PUD_MASK); VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp)); pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return pud; } #endif #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(&pgtable->lru); else list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); pmd_huge_pte(mm, pmdp) = pgtable; } #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW /* no "address" argument so destroys page coloring of some arch */ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ pgtable = pmd_huge_pte(mm, pmdp); pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru, struct page, lru); if (pmd_huge_pte(mm, pmdp)) list_del(&pgtable->lru); return pgtable; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return old; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); return pmdp_invalidate(vma, address, pmdp); } #endif #ifndef pmdp_collapse_flush pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { /* * pmd and hugepage pte format are same. So we could * use the same function. */ pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_trans_huge(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); /* collapse entails shooting down ptes not pmd */ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #endif /* arch define pte_free_defer in asm/pgalloc.h for its own implementation */ #ifndef pte_free_defer static void pte_free_now(struct rcu_head *head) { struct page *page; page = container_of(head, struct page, rcu_head); pte_free(NULL /* mm not passed and not used */, (pgtable_t)page); } void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) { struct page *page; page = pgtable; call_rcu(&page->rcu_head, pte_free_now); } #endif /* pte_free_defer */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \ (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU)) /* * See the comment above ptep_get_lockless() in include/linux/pgtable.h: * the barriers in pmdp_get_lockless() cannot guarantee that the value in * pmd_high actually belongs with the value in pmd_low; but holding interrupts * off blocks the TLB flush between present updates, which guarantees that a * successful __pte_offset_map() points to a page from matched halves. */ static unsigned long pmdp_get_lockless_start(void) { unsigned long irqflags; local_irq_save(irqflags); return irqflags; } static void pmdp_get_lockless_end(unsigned long irqflags) { local_irq_restore(irqflags); } #else static unsigned long pmdp_get_lockless_start(void) { return 0; } static void pmdp_get_lockless_end(unsigned long irqflags) { } #endif pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { unsigned long irqflags; pmd_t pmdval; rcu_read_lock(); irqflags = pmdp_get_lockless_start(); pmdval = pmdp_get_lockless(pmd); pmdp_get_lockless_end(irqflags); if (pmdvalp) *pmdvalp = pmdval; if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) goto nomap; if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval))) goto nomap; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); goto nomap; } return __pte_map(&pmdval, addr); nomap: rcu_read_unlock(); return NULL; } pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { pmd_t pmdval; pte_t *pte; pte = __pte_offset_map(pmd, addr, &pmdval); if (likely(pte)) *ptlp = pte_lockptr(mm, &pmdval); return pte; } pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp, spinlock_t **ptlp) { pte_t *pte; VM_WARN_ON_ONCE(!pmdvalp); pte = __pte_offset_map(pmd, addr, pmdvalp); if (likely(pte)) *ptlp = pte_lockptr(mm, pmdvalp); return pte; } /* * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation * __pte_offset_map_lock() below, is usually called with the pmd pointer for * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while * holding mmap_lock or vma lock for read or for write; or in truncate or rmap * context, while holding file's i_mmap_lock or anon_vma lock for read (or for * write). In a few cases, it may be used with pmd pointing to a pmd_t already * copied to or constructed on the stack. * * When successful, it returns the pte pointer for addr, with its page table * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent * modification by software, with a pointer to that spinlock in ptlp (in some * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's * struct page). pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards. * * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no * page table at *pmd: if, for example, the page table has just been removed, * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked * after acquiring the ptlock, and retried internally if it changed: so that a * page table can be safely removed or replaced by THP while holding its lock.) * * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above, * just returns the pte pointer for addr, its page table kmapped if necessary; * or NULL if there is no page table at *pmd. It does not attempt to lock the * page table, so cannot normally be used when the page table is to be updated, * or when entries read must be stable. But it does take rcu_read_lock(): so * that even when page table is racily removed, it remains a valid though empty * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s * afterwards. * * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); * but when successful, it also outputs a pointer to the spinlock in ptlp - as * pte_offset_map_lock() does, but in this case without locking it. This helps * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock * pointer for the page table that it returns. Even after grabbing the spinlock, * we might be looking either at a page table that is still mapped or one that * was unmapped and is about to get freed. But for R/O access this is sufficient. * So it is only applicable for read-only cases where any modification operations * to the page table are not allowed even if the corresponding spinlock is held * afterwards. * * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like * pte_offset_map_ro_nolock(); but when successful, it also outputs the pdmval. * It is applicable for may-write cases where any modification operations to the * page table may happen after the corresponding spinlock is held afterwards. * But the users should make sure the page table is stable like checking pte_same() * or checking pmd_same() by using the output pmdval before performing the write * operations. * * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will * be read-only/read-write protected. * * Note that free_pgtables(), used after unmapping detached vmas, or when * exiting the whole mm, does not take page table lock before freeing a page * table, and may not use RCU at all: "outsiders" like khugepaged should avoid * pte_offset_map() and co once the vma is detached from mm or mm_users is zero. */ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { spinlock_t *ptl; pmd_t pmdval; pte_t *pte; again: pte = __pte_offset_map(pmd, addr, &pmdval); if (unlikely(!pte)) return pte; ptl = pte_lockptr(mm, &pmdval); spin_lock(ptl); if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) { *ptlp = ptl; return pte; } pte_unmap_unlock(pte, ptl); goto again; } |
4 4 1 1 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/phy.h> #include <linux/ethtool_netlink.h> #include "netlink.h" #include "common.h" struct plca_req_info { struct ethnl_req_info base; }; struct plca_reply_data { struct ethnl_reply_data base; struct phy_plca_cfg plca_cfg; struct phy_plca_status plca_st; }; // Helpers ------------------------------------------------------------------ // #define PLCA_REPDATA(__reply_base) \ container_of(__reply_base, struct plca_reply_data, base) // PLCA get configuration message ------------------------------------------- // const struct nla_policy ethnl_plca_get_cfg_policy[] = { [ETHTOOL_A_PLCA_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), }; static void plca_update_sint(int *dst, struct nlattr **tb, u32 attrid, bool *mod) { const struct nlattr *attr = tb[attrid]; if (!attr || WARN_ON_ONCE(attrid >= ARRAY_SIZE(ethnl_plca_set_cfg_policy))) return; switch (ethnl_plca_set_cfg_policy[attrid].type) { case NLA_U8: *dst = nla_get_u8(attr); break; case NLA_U32: *dst = nla_get_u32(attr); break; default: WARN_ON_ONCE(1); } *mod = true; } static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct plca_reply_data *data = PLCA_REPDATA(reply_base); struct net_device *dev = reply_base->dev; const struct ethtool_phy_ops *ops; struct nlattr **tb = info->attrs; struct phy_device *phydev; int ret; phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PLCA_HEADER], info->extack); // check that the PHY device is available and connected if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; goto out; } // note: rtnl_lock is held already by ethnl_default_doit ops = ethtool_phy_ops; if (!ops || !ops->get_plca_cfg) { ret = -EOPNOTSUPP; goto out; } ret = ethnl_ops_begin(dev); if (ret < 0) goto out; memset(&data->plca_cfg, 0xff, sizeof_field(struct plca_reply_data, plca_cfg)); ret = ops->get_plca_cfg(phydev, &data->plca_cfg); ethnl_ops_complete(dev); out: return ret; } static int plca_get_cfg_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u16)) + /* _VERSION */ nla_total_size(sizeof(u8)) + /* _ENABLED */ nla_total_size(sizeof(u32)) + /* _NODE_CNT */ nla_total_size(sizeof(u32)) + /* _NODE_ID */ nla_total_size(sizeof(u32)) + /* _TO_TIMER */ nla_total_size(sizeof(u32)) + /* _BURST_COUNT */ nla_total_size(sizeof(u32)); /* _BURST_TIMER */ } static int plca_get_cfg_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct plca_reply_data *data = PLCA_REPDATA(reply_base); const struct phy_plca_cfg *plca = &data->plca_cfg; if ((plca->version >= 0 && nla_put_u16(skb, ETHTOOL_A_PLCA_VERSION, plca->version)) || (plca->enabled >= 0 && nla_put_u8(skb, ETHTOOL_A_PLCA_ENABLED, !!plca->enabled)) || (plca->node_id >= 0 && nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_ID, plca->node_id)) || (plca->node_cnt >= 0 && nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_CNT, plca->node_cnt)) || (plca->to_tmr >= 0 && nla_put_u32(skb, ETHTOOL_A_PLCA_TO_TMR, plca->to_tmr)) || (plca->burst_cnt >= 0 && nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_CNT, plca->burst_cnt)) || (plca->burst_tmr >= 0 && nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_TMR, plca->burst_tmr))) return -EMSGSIZE; return 0; }; // PLCA set configuration message ------------------------------------------- // const struct nla_policy ethnl_plca_set_cfg_policy[] = { [ETHTOOL_A_PLCA_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), [ETHTOOL_A_PLCA_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_PLCA_NODE_ID] = NLA_POLICY_MAX(NLA_U32, 255), [ETHTOOL_A_PLCA_NODE_CNT] = NLA_POLICY_RANGE(NLA_U32, 1, 255), [ETHTOOL_A_PLCA_TO_TMR] = NLA_POLICY_MAX(NLA_U32, 255), [ETHTOOL_A_PLCA_BURST_CNT] = NLA_POLICY_MAX(NLA_U32, 255), [ETHTOOL_A_PLCA_BURST_TMR] = NLA_POLICY_MAX(NLA_U32, 255), }; static int ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_phy_ops *ops; struct nlattr **tb = info->attrs; struct phy_plca_cfg plca_cfg; struct phy_device *phydev; bool mod = false; int ret; phydev = ethnl_req_get_phydev(req_info, tb[ETHTOOL_A_PLCA_HEADER], info->extack); // check that the PHY device is available and connected if (IS_ERR_OR_NULL(phydev)) return -EOPNOTSUPP; ops = ethtool_phy_ops; if (!ops || !ops->set_plca_cfg) return -EOPNOTSUPP; memset(&plca_cfg, 0xff, sizeof(plca_cfg)); plca_update_sint(&plca_cfg.enabled, tb, ETHTOOL_A_PLCA_ENABLED, &mod); plca_update_sint(&plca_cfg.node_id, tb, ETHTOOL_A_PLCA_NODE_ID, &mod); plca_update_sint(&plca_cfg.node_cnt, tb, ETHTOOL_A_PLCA_NODE_CNT, &mod); plca_update_sint(&plca_cfg.to_tmr, tb, ETHTOOL_A_PLCA_TO_TMR, &mod); plca_update_sint(&plca_cfg.burst_cnt, tb, ETHTOOL_A_PLCA_BURST_CNT, &mod); plca_update_sint(&plca_cfg.burst_tmr, tb, ETHTOOL_A_PLCA_BURST_TMR, &mod); if (!mod) return 0; ret = ops->set_plca_cfg(phydev, &plca_cfg, info->extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_plca_cfg_request_ops = { .request_cmd = ETHTOOL_MSG_PLCA_GET_CFG, .reply_cmd = ETHTOOL_MSG_PLCA_GET_CFG_REPLY, .hdr_attr = ETHTOOL_A_PLCA_HEADER, .req_info_size = sizeof(struct plca_req_info), .reply_data_size = sizeof(struct plca_reply_data), .prepare_data = plca_get_cfg_prepare_data, .reply_size = plca_get_cfg_reply_size, .fill_reply = plca_get_cfg_fill_reply, .set = ethnl_set_plca, .set_ntf_cmd = ETHTOOL_MSG_PLCA_NTF, }; // PLCA get status message -------------------------------------------------- // const struct nla_policy ethnl_plca_get_status_policy[] = { [ETHTOOL_A_PLCA_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), }; static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct plca_reply_data *data = PLCA_REPDATA(reply_base); struct net_device *dev = reply_base->dev; const struct ethtool_phy_ops *ops; struct nlattr **tb = info->attrs; struct phy_device *phydev; int ret; phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PLCA_HEADER], info->extack); // check that the PHY device is available and connected if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; goto out; } // note: rtnl_lock is held already by ethnl_default_doit ops = ethtool_phy_ops; if (!ops || !ops->get_plca_status) { ret = -EOPNOTSUPP; goto out; } ret = ethnl_ops_begin(dev); if (ret < 0) goto out; memset(&data->plca_st, 0xff, sizeof_field(struct plca_reply_data, plca_st)); ret = ops->get_plca_status(phydev, &data->plca_st); ethnl_ops_complete(dev); out: return ret; } static int plca_get_status_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u8)); /* _STATUS */ } static int plca_get_status_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct plca_reply_data *data = PLCA_REPDATA(reply_base); const u8 status = data->plca_st.pst; if (nla_put_u8(skb, ETHTOOL_A_PLCA_STATUS, !!status)) return -EMSGSIZE; return 0; }; const struct ethnl_request_ops ethnl_plca_status_request_ops = { .request_cmd = ETHTOOL_MSG_PLCA_GET_STATUS, .reply_cmd = ETHTOOL_MSG_PLCA_GET_STATUS_REPLY, .hdr_attr = ETHTOOL_A_PLCA_HEADER, .req_info_size = sizeof(struct plca_req_info), .reply_data_size = sizeof(struct plca_reply_data), .prepare_data = plca_get_status_prepare_data, .reply_size = plca_get_status_reply_size, .fill_reply = plca_get_status_fill_reply, }; |
324 227 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __VDSO_MATH64_H #define __VDSO_MATH64_H static __always_inline u32 __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) { u32 ret = 0; while (dividend >= divisor) { /* The following asm() prevents the compiler from optimising this loop into a modulo operation. */ asm("" : "+rm"(dividend)); dividend -= divisor; ret++; } *remainder = dividend; return ret; } #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_add_u64_shr static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift) { return (u64)((((unsigned __int128)a * mul) + b) >> shift); } #endif /* mul_u64_u32_add_u64_shr */ #else #ifndef mul_u64_u32_add_u64_shr #ifndef mul_u32_u32 static inline u64 mul_u32_u32(u32 a, u32 b) { return (u64)a * b; } #define mul_u32_u32 mul_u32_u32 #endif static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift) { u32 ah = a >> 32, al = a; bool ovf; u64 ret; ovf = __builtin_add_overflow(mul_u32_u32(al, mul), b, &ret); ret >>= shift; if (ovf && shift) ret += 1ULL << (64 - shift); if (ah) ret += mul_u32_u32(ah, mul) << (32 - shift); return ret; } #endif /* mul_u64_u32_add_u64_shr */ #endif #endif /* __VDSO_MATH64_H */ |
12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 13 13 1 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Fredy Neeser */ /* Greg Joyce <greg@opengridcomputing.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ /* Copyright (c) 2017, Open Grid Computing, Inc. */ #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <linux/workqueue.h> #include <net/sock.h> #include <net/tcp.h> #include <linux/inet.h> #include <linux/tcp.h> #include <trace/events/sock.h> #include <rdma/iw_cm.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include "siw.h" #include "siw_cm.h" /* * Set to any combination of * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR */ static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR; static const bool relaxed_ird_negotiation = true; static void siw_cm_llp_state_change(struct sock *s); static void siw_cm_llp_data_ready(struct sock *s); static void siw_cm_llp_write_space(struct sock *s); static void siw_cm_llp_error_report(struct sock *s); static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, int status); static void siw_sk_assign_cm_upcalls(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); write_lock_bh(&sk->sk_callback_lock); cep->sk_state_change = sk->sk_state_change; cep->sk_data_ready = sk->sk_data_ready; cep->sk_write_space = sk->sk_write_space; cep->sk_error_report = sk->sk_error_report; sk->sk_state_change = siw_cm_llp_state_change; sk->sk_data_ready = siw_cm_llp_data_ready; sk->sk_write_space = siw_cm_llp_write_space; sk->sk_error_report = siw_cm_llp_error_report; write_unlock_bh(&sk->sk_callback_lock); } static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep) { sk->sk_state_change = cep->sk_state_change; sk->sk_data_ready = cep->sk_data_ready; sk->sk_write_space = cep->sk_write_space; sk->sk_error_report = cep->sk_error_report; sk->sk_user_data = NULL; } static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp) { struct socket *s = cep->sock; struct sock *sk = s->sk; write_lock_bh(&sk->sk_callback_lock); qp->attrs.sk = s; sk->sk_data_ready = siw_qp_llp_data_ready; sk->sk_write_space = siw_qp_llp_write_space; write_unlock_bh(&sk->sk_callback_lock); } static void siw_socket_disassoc(struct socket *s) { struct sock *sk = s->sk; struct siw_cep *cep; if (sk) { write_lock_bh(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (cep) { siw_sk_restore_upcalls(sk, cep); siw_cep_put(cep); } else { pr_warn("siw: cannot restore sk callbacks: no ep\n"); } write_unlock_bh(&sk->sk_callback_lock); } else { pr_warn("siw: cannot restore sk callbacks: no sk\n"); } } static void siw_rtr_data_ready(struct sock *sk) { struct siw_cep *cep; struct siw_qp *qp = NULL; read_descriptor_t rd_desc; trace_sk_data_ready(sk); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) { WARN(1, "No connection endpoint\n"); goto out; } qp = sk_to_qp(sk); memset(&rd_desc, 0, sizeof(rd_desc)); rd_desc.arg.data = qp; rd_desc.count = 1; tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); /* * Check if first frame was successfully processed. * Signal connection full establishment if yes. * Failed data processing would have already scheduled * connection drop. */ if (!qp->rx_stream.rx_suspend) siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); out: read_unlock(&sk->sk_callback_lock); if (qp) siw_qp_socket_assoc(cep, qp); } static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep) { struct sock *sk = cep->sock->sk; write_lock_bh(&sk->sk_callback_lock); sk->sk_data_ready = siw_rtr_data_ready; sk->sk_write_space = siw_qp_llp_write_space; write_unlock_bh(&sk->sk_callback_lock); } static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s) { cep->sock = s; siw_cep_get(cep); s->sk->sk_user_data = cep; siw_sk_assign_cm_upcalls(s->sk); } static struct siw_cep *siw_cep_alloc(struct siw_device *sdev) { struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL); unsigned long flags; if (!cep) return NULL; INIT_LIST_HEAD(&cep->listenq); INIT_LIST_HEAD(&cep->devq); INIT_LIST_HEAD(&cep->work_freelist); kref_init(&cep->ref); cep->state = SIW_EPSTATE_IDLE; init_waitqueue_head(&cep->waitq); spin_lock_init(&cep->lock); cep->sdev = sdev; cep->enhanced_rdma_conn_est = false; spin_lock_irqsave(&sdev->lock, flags); list_add_tail(&cep->devq, &sdev->cep_list); spin_unlock_irqrestore(&sdev->lock, flags); siw_dbg_cep(cep, "new endpoint\n"); return cep; } static void siw_cm_free_work(struct siw_cep *cep) { struct list_head *w, *tmp; struct siw_cm_work *work; list_for_each_safe(w, tmp, &cep->work_freelist) { work = list_entry(w, struct siw_cm_work, list); list_del(&work->list); kfree(work); } } static void siw_cancel_mpatimer(struct siw_cep *cep) { spin_lock_bh(&cep->lock); if (cep->mpa_timer) { if (cancel_delayed_work(&cep->mpa_timer->work)) { siw_cep_put(cep); kfree(cep->mpa_timer); /* not needed again */ } cep->mpa_timer = NULL; } spin_unlock_bh(&cep->lock); } static void siw_put_work(struct siw_cm_work *work) { INIT_LIST_HEAD(&work->list); spin_lock_bh(&work->cep->lock); list_add(&work->list, &work->cep->work_freelist); spin_unlock_bh(&work->cep->lock); } static void siw_cep_set_inuse(struct siw_cep *cep) { unsigned long flags; retry: spin_lock_irqsave(&cep->lock, flags); if (cep->in_use) { spin_unlock_irqrestore(&cep->lock, flags); wait_event_interruptible(cep->waitq, !cep->in_use); if (signal_pending(current)) flush_signals(current); goto retry; } else { cep->in_use = 1; spin_unlock_irqrestore(&cep->lock, flags); } } static void siw_cep_set_free(struct siw_cep *cep) { unsigned long flags; spin_lock_irqsave(&cep->lock, flags); cep->in_use = 0; spin_unlock_irqrestore(&cep->lock, flags); wake_up(&cep->waitq); } static void __siw_cep_dealloc(struct kref *ref) { struct siw_cep *cep = container_of(ref, struct siw_cep, ref); struct siw_device *sdev = cep->sdev; unsigned long flags; WARN_ON(cep->listen_cep); /* kfree(NULL) is safe */ kfree(cep->mpa.pdata); spin_lock_bh(&cep->lock); if (!list_empty(&cep->work_freelist)) siw_cm_free_work(cep); spin_unlock_bh(&cep->lock); spin_lock_irqsave(&sdev->lock, flags); list_del(&cep->devq); spin_unlock_irqrestore(&sdev->lock, flags); siw_dbg_cep(cep, "free endpoint\n"); kfree(cep); } static struct siw_cm_work *siw_get_work(struct siw_cep *cep) { struct siw_cm_work *work = NULL; spin_lock_bh(&cep->lock); if (!list_empty(&cep->work_freelist)) { work = list_entry(cep->work_freelist.next, struct siw_cm_work, list); list_del_init(&work->list); } spin_unlock_bh(&cep->lock); return work; } static int siw_cm_alloc_work(struct siw_cep *cep, int num) { struct siw_cm_work *work; while (num--) { work = kmalloc(sizeof(*work), GFP_KERNEL); if (!work) { if (!(list_empty(&cep->work_freelist))) siw_cm_free_work(cep); return -ENOMEM; } work->cep = cep; INIT_LIST_HEAD(&work->list); list_add(&work->list, &cep->work_freelist); } return 0; } /* * siw_cm_upcall() * * Upcall to IWCM to inform about async connection events */ static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, int status) { struct iw_cm_event event; struct iw_cm_id *id; memset(&event, 0, sizeof(event)); event.status = status; event.event = reason; if (reason == IW_CM_EVENT_CONNECT_REQUEST) { event.provider_data = cep; id = cep->listen_cep->cm_id; } else { id = cep->cm_id; } /* Signal IRD and ORD */ if (reason == IW_CM_EVENT_ESTABLISHED || reason == IW_CM_EVENT_CONNECT_REPLY) { /* Signal negotiated IRD/ORD values we will use */ event.ird = cep->ird; event.ord = cep->ord; } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) { event.ird = cep->ord; event.ord = cep->ird; } /* Signal private data and address information */ if (reason == IW_CM_EVENT_CONNECT_REQUEST || reason == IW_CM_EVENT_CONNECT_REPLY) { u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); if (pd_len) { /* * hand over MPA private data */ event.private_data_len = pd_len; event.private_data = cep->mpa.pdata; /* Hide MPA V2 IRD/ORD control */ if (cep->enhanced_rdma_conn_est) { event.private_data_len -= sizeof(struct mpa_v2_data); event.private_data += sizeof(struct mpa_v2_data); } } getname_local(cep->sock, &event.local_addr); getname_peer(cep->sock, &event.remote_addr); } siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n", cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status); return id->event_handler(id, &event); } static void siw_free_cm_id(struct siw_cep *cep) { if (!cep->cm_id) return; cep->cm_id->rem_ref(cep->cm_id); cep->cm_id = NULL; } static void siw_destroy_cep_sock(struct siw_cep *cep) { if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } } /* * siw_qp_cm_drop() * * Drops established LLP connection if present and not already * scheduled for dropping. Called from user context, SQ workqueue * or receive IRQ. Caller signals if socket can be immediately * closed (basically, if not in IRQ). */ void siw_qp_cm_drop(struct siw_qp *qp, int schedule) { struct siw_cep *cep = qp->cep; qp->rx_stream.rx_suspend = 1; qp->tx_ctx.tx_suspend = 1; if (!qp->cep) return; if (schedule) { siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP); } else { siw_cep_set_inuse(cep); if (cep->state == SIW_EPSTATE_CLOSED) { siw_dbg_cep(cep, "already closed\n"); goto out; } siw_dbg_cep(cep, "immediate close, state %d\n", cep->state); siw_send_terminate(qp); if (cep->cm_id) { switch (cep->state) { case SIW_EPSTATE_AWAIT_MPAREP: siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); break; case SIW_EPSTATE_RDMA_MODE: siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); break; case SIW_EPSTATE_IDLE: case SIW_EPSTATE_LISTENING: case SIW_EPSTATE_CONNECTING: case SIW_EPSTATE_AWAIT_MPAREQ: case SIW_EPSTATE_RECVD_MPAREQ: case SIW_EPSTATE_CLOSED: default: break; } siw_free_cm_id(cep); siw_cep_put(cep); } cep->state = SIW_EPSTATE_CLOSED; siw_destroy_cep_sock(cep); if (cep->qp) { cep->qp = NULL; siw_qp_put(qp); } out: siw_cep_set_free(cep); } } void siw_cep_put(struct siw_cep *cep) { WARN_ON(kref_read(&cep->ref) < 1); kref_put(&cep->ref, __siw_cep_dealloc); } static void siw_cep_set_free_and_put(struct siw_cep *cep) { siw_cep_set_free(cep); siw_cep_put(cep); } void siw_cep_get(struct siw_cep *cep) { kref_get(&cep->ref); } /* * Expects params->pd_len in host byte order */ static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len) { struct socket *s = cep->sock; struct mpa_rr *rr = &cep->mpa.hdr; struct kvec iov[3]; struct msghdr msg; int rv; int iovec_num = 0; int mpa_len; memset(&msg, 0, sizeof(msg)); iov[iovec_num].iov_base = rr; iov[iovec_num].iov_len = sizeof(*rr); mpa_len = sizeof(*rr); if (cep->enhanced_rdma_conn_est) { iovec_num++; iov[iovec_num].iov_base = &cep->mpa.v2_ctrl; iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl); mpa_len += sizeof(cep->mpa.v2_ctrl); } if (pd_len) { iovec_num++; iov[iovec_num].iov_base = (char *)pdata; iov[iovec_num].iov_len = pd_len; mpa_len += pd_len; } if (cep->enhanced_rdma_conn_est) pd_len += sizeof(cep->mpa.v2_ctrl); rr->params.pd_len = cpu_to_be16(pd_len); rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len); return rv < 0 ? rv : 0; } /* * Receive MPA Request/Reply header. * * Returns 0 if complete MPA Request/Reply header including * eventual private data was received. Returns -EAGAIN if * header was partially received or negative error code otherwise. * * Context: May be called in process context only */ static int siw_recv_mpa_rr(struct siw_cep *cep) { struct mpa_rr *hdr = &cep->mpa.hdr; struct socket *s = cep->sock; u16 pd_len; int rcvd, to_rcv; if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd, sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd, 0); if (rcvd <= 0) return -ECONNABORTED; cep->mpa.bytes_rcvd += rcvd; if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) return -EAGAIN; if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA) return -EPROTO; } pd_len = be16_to_cpu(hdr->params.pd_len); /* * At least the MPA Request/Reply header (frame not including * private data) has been received. * Receive (or continue receiving) any private data. */ to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr)); if (!to_rcv) { /* * We must have hdr->params.pd_len == 0 and thus received a * complete MPA Request/Reply frame. * Check against peer protocol violation. */ u32 word; rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT); if (rcvd == -EAGAIN) return 0; if (rcvd == 0) { siw_dbg_cep(cep, "peer EOF\n"); return -EPIPE; } if (rcvd < 0) { siw_dbg_cep(cep, "error: %d\n", rcvd); return rcvd; } siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd); return -EPROTO; } /* * At this point, we must have hdr->params.pd_len != 0. * A private data buffer gets allocated if hdr->params.pd_len != 0. */ if (!cep->mpa.pdata) { cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); if (!cep->mpa.pdata) return -ENOMEM; } rcvd = ksock_recv( s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), to_rcv + 4, MSG_DONTWAIT); if (rcvd < 0) return rcvd; if (rcvd > to_rcv) return -EPROTO; cep->mpa.bytes_rcvd += rcvd; if (to_rcv == rcvd) { siw_dbg_cep(cep, "%d bytes private data received\n", pd_len); return 0; } return -EAGAIN; } /* * siw_proc_mpareq() * * Read MPA Request from socket and signal new connection to IWCM * if success. Caller must hold lock on corresponding listening CEP. */ static int siw_proc_mpareq(struct siw_cep *cep) { struct mpa_rr *req; int version, rv; u16 pd_len; rv = siw_recv_mpa_rr(cep); if (rv) return rv; req = &cep->mpa.hdr; version = __mpa_rr_revision(req->params.bits); pd_len = be16_to_cpu(req->params.pd_len); if (version > MPA_REVISION_2) /* allow for 0, 1, and 2 only */ return -EPROTO; if (memcmp(req->key, MPA_KEY_REQ, 16)) return -EPROTO; /* Prepare for sending MPA reply */ memcpy(req->key, MPA_KEY_REP, 16); if (version == MPA_REVISION_2 && (req->params.bits & MPA_RR_FLAG_ENHANCED)) { /* * MPA version 2 must signal IRD/ORD values and P2P mode * in private data if header flag MPA_RR_FLAG_ENHANCED * is set. */ if (pd_len < sizeof(struct mpa_v2_data)) goto reject_conn; cep->enhanced_rdma_conn_est = true; } /* MPA Markers: currently not supported. Marker TX to be added. */ if (req->params.bits & MPA_RR_FLAG_MARKERS) goto reject_conn; if (req->params.bits & MPA_RR_FLAG_CRC) { /* * RFC 5044, page 27: CRC MUST be used if peer requests it. * siw specific: 'mpa_crc_strict' parameter to reject * connection with CRC if local CRC off enforced by * 'mpa_crc_strict' module parameter. */ if (!mpa_crc_required && mpa_crc_strict) goto reject_conn; /* Enable CRC if requested by module parameter */ if (mpa_crc_required) req->params.bits |= MPA_RR_FLAG_CRC; } if (cep->enhanced_rdma_conn_est) { struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata; /* * Peer requested ORD becomes requested local IRD, * peer requested IRD becomes requested local ORD. * IRD and ORD get limited by global maximum values. */ cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK; cep->ord = min(cep->ord, SIW_MAX_ORD_QP); cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK; cep->ird = min(cep->ird, SIW_MAX_IRD_QP); /* May get overwritten by locally negotiated values */ cep->mpa.v2_ctrl.ird = htons(cep->ird); cep->mpa.v2_ctrl.ord = htons(cep->ord); /* * Support for peer sent zero length Write or Read to * let local side enter RTS. Writes are preferred. * Sends would require pre-posting a Receive and are * not supported. * Propose zero length Write if none of Read and Write * is indicated. */ if (v2->ird & MPA_V2_PEER_TO_PEER) { cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; if (v2->ord & MPA_V2_RDMA_WRITE_RTR) cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; else if (v2->ord & MPA_V2_RDMA_READ_RTR) cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR; else cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; } } cep->state = SIW_EPSTATE_RECVD_MPAREQ; /* Keep reference until IWCM accepts/rejects */ siw_cep_get(cep); rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); if (rv) siw_cep_put(cep); return rv; reject_conn: siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n", req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, mpa_crc_required, mpa_crc_strict, req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); req->params.bits &= ~MPA_RR_FLAG_MARKERS; req->params.bits |= MPA_RR_FLAG_REJECT; if (!mpa_crc_required && mpa_crc_strict) req->params.bits &= ~MPA_RR_FLAG_CRC; if (pd_len) kfree(cep->mpa.pdata); cep->mpa.pdata = NULL; siw_send_mpareqrep(cep, NULL, 0); return -EOPNOTSUPP; } static int siw_proc_mpareply(struct siw_cep *cep) { struct siw_qp_attrs qp_attrs; enum siw_qp_attr_mask qp_attr_mask; struct siw_qp *qp = cep->qp; struct mpa_rr *rep; int rv; u16 rep_ord; u16 rep_ird; bool ird_insufficient = false; enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR; rv = siw_recv_mpa_rr(cep); if (rv) goto out_err; siw_cancel_mpatimer(cep); rep = &cep->mpa.hdr; if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) { /* allow for 0, 1, and 2 only */ rv = -EPROTO; goto out_err; } if (memcmp(rep->key, MPA_KEY_REP, 16)) { siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_INVALID_REQ_RESP, 0); siw_send_terminate(qp); rv = -EPROTO; goto out_err; } if (rep->params.bits & MPA_RR_FLAG_REJECT) { siw_dbg_cep(cep, "got mpa reject\n"); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); return -ECONNRESET; } if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) { siw_dbg_cep(cep, "peer allows GSO on TX\n"); qp->tx_ctx.gso_seg_limit = 0; } if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) || (mpa_crc_strict && !mpa_crc_required && (rep->params.bits & MPA_RR_FLAG_CRC))) { siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n", rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, mpa_crc_required, mpa_crc_strict, rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); return -EINVAL; } if (cep->enhanced_rdma_conn_est) { struct mpa_v2_data *v2; if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 || !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) { /* * Protocol failure: The responder MUST reply with * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED. */ siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n", __mpa_rr_revision(rep->params.bits), rep->params.bits & MPA_RR_FLAG_ENHANCED ? 1 : 0); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); return -EINVAL; } v2 = (struct mpa_v2_data *)cep->mpa.pdata; rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK; rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK; if (cep->ird < rep_ord && (relaxed_ird_negotiation == false || rep_ord > cep->sdev->attrs.max_ird)) { siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n", cep->ird, rep_ord, cep->sdev->attrs.max_ord); ird_insufficient = true; } if (cep->ord > rep_ird && relaxed_ird_negotiation == false) { siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord, rep_ird); ird_insufficient = true; } /* * Always report negotiated peer values to user, * even if IRD/ORD negotiation failed */ cep->ird = rep_ord; cep->ord = rep_ird; if (ird_insufficient) { /* * If the initiator IRD is insuffient for the * responder ORD, send a TERM. */ siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_INSUFFICIENT_IRD, 0); siw_send_terminate(qp); rv = -ENOMEM; goto out_err; } if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER) mpa_p2p_mode = cep->mpa.v2_ctrl_req.ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); /* * Check if we requested P2P mode, and if peer agrees */ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { if ((mpa_p2p_mode & v2->ord) == 0) { /* * We requested RTR mode(s), but the peer * did not pick any mode we support. */ siw_dbg_cep(cep, "rtr mode: req %2x, got %2x\n", mpa_p2p_mode, v2->ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)); siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_NO_MATCHING_RTR, 0); siw_send_terminate(qp); rv = -EPROTO; goto out_err; } mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); } } memset(&qp_attrs, 0, sizeof(qp_attrs)); if (rep->params.bits & MPA_RR_FLAG_CRC) qp_attrs.flags = SIW_MPA_CRC; qp_attrs.irq_size = cep->ird; qp_attrs.orq_size = cep->ord; qp_attrs.sk = cep->sock; qp_attrs.state = SIW_QP_STATE_RTS; qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA; /* Move socket RX/TX under QP control */ down_write(&qp->state_lock); if (qp->attrs.state > SIW_QP_STATE_RTR) { rv = -EINVAL; up_write(&qp->state_lock); goto out_err; } rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask); siw_qp_socket_assoc(cep, qp); up_write(&qp->state_lock); /* Send extra RDMA frame to trigger peer RTS if negotiated */ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { rv = siw_qp_mpa_rts(qp, mpa_p2p_mode); if (rv) goto out_err; } if (!rv) { rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0); if (!rv) cep->state = SIW_EPSTATE_RDMA_MODE; return 0; } out_err: if (rv != -EAGAIN) siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); return rv; } /* * siw_accept_newconn - accept an incoming pending connection * */ static void siw_accept_newconn(struct siw_cep *cep) { struct socket *s = cep->sock; struct socket *new_s = NULL; struct siw_cep *new_cep = NULL; int rv = 0; /* debug only. should disappear */ if (cep->state != SIW_EPSTATE_LISTENING) goto error; new_cep = siw_cep_alloc(cep->sdev); if (!new_cep) goto error; /* * 4: Allocate a sufficient number of work elements * to allow concurrent handling of local + peer close * events, MPA header processing + MPA timeout. */ if (siw_cm_alloc_work(new_cep, 4) != 0) goto error; /* * Copy saved socket callbacks from listening CEP * and assign new socket with new CEP */ new_cep->sk_state_change = cep->sk_state_change; new_cep->sk_data_ready = cep->sk_data_ready; new_cep->sk_write_space = cep->sk_write_space; new_cep->sk_error_report = cep->sk_error_report; rv = kernel_accept(s, &new_s, O_NONBLOCK); if (rv != 0) { /* * Connection already aborted by peer..? */ siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv); goto error; } new_cep->sock = new_s; siw_cep_get(new_cep); new_s->sk->sk_user_data = new_cep; if (siw_tcp_nagle == false) tcp_sock_set_nodelay(new_s->sk); new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); if (rv) goto error; /* * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep. */ new_cep->listen_cep = cep; siw_cep_get(cep); if (atomic_read(&new_s->sk->sk_rmem_alloc)) { /* * MPA REQ already queued */ siw_dbg_cep(cep, "immediate mpa request\n"); siw_cep_set_inuse(new_cep); rv = siw_proc_mpareq(new_cep); if (rv != -EAGAIN) { siw_cep_put(cep); new_cep->listen_cep = NULL; if (rv) { siw_cancel_mpatimer(new_cep); siw_cep_set_free(new_cep); goto error; } } siw_cep_set_free(new_cep); } return; error: if (new_cep) siw_cep_put(new_cep); if (new_s) { siw_socket_disassoc(new_s); sock_release(new_s); new_cep->sock = NULL; } siw_dbg_cep(cep, "error %d\n", rv); } static void siw_cm_work_handler(struct work_struct *w) { struct siw_cm_work *work; struct siw_cep *cep; int release_cep = 0, rv = 0; work = container_of(w, struct siw_cm_work, work.work); cep = work->cep; siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n", cep->qp ? qp_id(cep->qp) : UINT_MAX, work->type, cep->state); siw_cep_set_inuse(cep); switch (work->type) { case SIW_CM_WORK_ACCEPT: siw_accept_newconn(cep); break; case SIW_CM_WORK_READ_MPAHDR: if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { if (cep->listen_cep) { siw_cep_set_inuse(cep->listen_cep); if (cep->listen_cep->state == SIW_EPSTATE_LISTENING) rv = siw_proc_mpareq(cep); else rv = -EFAULT; siw_cep_set_free(cep->listen_cep); if (rv != -EAGAIN) { siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; if (rv) siw_cep_put(cep); } } } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { rv = siw_proc_mpareply(cep); } else { /* * CEP already moved out of MPA handshake. * any connection management already done. * silently ignore the mpa packet. */ if (cep->state == SIW_EPSTATE_RDMA_MODE) { cep->sock->sk->sk_data_ready(cep->sock->sk); siw_dbg_cep(cep, "already in RDMA mode"); } else { siw_dbg_cep(cep, "out of state: %d\n", cep->state); } } if (rv && rv != -EAGAIN) release_cep = 1; break; case SIW_CM_WORK_CLOSE_LLP: /* * QP scheduled LLP close */ if (cep->qp) siw_send_terminate(cep->qp); if (cep->cm_id) siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); release_cep = 1; break; case SIW_CM_WORK_PEER_CLOSE: if (cep->cm_id) { if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { /* * MPA reply not received, but connection drop */ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); } else if (cep->state == SIW_EPSTATE_RDMA_MODE) { /* * NOTE: IW_CM_EVENT_DISCONNECT is given just * to transition IWCM into CLOSING. */ siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); } /* * for other states there is no connection * known to the IWCM. */ } else { if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) { /* * Wait for the ulp/CM to call accept/reject */ siw_dbg_cep(cep, "mpa req recvd, wait for ULP\n"); } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { /* * Socket close before MPA request received. */ if (cep->listen_cep) { siw_dbg_cep(cep, "no mpareq: drop listener\n"); siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; } } } release_cep = 1; break; case SIW_CM_WORK_MPATIMEOUT: cep->mpa_timer = NULL; if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { /* * MPA request timed out: * Hide any partially received private data and signal * timeout */ cep->mpa.hdr.params.pd_len = 0; if (cep->cm_id) siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ETIMEDOUT); release_cep = 1; } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { /* * No MPA request received after peer TCP stream setup. */ if (cep->listen_cep) { siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; } release_cep = 1; } break; default: WARN(1, "Undefined CM work type: %d\n", work->type); } if (release_cep) { siw_dbg_cep(cep, "release: timer=%s, QP[%u]\n", cep->mpa_timer ? "y" : "n", cep->qp ? qp_id(cep->qp) : UINT_MAX); siw_cancel_mpatimer(cep); cep->state = SIW_EPSTATE_CLOSED; if (cep->qp) { struct siw_qp *qp = cep->qp; /* * Serialize a potential race with application * closing the QP and calling siw_qp_cm_drop() */ siw_qp_get(qp); siw_cep_set_free(cep); siw_qp_llp_close(qp); siw_qp_put(qp); siw_cep_set_inuse(cep); cep->qp = NULL; siw_qp_put(qp); } if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } if (cep->cm_id) { siw_free_cm_id(cep); siw_cep_put(cep); } } siw_cep_set_free(cep); siw_put_work(work); siw_cep_put(cep); } static struct workqueue_struct *siw_cm_wq; int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type) { struct siw_cm_work *work = siw_get_work(cep); unsigned long delay = 0; if (!work) { siw_dbg_cep(cep, "failed with no work available\n"); return -ENOMEM; } work->type = type; work->cep = cep; siw_cep_get(cep); INIT_DELAYED_WORK(&work->work, siw_cm_work_handler); if (type == SIW_CM_WORK_MPATIMEOUT) { cep->mpa_timer = work; if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) delay = MPAREQ_TIMEOUT; else delay = MPAREP_TIMEOUT; } siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n", cep->qp ? qp_id(cep->qp) : -1, type, delay); queue_delayed_work(siw_cm_wq, &work->work, delay); return 0; } static void siw_cm_llp_data_ready(struct sock *sk) { struct siw_cep *cep; trace_sk_data_ready(sk); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) goto out; siw_dbg_cep(cep, "cep state: %d, socket state %d\n", cep->state, sk->sk_state); if (sk->sk_state != TCP_ESTABLISHED) goto out; switch (cep->state) { case SIW_EPSTATE_RDMA_MODE: case SIW_EPSTATE_LISTENING: break; case SIW_EPSTATE_AWAIT_MPAREQ: case SIW_EPSTATE_AWAIT_MPAREP: siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR); break; default: siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state); break; } out: read_unlock(&sk->sk_callback_lock); } static void siw_cm_llp_write_space(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); if (cep) siw_dbg_cep(cep, "state: %d\n", cep->state); } static void siw_cm_llp_error_report(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); if (cep) { siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n", sk->sk_err, sk->sk_state, cep->state); cep->sk_error_report(sk); } } static void siw_cm_llp_state_change(struct sock *sk) { struct siw_cep *cep; void (*orig_state_change)(struct sock *s); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) { /* endpoint already disassociated */ read_unlock(&sk->sk_callback_lock); return; } orig_state_change = cep->sk_state_change; siw_dbg_cep(cep, "state: %d\n", cep->state); switch (sk->sk_state) { case TCP_ESTABLISHED: /* * handle accepting socket as special case where only * new connection is possible */ siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT); break; case TCP_CLOSE: case TCP_CLOSE_WAIT: if (cep->qp) cep->qp->tx_ctx.tx_suspend = 1; siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE); break; default: siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state); } read_unlock(&sk->sk_callback_lock); orig_state_change(sk); } static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, struct sockaddr *raddr, bool afonly) { int rv, flags = 0; size_t size = laddr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); /* * Make address available again asap. */ sock_set_reuseaddr(s->sk); if (afonly) { rv = ip6_sock_set_v6only(s->sk); if (rv) return rv; } rv = s->ops->bind(s, laddr, size); if (rv < 0) return rv; rv = s->ops->connect(s, raddr, size, flags); return rv < 0 ? rv : 0; } int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) { struct siw_device *sdev = to_siw_dev(id->device); struct siw_qp *qp; struct siw_cep *cep = NULL; struct socket *s = NULL; struct sockaddr *laddr = (struct sockaddr *)&id->local_addr, *raddr = (struct sockaddr *)&id->remote_addr; bool p2p_mode = peer_to_peer, v4 = true; u16 pd_len = params->private_data_len; int version = mpa_version, rv; if (pd_len > MPA_MAX_PRIVDATA) return -EINVAL; if (params->ird > sdev->attrs.max_ird || params->ord > sdev->attrs.max_ord) return -ENOMEM; if (laddr->sa_family == AF_INET6) v4 = false; else if (laddr->sa_family != AF_INET) return -EAFNOSUPPORT; /* * Respect any iwarp port mapping: Use mapped remote address * if valid. Local address must not be mapped, since siw * uses kernel TCP stack. */ if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) || to_sockaddr_in6(id->remote_addr).sin6_port != 0) raddr = (struct sockaddr *)&id->m_remote_addr; qp = siw_qp_id2obj(sdev, params->qpn); if (!qp) { WARN(1, "[QP %u] does not exist\n", params->qpn); rv = -EINVAL; goto error; } siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr, raddr); rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) goto error; /* * NOTE: For simplification, connect() is called in blocking * mode. Might be reconsidered for async connection setup at * TCP level. */ rv = kernel_bindconnect(s, laddr, raddr, id->afonly); if (rv != 0) { siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); goto error; } if (siw_tcp_nagle == false) tcp_sock_set_nodelay(s->sk); cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; goto error; } siw_cep_set_inuse(cep); /* Associate QP with CEP */ siw_cep_get(cep); qp->cep = cep; /* siw_qp_get(qp) already done by QP lookup */ cep->qp = qp; id->add_ref(id); cep->cm_id = id; /* * 4: Allocate a sufficient number of work elements * to allow concurrent handling of local + peer close * events, MPA header processing + MPA timeout. */ rv = siw_cm_alloc_work(cep, 4); if (rv != 0) { rv = -ENOMEM; goto error; } cep->ird = params->ird; cep->ord = params->ord; if (p2p_mode && cep->ord == 0) cep->ord = 1; cep->state = SIW_EPSTATE_CONNECTING; /* * Associate CEP with socket */ siw_cep_socket_assoc(cep, s); cep->state = SIW_EPSTATE_AWAIT_MPAREP; /* * Set MPA Request bits: CRC if required, no MPA Markers, * MPA Rev. according to module parameter 'mpa_version', Key 'Request'. */ cep->mpa.hdr.params.bits = 0; if (version > MPA_REVISION_2) { pr_warn("Setting MPA version to %u\n", MPA_REVISION_2); version = MPA_REVISION_2; /* Adjust also module parameter */ mpa_version = MPA_REVISION_2; } __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version); if (try_gso) cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP; if (mpa_crc_required) cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC; /* * If MPA version == 2: * o Include ORD and IRD. * o Indicate peer-to-peer mode, if required by module * parameter 'peer_to_peer'. */ if (version == MPA_REVISION_2) { cep->enhanced_rdma_conn_est = true; cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED; cep->mpa.v2_ctrl.ird = htons(cep->ird); cep->mpa.v2_ctrl.ord = htons(cep->ord); if (p2p_mode) { cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; cep->mpa.v2_ctrl.ord |= rtr_type; } /* Remember own P2P mode requested */ cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird; cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord; } memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16); rv = siw_send_mpareqrep(cep, params->private_data, pd_len); /* * Reset private data. */ cep->mpa.hdr.params.pd_len = 0; if (rv >= 0) { rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT); if (!rv) { siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp)); siw_cep_set_free(cep); return 0; } } error: siw_dbg(id->device, "failed: %d\n", rv); if (cep) { siw_socket_disassoc(s); sock_release(s); cep->sock = NULL; cep->qp = NULL; cep->cm_id = NULL; id->rem_ref(id); qp->cep = NULL; siw_cep_put(cep); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } else if (s) { sock_release(s); } if (qp) siw_qp_put(qp); return rv; } /* * siw_accept - Let SoftiWARP accept an RDMA connection request * * @id: New connection management id to be used for accepted * connection request * @params: Connection parameters provided by ULP for accepting connection * * Transition QP to RTS state, associate new CM id @id with accepted CEP * and get prepared for TCP input by installing socket callbacks. * Then send MPA Reply and generate the "connection established" event. * Socket callbacks must be installed before sending MPA Reply, because * the latter may cause a first RDMA message to arrive from the RDMA Initiator * side very quickly, at which time the socket callbacks must be ready. */ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) { struct siw_device *sdev = to_siw_dev(id->device); struct siw_cep *cep = (struct siw_cep *)id->provider_data; struct siw_qp *qp; struct siw_qp_attrs qp_attrs; int rv = -EINVAL, max_priv_data = MPA_MAX_PRIVDATA; bool wait_for_peer_rts = false; siw_cep_set_inuse(cep); siw_cep_put(cep); /* Free lingering inbound private data */ if (cep->mpa.hdr.params.pd_len) { cep->mpa.hdr.params.pd_len = 0; kfree(cep->mpa.pdata); cep->mpa.pdata = NULL; } siw_cancel_mpatimer(cep); if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { siw_dbg_cep(cep, "out of state\n"); rv = -ECONNRESET; goto free_cep; } qp = siw_qp_id2obj(sdev, params->qpn); if (!qp) { WARN(1, "[QP %d] does not exist\n", params->qpn); goto free_cep; } down_write(&qp->state_lock); if (qp->attrs.state > SIW_QP_STATE_RTR) goto error_unlock; siw_dbg_cep(cep, "[QP %d]\n", params->qpn); if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) { siw_dbg_cep(cep, "peer allows GSO on TX\n"); qp->tx_ctx.gso_seg_limit = 0; } if (params->ord > sdev->attrs.max_ord || params->ird > sdev->attrs.max_ird) { siw_dbg_cep( cep, "[QP %u]: ord %d (max %d), ird %d (max %d)\n", qp_id(qp), params->ord, sdev->attrs.max_ord, params->ird, sdev->attrs.max_ird); goto error_unlock; } if (cep->enhanced_rdma_conn_est) max_priv_data -= sizeof(struct mpa_v2_data); if (params->private_data_len > max_priv_data) { siw_dbg_cep( cep, "[QP %u]: private data length: %d (max %d)\n", qp_id(qp), params->private_data_len, max_priv_data); goto error_unlock; } if (cep->enhanced_rdma_conn_est) { if (params->ord > cep->ord) { if (relaxed_ird_negotiation) { params->ord = cep->ord; } else { cep->ird = params->ird; cep->ord = params->ord; goto error_unlock; } } if (params->ird < cep->ird) { if (relaxed_ird_negotiation && cep->ird <= sdev->attrs.max_ird) params->ird = cep->ird; else { rv = -ENOMEM; goto error_unlock; } } if (cep->mpa.v2_ctrl.ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)) wait_for_peer_rts = true; /* * Signal back negotiated IRD and ORD values */ cep->mpa.v2_ctrl.ord = htons(params->ord & MPA_IRD_ORD_MASK) | (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD); cep->mpa.v2_ctrl.ird = htons(params->ird & MPA_IRD_ORD_MASK) | (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD); } cep->ird = params->ird; cep->ord = params->ord; cep->cm_id = id; id->add_ref(id); memset(&qp_attrs, 0, sizeof(qp_attrs)); qp_attrs.orq_size = cep->ord; qp_attrs.irq_size = cep->ird; qp_attrs.sk = cep->sock; if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC) qp_attrs.flags = SIW_MPA_CRC; qp_attrs.state = SIW_QP_STATE_RTS; siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp)); /* Associate QP with CEP */ siw_cep_get(cep); qp->cep = cep; /* siw_qp_get(qp) already done by QP lookup */ cep->qp = qp; cep->state = SIW_EPSTATE_RDMA_MODE; /* Move socket RX/TX under QP control */ rv = siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA); up_write(&qp->state_lock); if (rv) goto error; siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n", qp_id(qp), params->private_data_len); rv = siw_send_mpareqrep(cep, params->private_data, params->private_data_len); if (rv != 0) goto error; if (wait_for_peer_rts) { siw_sk_assign_rtr_upcalls(cep); } else { siw_qp_socket_assoc(cep, qp); rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); if (rv) goto error; } siw_cep_set_free(cep); return 0; error_unlock: up_write(&qp->state_lock); error: siw_destroy_cep_sock(cep); cep->state = SIW_EPSTATE_CLOSED; siw_free_cm_id(cep); if (qp->cep) { siw_cep_put(cep); qp->cep = NULL; } cep->qp = NULL; siw_qp_put(qp); free_cep: siw_cep_set_free_and_put(cep); return rv; } /* * siw_reject() * * Local connection reject case. Send private data back to peer, * close connection and dereference connection id. */ int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len) { struct siw_cep *cep = (struct siw_cep *)id->provider_data; siw_cep_set_inuse(cep); siw_cep_put(cep); siw_cancel_mpatimer(cep); if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { siw_dbg_cep(cep, "out of state\n"); siw_cep_set_free_and_put(cep); /* put last reference */ return -ECONNRESET; } siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state, pd_len); if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) { cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ siw_send_mpareqrep(cep, pdata, pd_len); } siw_destroy_cep_sock(cep); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); return 0; } /* * siw_create_listen - Create resources for a listener's IWCM ID @id * * Starts listen on the socket address id->local_addr. * */ int siw_create_listen(struct iw_cm_id *id, int backlog) { struct socket *s; struct siw_cep *cep = NULL; struct net_device *ndev = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; int rv = 0; if (addr_family != AF_INET && addr_family != AF_INET6) return -EAFNOSUPPORT; rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) return rv; /* * Allow binding local port when still in TIME_WAIT from last close. */ sock_set_reuseaddr(s->sk); if (addr_family == AF_INET) { struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); /* For wildcard addr, limit binding to current device only */ if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) { ndev = ib_device_get_netdev(id->device, SIW_PORT); if (ndev) { s->sk->sk_bound_dev_if = ndev->ifindex; } else { rv = -ENODEV; goto error; } } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in)); } else { struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr); if (id->afonly) { rv = ip6_sock_set_v6only(s->sk); if (rv) { siw_dbg(id->device, "ip6_sock_set_v6only erro: %d\n", rv); goto error; } } /* For wildcard addr, limit binding to current device only */ if (ipv6_addr_any(&laddr->sin6_addr)) { ndev = ib_device_get_netdev(id->device, SIW_PORT); if (ndev) { s->sk->sk_bound_dev_if = ndev->ifindex; } else { rv = -ENODEV; goto error; } } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in6)); } if (rv) { siw_dbg(id->device, "socket bind error: %d\n", rv); goto error; } cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; goto error; } siw_cep_socket_assoc(cep, s); rv = siw_cm_alloc_work(cep, backlog); if (rv) { siw_dbg(id->device, "alloc_work error %d, backlog %d\n", rv, backlog); goto error; } rv = s->ops->listen(s, backlog); if (rv) { siw_dbg(id->device, "listen error %d\n", rv); goto error; } cep->cm_id = id; id->add_ref(id); /* * In case of a wildcard rdma_listen on a multi-homed device, * a listener's IWCM id is associated with more than one listening CEP. * * We currently use id->provider_data in three different ways: * * o For a listener's IWCM id, id->provider_data points to * the list_head of the list of listening CEPs. * Uses: siw_create_listen(), siw_destroy_listen() * * o For each accepted passive-side IWCM id, id->provider_data * points to the CEP itself. This is a consequence of * - siw_cm_upcall() setting event.provider_data = cep and * - the IWCM's cm_conn_req_handler() setting provider_data of the * new passive-side IWCM id equal to event.provider_data * Uses: siw_accept(), siw_reject() * * o For an active-side IWCM id, id->provider_data is not used at all. * */ if (!id->provider_data) { id->provider_data = kmalloc(sizeof(struct list_head), GFP_KERNEL); if (!id->provider_data) { rv = -ENOMEM; goto error; } INIT_LIST_HEAD((struct list_head *)id->provider_data); } list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); cep->state = SIW_EPSTATE_LISTENING; dev_put(ndev); siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr); return 0; error: siw_dbg(id->device, "failed: %d\n", rv); if (cep) { siw_cep_set_inuse(cep); siw_free_cm_id(cep); cep->sock = NULL; siw_socket_disassoc(s); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } sock_release(s); dev_put(ndev); return rv; } static void siw_drop_listeners(struct iw_cm_id *id) { struct list_head *p, *tmp; /* * In case of a wildcard rdma_listen on a multi-homed device, * a listener's IWCM id is associated with more than one listening CEP. */ list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { struct siw_cep *cep = list_entry(p, struct siw_cep, listenq); list_del(p); siw_dbg_cep(cep, "drop cep, state %d\n", cep->state); siw_cep_set_inuse(cep); siw_free_cm_id(cep); if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } } int siw_destroy_listen(struct iw_cm_id *id) { if (!id->provider_data) { siw_dbg(id->device, "no cep(s)\n"); return 0; } siw_drop_listeners(id); kfree(id->provider_data); id->provider_data = NULL; return 0; } int siw_cm_init(void) { /* * create_single_workqueue for strict ordering */ siw_cm_wq = create_singlethread_workqueue("siw_cm_wq"); if (!siw_cm_wq) return -ENOMEM; return 0; } void siw_cm_exit(void) { if (siw_cm_wq) destroy_workqueue(siw_cm_wq); } |
17 82 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 | // SPDX-License-Identifier: GPL-2.0-only /* * This file provides /sys/class/ieee80211/<wiphy name>/ * and some default attributes. * * Copyright 2005-2006 Jiri Benc <jbenc@suse.cz> * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2020-2021, 2023-2024 Intel Corporation */ #include <linux/device.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/nl80211.h> #include <linux/rtnetlink.h> #include <net/cfg80211.h> #include "sysfs.h" #include "core.h" #include "rdev-ops.h" static inline struct cfg80211_registered_device *dev_to_rdev( struct device *dev) { return container_of(dev, struct cfg80211_registered_device, wiphy.dev); } #define SHOW_FMT(name, fmt, member) \ static ssize_t name ## _show(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \ } \ static DEVICE_ATTR_RO(name) SHOW_FMT(index, "%d", wiphy_idx); SHOW_FMT(macaddress, "%pM", wiphy.perm_addr); SHOW_FMT(address_mask, "%pM", wiphy.addr_mask); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; return sprintf(buf, "%s\n", wiphy_name(wiphy)); } static DEVICE_ATTR_RO(name); static ssize_t addresses_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; char *start = buf; int i; if (!wiphy->addresses) return sprintf(buf, "%pM\n", wiphy->perm_addr); for (i = 0; i < wiphy->n_addresses; i++) buf += sprintf(buf, "%pM\n", wiphy->addresses[i].addr); return buf - start; } static DEVICE_ATTR_RO(addresses); static struct attribute *ieee80211_attrs[] = { &dev_attr_index.attr, &dev_attr_macaddress.attr, &dev_attr_address_mask.attr, &dev_attr_addresses.attr, &dev_attr_name.attr, NULL, }; ATTRIBUTE_GROUPS(ieee80211); static void wiphy_dev_release(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); cfg80211_dev_free(rdev); } #ifdef CONFIG_PM_SLEEP static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_leave(rdev, wdev); } static int wiphy_suspend(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; rdev->suspend_at = ktime_get_boottime_seconds(); rtnl_lock(); wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered) { if (!rdev->wiphy.wowlan_config) { cfg80211_leave_all(rdev); cfg80211_process_rdev_events(rdev); } cfg80211_process_wiphy_works(rdev, NULL); if (rdev->ops->suspend) ret = rdev_suspend(rdev, rdev->wiphy.wowlan_config); if (ret == 1) { /* Driver refuse to configure wowlan */ cfg80211_leave_all(rdev); cfg80211_process_rdev_events(rdev); cfg80211_process_wiphy_works(rdev, NULL); ret = rdev_suspend(rdev, NULL); } if (ret == 0) rdev->suspended = true; } wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return ret; } static int wiphy_resume(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; /* Age scan results with time spent in suspend */ cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at); rtnl_lock(); wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); rdev->suspended = false; queue_work(system_unbound_wq, &rdev->wiphy_work); wiphy_unlock(&rdev->wiphy); if (ret) cfg80211_shutdown_all_interfaces(&rdev->wiphy); rtnl_unlock(); return ret; } static SIMPLE_DEV_PM_OPS(wiphy_pm_ops, wiphy_suspend, wiphy_resume); #define WIPHY_PM_OPS (&wiphy_pm_ops) #else #define WIPHY_PM_OPS NULL #endif static const void *wiphy_namespace(const struct device *d) { struct wiphy *wiphy = container_of(d, struct wiphy, dev); return wiphy_net(wiphy); } struct class ieee80211_class = { .name = "ieee80211", .dev_release = wiphy_dev_release, .dev_groups = ieee80211_groups, .pm = WIPHY_PM_OPS, .ns_type = &net_ns_type_operations, .namespace = wiphy_namespace, }; int wiphy_sysfs_init(void) { return class_register(&ieee80211_class); } void wiphy_sysfs_exit(void) { class_unregister(&ieee80211_class); } |
80 80 79 78 26 26 64 64 64 64 26 26 26 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 | /* * net/tipc/subscr.c: TIPC network topology service * * Copyright (c) 2000-2017, Ericsson AB * Copyright (c) 2005-2007, 2010-2013, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "name_table.h" #include "subscr.h" static void tipc_sub_send_event(struct tipc_subscription *sub, struct publication *p, u32 event) { struct tipc_subscr *s = &sub->evt.s; struct tipc_event *evt = &sub->evt; if (sub->inactive) return; tipc_evt_write(evt, event, event); if (p) { tipc_evt_write(evt, found_lower, p->sr.lower); tipc_evt_write(evt, found_upper, p->sr.upper); tipc_evt_write(evt, port.ref, p->sk.ref); tipc_evt_write(evt, port.node, p->sk.node); } else { tipc_evt_write(evt, found_lower, s->seq.lower); tipc_evt_write(evt, found_upper, s->seq.upper); tipc_evt_write(evt, port.ref, 0); tipc_evt_write(evt, port.node, 0); } tipc_topsrv_queue_evt(sub->net, sub->conid, event, evt); } /** * tipc_sub_check_overlap - test for subscription overlap with the given values * @subscribed: the service range subscribed for * @found: the service range we are checking for match * * Returns true if there is overlap, otherwise false. */ static bool tipc_sub_check_overlap(struct tipc_service_range *subscribed, struct tipc_service_range *found) { u32 found_lower = found->lower; u32 found_upper = found->upper; if (found_lower < subscribed->lower) found_lower = subscribed->lower; if (found_upper > subscribed->upper) found_upper = subscribed->upper; return found_lower <= found_upper; } void tipc_sub_report_overlap(struct tipc_subscription *sub, struct publication *p, u32 event, bool must) { struct tipc_service_range *sr = &sub->s.seq; u32 filter = sub->s.filter; if (!tipc_sub_check_overlap(sr, &p->sr)) return; if (!must && !(filter & TIPC_SUB_PORTS)) return; if (filter & TIPC_SUB_CLUSTER_SCOPE && p->scope == TIPC_NODE_SCOPE) return; if (filter & TIPC_SUB_NODE_SCOPE && p->scope != TIPC_NODE_SCOPE) return; spin_lock(&sub->lock); tipc_sub_send_event(sub, p, event); spin_unlock(&sub->lock); } static void tipc_sub_timeout(struct timer_list *t) { struct tipc_subscription *sub = from_timer(sub, t, timer); spin_lock(&sub->lock); tipc_sub_send_event(sub, NULL, TIPC_SUBSCR_TIMEOUT); sub->inactive = true; spin_unlock(&sub->lock); } static void tipc_sub_kref_release(struct kref *kref) { kfree(container_of(kref, struct tipc_subscription, kref)); } void tipc_sub_put(struct tipc_subscription *subscription) { kref_put(&subscription->kref, tipc_sub_kref_release); } void tipc_sub_get(struct tipc_subscription *subscription) { kref_get(&subscription->kref); } struct tipc_subscription *tipc_sub_subscribe(struct net *net, struct tipc_subscr *s, int conid) { u32 lower = tipc_sub_read(s, seq.lower); u32 upper = tipc_sub_read(s, seq.upper); u32 filter = tipc_sub_read(s, filter); struct tipc_subscription *sub; u32 timeout; if ((filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE) || lower > upper) { pr_warn("Subscription rejected, illegal request\n"); return NULL; } sub = kmalloc(sizeof(*sub), GFP_ATOMIC); if (!sub) { pr_warn("Subscription rejected, no memory\n"); return NULL; } INIT_LIST_HEAD(&sub->service_list); INIT_LIST_HEAD(&sub->sub_list); sub->net = net; sub->conid = conid; sub->inactive = false; memcpy(&sub->evt.s, s, sizeof(*s)); sub->s.seq.type = tipc_sub_read(s, seq.type); sub->s.seq.lower = lower; sub->s.seq.upper = upper; sub->s.filter = filter; sub->s.timeout = tipc_sub_read(s, timeout); memcpy(sub->s.usr_handle, s->usr_handle, 8); spin_lock_init(&sub->lock); kref_init(&sub->kref); if (!tipc_nametbl_subscribe(sub)) { kfree(sub); return NULL; } timer_setup(&sub->timer, tipc_sub_timeout, 0); timeout = tipc_sub_read(&sub->evt.s, timeout); if (timeout != TIPC_WAIT_FOREVER) mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); return sub; } void tipc_sub_unsubscribe(struct tipc_subscription *sub) { tipc_nametbl_unsubscribe(sub); if (sub->evt.s.timeout != TIPC_WAIT_FOREVER) del_timer_sync(&sub->timer); list_del(&sub->sub_list); tipc_sub_put(sub); } |
6 6 6 2 2 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * DES & Triple DES EDE Cipher Algorithms. * * Copyright (c) 2005 Dag Arne Osvik <da@osvik.no> */ #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/crypto.h> #include <linux/errno.h> #include <linux/fips.h> #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/types.h> #include <linux/unaligned.h> #include <crypto/des.h> #include <crypto/internal/des.h> #define ROL(x, r) ((x) = rol32((x), (r))) #define ROR(x, r) ((x) = ror32((x), (r))) /* Lookup tables for key expansion */ static const u8 pc1[256] = { 0x00, 0x00, 0x40, 0x04, 0x10, 0x10, 0x50, 0x14, 0x04, 0x40, 0x44, 0x44, 0x14, 0x50, 0x54, 0x54, 0x02, 0x02, 0x42, 0x06, 0x12, 0x12, 0x52, 0x16, 0x06, 0x42, 0x46, 0x46, 0x16, 0x52, 0x56, 0x56, 0x80, 0x08, 0xc0, 0x0c, 0x90, 0x18, 0xd0, 0x1c, 0x84, 0x48, 0xc4, 0x4c, 0x94, 0x58, 0xd4, 0x5c, 0x82, 0x0a, 0xc2, 0x0e, 0x92, 0x1a, 0xd2, 0x1e, 0x86, 0x4a, 0xc6, 0x4e, 0x96, 0x5a, 0xd6, 0x5e, 0x20, 0x20, 0x60, 0x24, 0x30, 0x30, 0x70, 0x34, 0x24, 0x60, 0x64, 0x64, 0x34, 0x70, 0x74, 0x74, 0x22, 0x22, 0x62, 0x26, 0x32, 0x32, 0x72, 0x36, 0x26, 0x62, 0x66, 0x66, 0x36, 0x72, 0x76, 0x76, 0xa0, 0x28, 0xe0, 0x2c, 0xb0, 0x38, 0xf0, 0x3c, 0xa4, 0x68, 0xe4, 0x6c, 0xb4, 0x78, 0xf4, 0x7c, 0xa2, 0x2a, 0xe2, 0x2e, 0xb2, 0x3a, 0xf2, 0x3e, 0xa6, 0x6a, 0xe6, 0x6e, 0xb6, 0x7a, 0xf6, 0x7e, 0x08, 0x80, 0x48, 0x84, 0x18, 0x90, 0x58, 0x94, 0x0c, 0xc0, 0x4c, 0xc4, 0x1c, 0xd0, 0x5c, 0xd4, 0x0a, 0x82, 0x4a, 0x86, 0x1a, 0x92, 0x5a, 0x96, 0x0e, 0xc2, 0x4e, 0xc6, 0x1e, 0xd2, 0x5e, 0xd6, 0x88, 0x88, 0xc8, 0x8c, 0x98, 0x98, 0xd8, 0x9c, 0x8c, 0xc8, 0xcc, 0xcc, 0x9c, 0xd8, 0xdc, 0xdc, 0x8a, 0x8a, 0xca, 0x8e, 0x9a, 0x9a, 0xda, 0x9e, 0x8e, 0xca, 0xce, 0xce, 0x9e, 0xda, 0xde, 0xde, 0x28, 0xa0, 0x68, 0xa4, 0x38, 0xb0, 0x78, 0xb4, 0x2c, 0xe0, 0x6c, 0xe4, 0x3c, 0xf0, 0x7c, 0xf4, 0x2a, 0xa2, 0x6a, 0xa6, 0x3a, 0xb2, 0x7a, 0xb6, 0x2e, 0xe2, 0x6e, 0xe6, 0x3e, 0xf2, 0x7e, 0xf6, 0xa8, 0xa8, 0xe8, 0xac, 0xb8, 0xb8, 0xf8, 0xbc, 0xac, 0xe8, 0xec, 0xec, 0xbc, 0xf8, 0xfc, 0xfc, 0xaa, 0xaa, 0xea, 0xae, 0xba, 0xba, 0xfa, 0xbe, 0xae, 0xea, 0xee, 0xee, 0xbe, 0xfa, 0xfe, 0xfe }; static const u8 rs[256] = { 0x00, 0x00, 0x80, 0x80, 0x02, 0x02, 0x82, 0x82, 0x04, 0x04, 0x84, 0x84, 0x06, 0x06, 0x86, 0x86, 0x08, 0x08, 0x88, 0x88, 0x0a, 0x0a, 0x8a, 0x8a, 0x0c, 0x0c, 0x8c, 0x8c, 0x0e, 0x0e, 0x8e, 0x8e, 0x10, 0x10, 0x90, 0x90, 0x12, 0x12, 0x92, 0x92, 0x14, 0x14, 0x94, 0x94, 0x16, 0x16, 0x96, 0x96, 0x18, 0x18, 0x98, 0x98, 0x1a, 0x1a, 0x9a, 0x9a, 0x1c, 0x1c, 0x9c, 0x9c, 0x1e, 0x1e, 0x9e, 0x9e, 0x20, 0x20, 0xa0, 0xa0, 0x22, 0x22, 0xa2, 0xa2, 0x24, 0x24, 0xa4, 0xa4, 0x26, 0x26, 0xa6, 0xa6, 0x28, 0x28, 0xa8, 0xa8, 0x2a, 0x2a, 0xaa, 0xaa, 0x2c, 0x2c, 0xac, 0xac, 0x2e, 0x2e, 0xae, 0xae, 0x30, 0x30, 0xb0, 0xb0, 0x32, 0x32, 0xb2, 0xb2, 0x34, 0x34, 0xb4, 0xb4, 0x36, 0x36, 0xb6, 0xb6, 0x38, 0x38, 0xb8, 0xb8, 0x3a, 0x3a, 0xba, 0xba, 0x3c, 0x3c, 0xbc, 0xbc, 0x3e, 0x3e, 0xbe, 0xbe, 0x40, 0x40, 0xc0, 0xc0, 0x42, 0x42, 0xc2, 0xc2, 0x44, 0x44, 0xc4, 0xc4, 0x46, 0x46, 0xc6, 0xc6, 0x48, 0x48, 0xc8, 0xc8, 0x4a, 0x4a, 0xca, 0xca, 0x4c, 0x4c, 0xcc, 0xcc, 0x4e, 0x4e, 0xce, 0xce, 0x50, 0x50, 0xd0, 0xd0, 0x52, 0x52, 0xd2, 0xd2, 0x54, 0x54, 0xd4, 0xd4, 0x56, 0x56, 0xd6, 0xd6, 0x58, 0x58, 0xd8, 0xd8, 0x5a, 0x5a, 0xda, 0xda, 0x5c, 0x5c, 0xdc, 0xdc, 0x5e, 0x5e, 0xde, 0xde, 0x60, 0x60, 0xe0, 0xe0, 0x62, 0x62, 0xe2, 0xe2, 0x64, 0x64, 0xe4, 0xe4, 0x66, 0x66, 0xe6, 0xe6, 0x68, 0x68, 0xe8, 0xe8, 0x6a, 0x6a, 0xea, 0xea, 0x6c, 0x6c, 0xec, 0xec, 0x6e, 0x6e, 0xee, 0xee, 0x70, 0x70, 0xf0, 0xf0, 0x72, 0x72, 0xf2, 0xf2, 0x74, 0x74, 0xf4, 0xf4, 0x76, 0x76, 0xf6, 0xf6, 0x78, 0x78, 0xf8, 0xf8, 0x7a, 0x7a, 0xfa, 0xfa, 0x7c, 0x7c, 0xfc, 0xfc, 0x7e, 0x7e, 0xfe, 0xfe }; static const u32 pc2[1024] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00040000, 0x00000000, 0x04000000, 0x00100000, 0x00400000, 0x00000008, 0x00000800, 0x40000000, 0x00440000, 0x00000008, 0x04000800, 0x40100000, 0x00000400, 0x00000020, 0x08000000, 0x00000100, 0x00040400, 0x00000020, 0x0c000000, 0x00100100, 0x00400400, 0x00000028, 0x08000800, 0x40000100, 0x00440400, 0x00000028, 0x0c000800, 0x40100100, 0x80000000, 0x00000010, 0x00000000, 0x00800000, 0x80040000, 0x00000010, 0x04000000, 0x00900000, 0x80400000, 0x00000018, 0x00000800, 0x40800000, 0x80440000, 0x00000018, 0x04000800, 0x40900000, 0x80000400, 0x00000030, 0x08000000, 0x00800100, 0x80040400, 0x00000030, 0x0c000000, 0x00900100, 0x80400400, 0x00000038, 0x08000800, 0x40800100, 0x80440400, 0x00000038, 0x0c000800, 0x40900100, 0x10000000, 0x00000000, 0x00200000, 0x00001000, 0x10040000, 0x00000000, 0x04200000, 0x00101000, 0x10400000, 0x00000008, 0x00200800, 0x40001000, 0x10440000, 0x00000008, 0x04200800, 0x40101000, 0x10000400, 0x00000020, 0x08200000, 0x00001100, 0x10040400, 0x00000020, 0x0c200000, 0x00101100, 0x10400400, 0x00000028, 0x08200800, 0x40001100, 0x10440400, 0x00000028, 0x0c200800, 0x40101100, 0x90000000, 0x00000010, 0x00200000, 0x00801000, 0x90040000, 0x00000010, 0x04200000, 0x00901000, 0x90400000, 0x00000018, 0x00200800, 0x40801000, 0x90440000, 0x00000018, 0x04200800, 0x40901000, 0x90000400, 0x00000030, 0x08200000, 0x00801100, 0x90040400, 0x00000030, 0x0c200000, 0x00901100, 0x90400400, 0x00000038, 0x08200800, 0x40801100, 0x90440400, 0x00000038, 0x0c200800, 0x40901100, 0x00000200, 0x00080000, 0x00000000, 0x00000004, 0x00040200, 0x00080000, 0x04000000, 0x00100004, 0x00400200, 0x00080008, 0x00000800, 0x40000004, 0x00440200, 0x00080008, 0x04000800, 0x40100004, 0x00000600, 0x00080020, 0x08000000, 0x00000104, 0x00040600, 0x00080020, 0x0c000000, 0x00100104, 0x00400600, 0x00080028, 0x08000800, 0x40000104, 0x00440600, 0x00080028, 0x0c000800, 0x40100104, 0x80000200, 0x00080010, 0x00000000, 0x00800004, 0x80040200, 0x00080010, 0x04000000, 0x00900004, 0x80400200, 0x00080018, 0x00000800, 0x40800004, 0x80440200, 0x00080018, 0x04000800, 0x40900004, 0x80000600, 0x00080030, 0x08000000, 0x00800104, 0x80040600, 0x00080030, 0x0c000000, 0x00900104, 0x80400600, 0x00080038, 0x08000800, 0x40800104, 0x80440600, 0x00080038, 0x0c000800, 0x40900104, 0x10000200, 0x00080000, 0x00200000, 0x00001004, 0x10040200, 0x00080000, 0x04200000, 0x00101004, 0x10400200, 0x00080008, 0x00200800, 0x40001004, 0x10440200, 0x00080008, 0x04200800, 0x40101004, 0x10000600, 0x00080020, 0x08200000, 0x00001104, 0x10040600, 0x00080020, 0x0c200000, 0x00101104, 0x10400600, 0x00080028, 0x08200800, 0x40001104, 0x10440600, 0x00080028, 0x0c200800, 0x40101104, 0x90000200, 0x00080010, 0x00200000, 0x00801004, 0x90040200, 0x00080010, 0x04200000, 0x00901004, 0x90400200, 0x00080018, 0x00200800, 0x40801004, 0x90440200, 0x00080018, 0x04200800, 0x40901004, 0x90000600, 0x00080030, 0x08200000, 0x00801104, 0x90040600, 0x00080030, 0x0c200000, 0x00901104, 0x90400600, 0x00080038, 0x08200800, 0x40801104, 0x90440600, 0x00080038, 0x0c200800, 0x40901104, 0x00000002, 0x00002000, 0x20000000, 0x00000001, 0x00040002, 0x00002000, 0x24000000, 0x00100001, 0x00400002, 0x00002008, 0x20000800, 0x40000001, 0x00440002, 0x00002008, 0x24000800, 0x40100001, 0x00000402, 0x00002020, 0x28000000, 0x00000101, 0x00040402, 0x00002020, 0x2c000000, 0x00100101, 0x00400402, 0x00002028, 0x28000800, 0x40000101, 0x00440402, 0x00002028, 0x2c000800, 0x40100101, 0x80000002, 0x00002010, 0x20000000, 0x00800001, 0x80040002, 0x00002010, 0x24000000, 0x00900001, 0x80400002, 0x00002018, 0x20000800, 0x40800001, 0x80440002, 0x00002018, 0x24000800, 0x40900001, 0x80000402, 0x00002030, 0x28000000, 0x00800101, 0x80040402, 0x00002030, 0x2c000000, 0x00900101, 0x80400402, 0x00002038, 0x28000800, 0x40800101, 0x80440402, 0x00002038, 0x2c000800, 0x40900101, 0x10000002, 0x00002000, 0x20200000, 0x00001001, 0x10040002, 0x00002000, 0x24200000, 0x00101001, 0x10400002, 0x00002008, 0x20200800, 0x40001001, 0x10440002, 0x00002008, 0x24200800, 0x40101001, 0x10000402, 0x00002020, 0x28200000, 0x00001101, 0x10040402, 0x00002020, 0x2c200000, 0x00101101, 0x10400402, 0x00002028, 0x28200800, 0x40001101, 0x10440402, 0x00002028, 0x2c200800, 0x40101101, 0x90000002, 0x00002010, 0x20200000, 0x00801001, 0x90040002, 0x00002010, 0x24200000, 0x00901001, 0x90400002, 0x00002018, 0x20200800, 0x40801001, 0x90440002, 0x00002018, 0x24200800, 0x40901001, 0x90000402, 0x00002030, 0x28200000, 0x00801101, 0x90040402, 0x00002030, 0x2c200000, 0x00901101, 0x90400402, 0x00002038, 0x28200800, 0x40801101, 0x90440402, 0x00002038, 0x2c200800, 0x40901101, 0x00000202, 0x00082000, 0x20000000, 0x00000005, 0x00040202, 0x00082000, 0x24000000, 0x00100005, 0x00400202, 0x00082008, 0x20000800, 0x40000005, 0x00440202, 0x00082008, 0x24000800, 0x40100005, 0x00000602, 0x00082020, 0x28000000, 0x00000105, 0x00040602, 0x00082020, 0x2c000000, 0x00100105, 0x00400602, 0x00082028, 0x28000800, 0x40000105, 0x00440602, 0x00082028, 0x2c000800, 0x40100105, 0x80000202, 0x00082010, 0x20000000, 0x00800005, 0x80040202, 0x00082010, 0x24000000, 0x00900005, 0x80400202, 0x00082018, 0x20000800, 0x40800005, 0x80440202, 0x00082018, 0x24000800, 0x40900005, 0x80000602, 0x00082030, 0x28000000, 0x00800105, 0x80040602, 0x00082030, 0x2c000000, 0x00900105, 0x80400602, 0x00082038, 0x28000800, 0x40800105, 0x80440602, 0x00082038, 0x2c000800, 0x40900105, 0x10000202, 0x00082000, 0x20200000, 0x00001005, 0x10040202, 0x00082000, 0x24200000, 0x00101005, 0x10400202, 0x00082008, 0x20200800, 0x40001005, 0x10440202, 0x00082008, 0x24200800, 0x40101005, 0x10000602, 0x00082020, 0x28200000, 0x00001105, 0x10040602, 0x00082020, 0x2c200000, 0x00101105, 0x10400602, 0x00082028, 0x28200800, 0x40001105, 0x10440602, 0x00082028, 0x2c200800, 0x40101105, 0x90000202, 0x00082010, 0x20200000, 0x00801005, 0x90040202, 0x00082010, 0x24200000, 0x00901005, 0x90400202, 0x00082018, 0x20200800, 0x40801005, 0x90440202, 0x00082018, 0x24200800, 0x40901005, 0x90000602, 0x00082030, 0x28200000, 0x00801105, 0x90040602, 0x00082030, 0x2c200000, 0x00901105, 0x90400602, 0x00082038, 0x28200800, 0x40801105, 0x90440602, 0x00082038, 0x2c200800, 0x40901105, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000008, 0x00080000, 0x10000000, 0x02000000, 0x00000000, 0x00000080, 0x00001000, 0x02000000, 0x00000008, 0x00080080, 0x10001000, 0x00004000, 0x00000000, 0x00000040, 0x00040000, 0x00004000, 0x00000008, 0x00080040, 0x10040000, 0x02004000, 0x00000000, 0x000000c0, 0x00041000, 0x02004000, 0x00000008, 0x000800c0, 0x10041000, 0x00020000, 0x00008000, 0x08000000, 0x00200000, 0x00020000, 0x00008008, 0x08080000, 0x10200000, 0x02020000, 0x00008000, 0x08000080, 0x00201000, 0x02020000, 0x00008008, 0x08080080, 0x10201000, 0x00024000, 0x00008000, 0x08000040, 0x00240000, 0x00024000, 0x00008008, 0x08080040, 0x10240000, 0x02024000, 0x00008000, 0x080000c0, 0x00241000, 0x02024000, 0x00008008, 0x080800c0, 0x10241000, 0x00000000, 0x01000000, 0x00002000, 0x00000020, 0x00000000, 0x01000008, 0x00082000, 0x10000020, 0x02000000, 0x01000000, 0x00002080, 0x00001020, 0x02000000, 0x01000008, 0x00082080, 0x10001020, 0x00004000, 0x01000000, 0x00002040, 0x00040020, 0x00004000, 0x01000008, 0x00082040, 0x10040020, 0x02004000, 0x01000000, 0x000020c0, 0x00041020, 0x02004000, 0x01000008, 0x000820c0, 0x10041020, 0x00020000, 0x01008000, 0x08002000, 0x00200020, 0x00020000, 0x01008008, 0x08082000, 0x10200020, 0x02020000, 0x01008000, 0x08002080, 0x00201020, 0x02020000, 0x01008008, 0x08082080, 0x10201020, 0x00024000, 0x01008000, 0x08002040, 0x00240020, 0x00024000, 0x01008008, 0x08082040, 0x10240020, 0x02024000, 0x01008000, 0x080020c0, 0x00241020, 0x02024000, 0x01008008, 0x080820c0, 0x10241020, 0x00000400, 0x04000000, 0x00100000, 0x00000004, 0x00000400, 0x04000008, 0x00180000, 0x10000004, 0x02000400, 0x04000000, 0x00100080, 0x00001004, 0x02000400, 0x04000008, 0x00180080, 0x10001004, 0x00004400, 0x04000000, 0x00100040, 0x00040004, 0x00004400, 0x04000008, 0x00180040, 0x10040004, 0x02004400, 0x04000000, 0x001000c0, 0x00041004, 0x02004400, 0x04000008, 0x001800c0, 0x10041004, 0x00020400, 0x04008000, 0x08100000, 0x00200004, 0x00020400, 0x04008008, 0x08180000, 0x10200004, 0x02020400, 0x04008000, 0x08100080, 0x00201004, 0x02020400, 0x04008008, 0x08180080, 0x10201004, 0x00024400, 0x04008000, 0x08100040, 0x00240004, 0x00024400, 0x04008008, 0x08180040, 0x10240004, 0x02024400, 0x04008000, 0x081000c0, 0x00241004, 0x02024400, 0x04008008, 0x081800c0, 0x10241004, 0x00000400, 0x05000000, 0x00102000, 0x00000024, 0x00000400, 0x05000008, 0x00182000, 0x10000024, 0x02000400, 0x05000000, 0x00102080, 0x00001024, 0x02000400, 0x05000008, 0x00182080, 0x10001024, 0x00004400, 0x05000000, 0x00102040, 0x00040024, 0x00004400, 0x05000008, 0x00182040, 0x10040024, 0x02004400, 0x05000000, 0x001020c0, 0x00041024, 0x02004400, 0x05000008, 0x001820c0, 0x10041024, 0x00020400, 0x05008000, 0x08102000, 0x00200024, 0x00020400, 0x05008008, 0x08182000, 0x10200024, 0x02020400, 0x05008000, 0x08102080, 0x00201024, 0x02020400, 0x05008008, 0x08182080, 0x10201024, 0x00024400, 0x05008000, 0x08102040, 0x00240024, 0x00024400, 0x05008008, 0x08182040, 0x10240024, 0x02024400, 0x05008000, 0x081020c0, 0x00241024, 0x02024400, 0x05008008, 0x081820c0, 0x10241024, 0x00000800, 0x00010000, 0x20000000, 0x00000010, 0x00000800, 0x00010008, 0x20080000, 0x10000010, 0x02000800, 0x00010000, 0x20000080, 0x00001010, 0x02000800, 0x00010008, 0x20080080, 0x10001010, 0x00004800, 0x00010000, 0x20000040, 0x00040010, 0x00004800, 0x00010008, 0x20080040, 0x10040010, 0x02004800, 0x00010000, 0x200000c0, 0x00041010, 0x02004800, 0x00010008, 0x200800c0, 0x10041010, 0x00020800, 0x00018000, 0x28000000, 0x00200010, 0x00020800, 0x00018008, 0x28080000, 0x10200010, 0x02020800, 0x00018000, 0x28000080, 0x00201010, 0x02020800, 0x00018008, 0x28080080, 0x10201010, 0x00024800, 0x00018000, 0x28000040, 0x00240010, 0x00024800, 0x00018008, 0x28080040, 0x10240010, 0x02024800, 0x00018000, 0x280000c0, 0x00241010, 0x02024800, 0x00018008, 0x280800c0, 0x10241010, 0x00000800, 0x01010000, 0x20002000, 0x00000030, 0x00000800, 0x01010008, 0x20082000, 0x10000030, 0x02000800, 0x01010000, 0x20002080, 0x00001030, 0x02000800, 0x01010008, 0x20082080, 0x10001030, 0x00004800, 0x01010000, 0x20002040, 0x00040030, 0x00004800, 0x01010008, 0x20082040, 0x10040030, 0x02004800, 0x01010000, 0x200020c0, 0x00041030, 0x02004800, 0x01010008, 0x200820c0, 0x10041030, 0x00020800, 0x01018000, 0x28002000, 0x00200030, 0x00020800, 0x01018008, 0x28082000, 0x10200030, 0x02020800, 0x01018000, 0x28002080, 0x00201030, 0x02020800, 0x01018008, 0x28082080, 0x10201030, 0x00024800, 0x01018000, 0x28002040, 0x00240030, 0x00024800, 0x01018008, 0x28082040, 0x10240030, 0x02024800, 0x01018000, 0x280020c0, 0x00241030, 0x02024800, 0x01018008, 0x280820c0, 0x10241030, 0x00000c00, 0x04010000, 0x20100000, 0x00000014, 0x00000c00, 0x04010008, 0x20180000, 0x10000014, 0x02000c00, 0x04010000, 0x20100080, 0x00001014, 0x02000c00, 0x04010008, 0x20180080, 0x10001014, 0x00004c00, 0x04010000, 0x20100040, 0x00040014, 0x00004c00, 0x04010008, 0x20180040, 0x10040014, 0x02004c00, 0x04010000, 0x201000c0, 0x00041014, 0x02004c00, 0x04010008, 0x201800c0, 0x10041014, 0x00020c00, 0x04018000, 0x28100000, 0x00200014, 0x00020c00, 0x04018008, 0x28180000, 0x10200014, 0x02020c00, 0x04018000, 0x28100080, 0x00201014, 0x02020c00, 0x04018008, 0x28180080, 0x10201014, 0x00024c00, 0x04018000, 0x28100040, 0x00240014, 0x00024c00, 0x04018008, 0x28180040, 0x10240014, 0x02024c00, 0x04018000, 0x281000c0, 0x00241014, 0x02024c00, 0x04018008, 0x281800c0, 0x10241014, 0x00000c00, 0x05010000, 0x20102000, 0x00000034, 0x00000c00, 0x05010008, 0x20182000, 0x10000034, 0x02000c00, 0x05010000, 0x20102080, 0x00001034, 0x02000c00, 0x05010008, 0x20182080, 0x10001034, 0x00004c00, 0x05010000, 0x20102040, 0x00040034, 0x00004c00, 0x05010008, 0x20182040, 0x10040034, 0x02004c00, 0x05010000, 0x201020c0, 0x00041034, 0x02004c00, 0x05010008, 0x201820c0, 0x10041034, 0x00020c00, 0x05018000, 0x28102000, 0x00200034, 0x00020c00, 0x05018008, 0x28182000, 0x10200034, 0x02020c00, 0x05018000, 0x28102080, 0x00201034, 0x02020c00, 0x05018008, 0x28182080, 0x10201034, 0x00024c00, 0x05018000, 0x28102040, 0x00240034, 0x00024c00, 0x05018008, 0x28182040, 0x10240034, 0x02024c00, 0x05018000, 0x281020c0, 0x00241034, 0x02024c00, 0x05018008, 0x281820c0, 0x10241034 }; /* S-box lookup tables */ static const u32 S1[64] = { 0x01010400, 0x00000000, 0x00010000, 0x01010404, 0x01010004, 0x00010404, 0x00000004, 0x00010000, 0x00000400, 0x01010400, 0x01010404, 0x00000400, 0x01000404, 0x01010004, 0x01000000, 0x00000004, 0x00000404, 0x01000400, 0x01000400, 0x00010400, 0x00010400, 0x01010000, 0x01010000, 0x01000404, 0x00010004, 0x01000004, 0x01000004, 0x00010004, 0x00000000, 0x00000404, 0x00010404, 0x01000000, 0x00010000, 0x01010404, 0x00000004, 0x01010000, 0x01010400, 0x01000000, 0x01000000, 0x00000400, 0x01010004, 0x00010000, 0x00010400, 0x01000004, 0x00000400, 0x00000004, 0x01000404, 0x00010404, 0x01010404, 0x00010004, 0x01010000, 0x01000404, 0x01000004, 0x00000404, 0x00010404, 0x01010400, 0x00000404, 0x01000400, 0x01000400, 0x00000000, 0x00010004, 0x00010400, 0x00000000, 0x01010004 }; static const u32 S2[64] = { 0x80108020, 0x80008000, 0x00008000, 0x00108020, 0x00100000, 0x00000020, 0x80100020, 0x80008020, 0x80000020, 0x80108020, 0x80108000, 0x80000000, 0x80008000, 0x00100000, 0x00000020, 0x80100020, 0x00108000, 0x00100020, 0x80008020, 0x00000000, 0x80000000, 0x00008000, 0x00108020, 0x80100000, 0x00100020, 0x80000020, 0x00000000, 0x00108000, 0x00008020, 0x80108000, 0x80100000, 0x00008020, 0x00000000, 0x00108020, 0x80100020, 0x00100000, 0x80008020, 0x80100000, 0x80108000, 0x00008000, 0x80100000, 0x80008000, 0x00000020, 0x80108020, 0x00108020, 0x00000020, 0x00008000, 0x80000000, 0x00008020, 0x80108000, 0x00100000, 0x80000020, 0x00100020, 0x80008020, 0x80000020, 0x00100020, 0x00108000, 0x00000000, 0x80008000, 0x00008020, 0x80000000, 0x80100020, 0x80108020, 0x00108000 }; static const u32 S3[64] = { 0x00000208, 0x08020200, 0x00000000, 0x08020008, 0x08000200, 0x00000000, 0x00020208, 0x08000200, 0x00020008, 0x08000008, 0x08000008, 0x00020000, 0x08020208, 0x00020008, 0x08020000, 0x00000208, 0x08000000, 0x00000008, 0x08020200, 0x00000200, 0x00020200, 0x08020000, 0x08020008, 0x00020208, 0x08000208, 0x00020200, 0x00020000, 0x08000208, 0x00000008, 0x08020208, 0x00000200, 0x08000000, 0x08020200, 0x08000000, 0x00020008, 0x00000208, 0x00020000, 0x08020200, 0x08000200, 0x00000000, 0x00000200, 0x00020008, 0x08020208, 0x08000200, 0x08000008, 0x00000200, 0x00000000, 0x08020008, 0x08000208, 0x00020000, 0x08000000, 0x08020208, 0x00000008, 0x00020208, 0x00020200, 0x08000008, 0x08020000, 0x08000208, 0x00000208, 0x08020000, 0x00020208, 0x00000008, 0x08020008, 0x00020200 }; static const u32 S4[64] = { 0x00802001, 0x00002081, 0x00002081, 0x00000080, 0x00802080, 0x00800081, 0x00800001, 0x00002001, 0x00000000, 0x00802000, 0x00802000, 0x00802081, 0x00000081, 0x00000000, 0x00800080, 0x00800001, 0x00000001, 0x00002000, 0x00800000, 0x00802001, 0x00000080, 0x00800000, 0x00002001, 0x00002080, 0x00800081, 0x00000001, 0x00002080, 0x00800080, 0x00002000, 0x00802080, 0x00802081, 0x00000081, 0x00800080, 0x00800001, 0x00802000, 0x00802081, 0x00000081, 0x00000000, 0x00000000, 0x00802000, 0x00002080, 0x00800080, 0x00800081, 0x00000001, 0x00802001, 0x00002081, 0x00002081, 0x00000080, 0x00802081, 0x00000081, 0x00000001, 0x00002000, 0x00800001, 0x00002001, 0x00802080, 0x00800081, 0x00002001, 0x00002080, 0x00800000, 0x00802001, 0x00000080, 0x00800000, 0x00002000, 0x00802080 }; static const u32 S5[64] = { 0x00000100, 0x02080100, 0x02080000, 0x42000100, 0x00080000, 0x00000100, 0x40000000, 0x02080000, 0x40080100, 0x00080000, 0x02000100, 0x40080100, 0x42000100, 0x42080000, 0x00080100, 0x40000000, 0x02000000, 0x40080000, 0x40080000, 0x00000000, 0x40000100, 0x42080100, 0x42080100, 0x02000100, 0x42080000, 0x40000100, 0x00000000, 0x42000000, 0x02080100, 0x02000000, 0x42000000, 0x00080100, 0x00080000, 0x42000100, 0x00000100, 0x02000000, 0x40000000, 0x02080000, 0x42000100, 0x40080100, 0x02000100, 0x40000000, 0x42080000, 0x02080100, 0x40080100, 0x00000100, 0x02000000, 0x42080000, 0x42080100, 0x00080100, 0x42000000, 0x42080100, 0x02080000, 0x00000000, 0x40080000, 0x42000000, 0x00080100, 0x02000100, 0x40000100, 0x00080000, 0x00000000, 0x40080000, 0x02080100, 0x40000100 }; static const u32 S6[64] = { 0x20000010, 0x20400000, 0x00004000, 0x20404010, 0x20400000, 0x00000010, 0x20404010, 0x00400000, 0x20004000, 0x00404010, 0x00400000, 0x20000010, 0x00400010, 0x20004000, 0x20000000, 0x00004010, 0x00000000, 0x00400010, 0x20004010, 0x00004000, 0x00404000, 0x20004010, 0x00000010, 0x20400010, 0x20400010, 0x00000000, 0x00404010, 0x20404000, 0x00004010, 0x00404000, 0x20404000, 0x20000000, 0x20004000, 0x00000010, 0x20400010, 0x00404000, 0x20404010, 0x00400000, 0x00004010, 0x20000010, 0x00400000, 0x20004000, 0x20000000, 0x00004010, 0x20000010, 0x20404010, 0x00404000, 0x20400000, 0x00404010, 0x20404000, 0x00000000, 0x20400010, 0x00000010, 0x00004000, 0x20400000, 0x00404010, 0x00004000, 0x00400010, 0x20004010, 0x00000000, 0x20404000, 0x20000000, 0x00400010, 0x20004010 }; static const u32 S7[64] = { 0x00200000, 0x04200002, 0x04000802, 0x00000000, 0x00000800, 0x04000802, 0x00200802, 0x04200800, 0x04200802, 0x00200000, 0x00000000, 0x04000002, 0x00000002, 0x04000000, 0x04200002, 0x00000802, 0x04000800, 0x00200802, 0x00200002, 0x04000800, 0x04000002, 0x04200000, 0x04200800, 0x00200002, 0x04200000, 0x00000800, 0x00000802, 0x04200802, 0x00200800, 0x00000002, 0x04000000, 0x00200800, 0x04000000, 0x00200800, 0x00200000, 0x04000802, 0x04000802, 0x04200002, 0x04200002, 0x00000002, 0x00200002, 0x04000000, 0x04000800, 0x00200000, 0x04200800, 0x00000802, 0x00200802, 0x04200800, 0x00000802, 0x04000002, 0x04200802, 0x04200000, 0x00200800, 0x00000000, 0x00000002, 0x04200802, 0x00000000, 0x00200802, 0x04200000, 0x00000800, 0x04000002, 0x04000800, 0x00000800, 0x00200002 }; static const u32 S8[64] = { 0x10001040, 0x00001000, 0x00040000, 0x10041040, 0x10000000, 0x10001040, 0x00000040, 0x10000000, 0x00040040, 0x10040000, 0x10041040, 0x00041000, 0x10041000, 0x00041040, 0x00001000, 0x00000040, 0x10040000, 0x10000040, 0x10001000, 0x00001040, 0x00041000, 0x00040040, 0x10040040, 0x10041000, 0x00001040, 0x00000000, 0x00000000, 0x10040040, 0x10000040, 0x10001000, 0x00041040, 0x00040000, 0x00041040, 0x00040000, 0x10041000, 0x00001000, 0x00000040, 0x10040040, 0x00001000, 0x00041040, 0x10001000, 0x00000040, 0x10000040, 0x10040000, 0x10040040, 0x10000000, 0x00040000, 0x10001040, 0x00000000, 0x10041040, 0x00040040, 0x10000040, 0x10040000, 0x10001000, 0x10001040, 0x00000000, 0x10041040, 0x00041000, 0x00041000, 0x00001040, 0x00001040, 0x00040040, 0x10000000, 0x10041000 }; /* Encryption components: IP, FP, and round function */ #define IP(L, R, T) \ ROL(R, 4); \ T = L; \ L ^= R; \ L &= 0xf0f0f0f0; \ R ^= L; \ L ^= T; \ ROL(R, 12); \ T = L; \ L ^= R; \ L &= 0xffff0000; \ R ^= L; \ L ^= T; \ ROR(R, 14); \ T = L; \ L ^= R; \ L &= 0xcccccccc; \ R ^= L; \ L ^= T; \ ROL(R, 6); \ T = L; \ L ^= R; \ L &= 0xff00ff00; \ R ^= L; \ L ^= T; \ ROR(R, 7); \ T = L; \ L ^= R; \ L &= 0xaaaaaaaa; \ R ^= L; \ L ^= T; \ ROL(L, 1); #define FP(L, R, T) \ ROR(L, 1); \ T = L; \ L ^= R; \ L &= 0xaaaaaaaa; \ R ^= L; \ L ^= T; \ ROL(R, 7); \ T = L; \ L ^= R; \ L &= 0xff00ff00; \ R ^= L; \ L ^= T; \ ROR(R, 6); \ T = L; \ L ^= R; \ L &= 0xcccccccc; \ R ^= L; \ L ^= T; \ ROL(R, 14); \ T = L; \ L ^= R; \ L &= 0xffff0000; \ R ^= L; \ L ^= T; \ ROR(R, 12); \ T = L; \ L ^= R; \ L &= 0xf0f0f0f0; \ R ^= L; \ L ^= T; \ ROR(R, 4); #define ROUND(L, R, A, B, K, d) \ B = K[0]; A = K[1]; K += d; \ B ^= R; A ^= R; \ B &= 0x3f3f3f3f; ROR(A, 4); \ L ^= S8[0xff & B]; A &= 0x3f3f3f3f; \ L ^= S6[0xff & (B >> 8)]; B >>= 16; \ L ^= S7[0xff & A]; \ L ^= S5[0xff & (A >> 8)]; A >>= 16; \ L ^= S4[0xff & B]; \ L ^= S2[0xff & (B >> 8)]; \ L ^= S3[0xff & A]; \ L ^= S1[0xff & (A >> 8)]; /* * PC2 lookup tables are organized as 2 consecutive sets of 4 interleaved * tables of 128 elements. One set is for C_i and the other for D_i, while * the 4 interleaved tables correspond to four 7-bit subsets of C_i or D_i. * * After PC1 each of the variables a,b,c,d contains a 7 bit subset of C_i * or D_i in bits 7-1 (bit 0 being the least significant). */ #define T1(x) pt[2 * (x) + 0] #define T2(x) pt[2 * (x) + 1] #define T3(x) pt[2 * (x) + 2] #define T4(x) pt[2 * (x) + 3] #define DES_PC2(a, b, c, d) (T4(d) | T3(c) | T2(b) | T1(a)) /* * Encryption key expansion * * RFC2451: Weak key checks SHOULD be performed. * * FIPS 74: * * Keys having duals are keys which produce all zeros, all ones, or * alternating zero-one patterns in the C and D registers after Permuted * Choice 1 has operated on the key. * */ static unsigned long des_ekey(u32 *pe, const u8 *k) { /* K&R: long is at least 32 bits */ unsigned long a, b, c, d, w; const u32 *pt = pc2; d = k[4]; d &= 0x0e; d <<= 4; d |= k[0] & 0x1e; d = pc1[d]; c = k[5]; c &= 0x0e; c <<= 4; c |= k[1] & 0x1e; c = pc1[c]; b = k[6]; b &= 0x0e; b <<= 4; b |= k[2] & 0x1e; b = pc1[b]; a = k[7]; a &= 0x0e; a <<= 4; a |= k[3] & 0x1e; a = pc1[a]; pe[15 * 2 + 0] = DES_PC2(a, b, c, d); d = rs[d]; pe[14 * 2 + 0] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[13 * 2 + 0] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[12 * 2 + 0] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[11 * 2 + 0] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[10 * 2 + 0] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 9 * 2 + 0] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 8 * 2 + 0] = DES_PC2(d, a, b, c); c = rs[c]; pe[ 7 * 2 + 0] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 6 * 2 + 0] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[ 5 * 2 + 0] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 4 * 2 + 0] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[ 3 * 2 + 0] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 2 * 2 + 0] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[ 1 * 2 + 0] = DES_PC2(c, d, a, b); b = rs[b]; pe[ 0 * 2 + 0] = DES_PC2(b, c, d, a); /* Check if first half is weak */ w = (a ^ c) | (b ^ d) | (rs[a] ^ c) | (b ^ rs[d]); /* Skip to next table set */ pt += 512; d = k[0]; d &= 0xe0; d >>= 4; d |= k[4] & 0xf0; d = pc1[d + 1]; c = k[1]; c &= 0xe0; c >>= 4; c |= k[5] & 0xf0; c = pc1[c + 1]; b = k[2]; b &= 0xe0; b >>= 4; b |= k[6] & 0xf0; b = pc1[b + 1]; a = k[3]; a &= 0xe0; a >>= 4; a |= k[7] & 0xf0; a = pc1[a + 1]; /* Check if second half is weak */ w |= (a ^ c) | (b ^ d) | (rs[a] ^ c) | (b ^ rs[d]); pe[15 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; pe[14 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[13 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[12 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[11 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[10 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 9 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 8 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; pe[ 7 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 6 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[ 5 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 4 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[ 3 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 2 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[ 1 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; pe[ 0 * 2 + 1] = DES_PC2(b, c, d, a); /* Fixup: 2413 5768 -> 1357 2468 */ for (d = 0; d < 16; ++d) { a = pe[2 * d]; b = pe[2 * d + 1]; c = a ^ b; c &= 0xffff0000; a ^= c; b ^= c; ROL(b, 18); pe[2 * d] = a; pe[2 * d + 1] = b; } /* Zero if weak key */ return w; } int des_expand_key(struct des_ctx *ctx, const u8 *key, unsigned int keylen) { if (keylen != DES_KEY_SIZE) return -EINVAL; return des_ekey(ctx->expkey, key) ? 0 : -ENOKEY; } EXPORT_SYMBOL_GPL(des_expand_key); /* * Decryption key expansion * * No weak key checking is performed, as this is only used by triple DES * */ static void dkey(u32 *pe, const u8 *k) { /* K&R: long is at least 32 bits */ unsigned long a, b, c, d; const u32 *pt = pc2; d = k[4]; d &= 0x0e; d <<= 4; d |= k[0] & 0x1e; d = pc1[d]; c = k[5]; c &= 0x0e; c <<= 4; c |= k[1] & 0x1e; c = pc1[c]; b = k[6]; b &= 0x0e; b <<= 4; b |= k[2] & 0x1e; b = pc1[b]; a = k[7]; a &= 0x0e; a <<= 4; a |= k[3] & 0x1e; a = pc1[a]; pe[ 0 * 2] = DES_PC2(a, b, c, d); d = rs[d]; pe[ 1 * 2] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 2 * 2] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 3 * 2] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 4 * 2] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 5 * 2] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 6 * 2] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 7 * 2] = DES_PC2(d, a, b, c); c = rs[c]; pe[ 8 * 2] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 9 * 2] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[10 * 2] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[11 * 2] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[12 * 2] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[13 * 2] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[14 * 2] = DES_PC2(c, d, a, b); b = rs[b]; pe[15 * 2] = DES_PC2(b, c, d, a); /* Skip to next table set */ pt += 512; d = k[0]; d &= 0xe0; d >>= 4; d |= k[4] & 0xf0; d = pc1[d + 1]; c = k[1]; c &= 0xe0; c >>= 4; c |= k[5] & 0xf0; c = pc1[c + 1]; b = k[2]; b &= 0xe0; b >>= 4; b |= k[6] & 0xf0; b = pc1[b + 1]; a = k[3]; a &= 0xe0; a >>= 4; a |= k[7] & 0xf0; a = pc1[a + 1]; pe[ 0 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; pe[ 1 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 2 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 3 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 4 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 5 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; b = rs[b]; pe[ 6 * 2 + 1] = DES_PC2(b, c, d, a); a = rs[a]; d = rs[d]; pe[ 7 * 2 + 1] = DES_PC2(d, a, b, c); c = rs[c]; pe[ 8 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[ 9 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[10 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[11 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[12 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; a = rs[a]; pe[13 * 2 + 1] = DES_PC2(a, b, c, d); d = rs[d]; c = rs[c]; pe[14 * 2 + 1] = DES_PC2(c, d, a, b); b = rs[b]; pe[15 * 2 + 1] = DES_PC2(b, c, d, a); /* Fixup: 2413 5768 -> 1357 2468 */ for (d = 0; d < 16; ++d) { a = pe[2 * d]; b = pe[2 * d + 1]; c = a ^ b; c &= 0xffff0000; a ^= c; b ^= c; ROL(b, 18); pe[2 * d] = a; pe[2 * d + 1] = b; } } void des_encrypt(const struct des_ctx *ctx, u8 *dst, const u8 *src) { const u32 *K = ctx->expkey; u32 L, R, A, B; int i; L = get_unaligned_le32(src); R = get_unaligned_le32(src + 4); IP(L, R, A); for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, 2); ROUND(R, L, A, B, K, 2); } FP(R, L, A); put_unaligned_le32(R, dst); put_unaligned_le32(L, dst + 4); } EXPORT_SYMBOL_GPL(des_encrypt); void des_decrypt(const struct des_ctx *ctx, u8 *dst, const u8 *src) { const u32 *K = ctx->expkey + DES_EXPKEY_WORDS - 2; u32 L, R, A, B; int i; L = get_unaligned_le32(src); R = get_unaligned_le32(src + 4); IP(L, R, A); for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, -2); ROUND(R, L, A, B, K, -2); } FP(R, L, A); put_unaligned_le32(R, dst); put_unaligned_le32(L, dst + 4); } EXPORT_SYMBOL_GPL(des_decrypt); int des3_ede_expand_key(struct des3_ede_ctx *ctx, const u8 *key, unsigned int keylen) { u32 *pe = ctx->expkey; int err; if (keylen != DES3_EDE_KEY_SIZE) return -EINVAL; err = des3_ede_verify_key(key, keylen, true); if (err && err != -ENOKEY) return err; des_ekey(pe, key); pe += DES_EXPKEY_WORDS; key += DES_KEY_SIZE; dkey(pe, key); pe += DES_EXPKEY_WORDS; key += DES_KEY_SIZE; des_ekey(pe, key); return err; } EXPORT_SYMBOL_GPL(des3_ede_expand_key); void des3_ede_encrypt(const struct des3_ede_ctx *dctx, u8 *dst, const u8 *src) { const u32 *K = dctx->expkey; u32 L, R, A, B; int i; L = get_unaligned_le32(src); R = get_unaligned_le32(src + 4); IP(L, R, A); for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, 2); ROUND(R, L, A, B, K, 2); } for (i = 0; i < 8; i++) { ROUND(R, L, A, B, K, 2); ROUND(L, R, A, B, K, 2); } for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, 2); ROUND(R, L, A, B, K, 2); } FP(R, L, A); put_unaligned_le32(R, dst); put_unaligned_le32(L, dst + 4); } EXPORT_SYMBOL_GPL(des3_ede_encrypt); void des3_ede_decrypt(const struct des3_ede_ctx *dctx, u8 *dst, const u8 *src) { const u32 *K = dctx->expkey + DES3_EDE_EXPKEY_WORDS - 2; u32 L, R, A, B; int i; L = get_unaligned_le32(src); R = get_unaligned_le32(src + 4); IP(L, R, A); for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, -2); ROUND(R, L, A, B, K, -2); } for (i = 0; i < 8; i++) { ROUND(R, L, A, B, K, -2); ROUND(L, R, A, B, K, -2); } for (i = 0; i < 8; i++) { ROUND(L, R, A, B, K, -2); ROUND(R, L, A, B, K, -2); } FP(R, L, A); put_unaligned_le32(R, dst); put_unaligned_le32(L, dst + 4); } EXPORT_SYMBOL_GPL(des3_ede_decrypt); MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms"); MODULE_LICENSE("GPL"); |
1352 529 732 880 306 689 539 539 7 6 7 7 2 2 6 4 2 2 502 2 1317 1317 1316 1318 1318 1318 1317 497 502 501 2 502 500 502 632 1618 1668 1616 1364 1317 106 145 190 191 190 96 123 8 122 26 26 26 2297 1555 1250 20 1588 815 1580 842 436 437 437 435 235 53 231 2603 594 2415 2092 1260 1443 1419 1411 270 1264 1412 545 24 530 5 5 5 5 5 5 5 5 5 5 2165 2167 2163 6 6 2 6 25 21 21 21 4 4 4 4 74 74 5 73 73 73 1420 25 8 1380 1384 6 978 977 127 1 977 71 978 1401 37 985 63 507 20 87 33 32 3 5 27 27 27 27 27 22 15 22 22 2157 2159 2157 2155 47 2158 13 354 286 181 586 585 586 357 260 458 130 9 129 686 686 687 686 106 362 178 178 563 563 287 375 563 25 276 98 62 65 65 94 94 94 94 12 12 20 20 20 20 20 20 20 20 20 20 20 20 20 21 20 2 2 2 2 2 2 2128 2130 2126 2 286 46 41 24 13 24 23 3 6 5 2 2038 16 16 75 76 71 72 1 1 2 2 780 780 23 1541 1513 1358 262 1741 251 2139 239 2079 2079 2082 2 2079 505 2038 2038 2041 2077 60 2041 2080 625 625 625 626 625 626 83 83 59 59 38 39 38 52 52 226 36 67 67 22 48 48 134 229 36 134 69 70 326 518 518 144 379 284 284 284 767 768 769 766 167 769 751 752 752 753 79 63 14 14 14 23 24 22 20 20 2 31 31 30 12776 12775 3 12822 12760 12760 12324 595 317 12565 38 2 73 32 12771 12722 102 12764 12 41 41 321 12608 12815 12723 12765 12710 12763 12719 12721 12765 12767 3 1 555 124 424 17 12767 12767 12762 12815 12779 33 10 24 12728 12327 5 555 556 735 764 19 747 1 12872 47 40 9 46 53 869 544 476 1177 2 361 363 363 2 362 2 907 911 106 105 106 867 868 3 870 443 653 907 32 12902 12914 155 155 155 155 155 41 41 41 15 4 11 4 3 510 507 3 2 1 3 3 3 8247 8239 42 28 15 38 42 4 4 4 4 26 26 26 22 26 26 1 1 26 26 22 24 25 2 12914 89 12884 53 6 58 12915 12915 12922 12914 12861 56 12913 12958 12919 9077 8248 1 1 8268 8252 11937 1282 12917 12928 12961 1177 1 15 12355 12359 1 1 12916 26 1 25 2 23 19 23 23 1142 1138 1140 1138 1145 1076 1074 1075 2 2 1077 1 1 1 1 1 1091 41 1076 1 1077 56 1073 39 45 45 39 45 45 45 45 45 45 30 31 6 45 45 45 6 45 45 336 329 45 45 45 30 30 1089 1092 1095 828 831 134 27 256 11 27 113 253 253 253 247 12 12 164 164 164 499 16 3 19 19 19 1539 1544 1554 1541 1547 1321 334 336 1540 148 1538 302 79 1536 1 4 1310 513 3 513 1536 1429 109 43 144 1533 1530 40 12 49 1538 40 108 6 36 1542 1530 1528 24 52 1497 40 1283 246 143 87 87 244 97 246 60 95 223 223 246 1311 1310 1310 245 246 244 246 35 35 10 21 12 1312 1310 1310 1302 245 245 245 246 246 246 246 246 246 246 1310 1321 1321 1321 96 97 97 3 502 502 501 501 502 84 84 84 84 229 228 1075 148 148 147 148 148 149 148 148 20 4 4 513 496 16 16 16 196 4 4 4 4 965 516 517 7 511 511 4 4 30 516 492 1 449 451 517 517 515 514 514 517 517 516 196 196 196 30 167 196 515 4 514 518 518 517 189 190 189 190 161 30 189 190 365 209 71 72 72 5 5 501 2041 2048 1953 480 481 136 134 5 502 39 1484 1482 459 113 82 459 459 50 459 136 459 124 43 90 15 113 67 82 30 366 365 366 306 306 329 329 327 506 361 1567 98 459 98 98 76 98 76 98 76 457 458 147 459 146 147 458 309 231 398 116 218 401 401 129 210 427 426 365 459 179 458 384 459 235 365 458 132 132 88 131 88 131 87 18 6 14 2222 2178 467 380 364 365 365 365 365 136 365 233 234 86 235 81 234 235 235 233 365 365 16 247 136 365 247 150 235 235 235 234 235 235 2 2 3 3 3 2 503 501 503 16 15 520 16 2 2 2 2 25 25 25 2 2 2 2 27 27 14 27 6 35 8 35 146 145 624 334 463 464 166 402 107 453 453 328 241 242 86 210 355 356 136 286 44 351 224 205 213 1395 1642 289 122 855 424 637 1135 1135 2271 1882 1109 980 981 707 457 549 178 469 754 18 295 808 1305 354 1228 1011 444 178 1056 423 484 28 18 8 60 40 10 10 37 4 364 197 11 28 1 48 175 201 331 332 6 26 2 2 24 106 67 188 9 1 91 124 58 5 5 2 4 7 2 5 2152 2158 2158 2161 2165 2163 2163 19 1 4 2 2 5 2 3 3 1 3 9 10 1 21 27 22 2162 14 29 15 15 15 15 2157 2158 2160 52 52 52 31 8 21 49 502 502 27 1 26 34 34 24 10 1 1 32 32 2 32 17 7 2 2 22 9 6 1 4 22 25 1 24 1 7 24 20 3 22 20 10 1 1 1 1 1 1 6 6 6 1 3 11 1 6 8 7 33 33 33 10 19 6 22 2 3 19 18 25 2 22 77 22 77 1323 1289 62 9 1320 501 108 12 217 10 10 1469 1347 223 1436 37 1439 38 1096 580 1467 1468 1468 1469 1470 1467 1385 95 1471 1467 777 790 1472 1472 112 871 852 843 22 1472 217 874 852 847 1 30 847 2 222 222 97 141 274 20 273 20 273 209 160 248 966 1670 1671 1652 43 43 1320 414 132 781 101 281 963 1365 1366 1366 90 197 1131 3 139 1195 1322 7 1298 20 132 1274 439 973 837 502 1237 90 1131 198 896 514 1312 7 6 1 3 4 136 135 501 60 60 41 60 42 9073 9086 457 8846 501 9084 501 8886 501 498 496 497 497 204 329 49 451 204 484 196 195 196 1558 329 332 916 679 333 2174 128 724 196 271 2073 670 1594 1595 764 1275 19 19 3 18 70 70 1648 1243 502 1652 248 886 862 969 5 5 969 50 968 966 969 965 963 534 494 822 1126 88 501 502 501 331 177 502 501 522 522 518 49 502 502 501 501 502 501 500 501 502 502 502 502 501 181 351 87 420 502 502 502 502 500 501 501 516 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 355 132 132 132 859 150 153 773 8 105 67 11 717 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3 Protocol independent device support routines. * * Derived from the non IP parts of dev.c 1.0.19 * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * * Additional Authors: * Florian la Roche <rzsfl@rz.uni-sb.de> * Alan Cox <gw4pts@gw4pts.ampr.org> * David Hinds <dahinds@users.sourceforge.net> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Adam Sulmicki <adam@cfar.umd.edu> * Pekka Riikonen <priikone@poesidon.pspt.fi> * * Changes: * D.J. Barrow : Fixed bug where dev->refcnt gets set * to 2 if register_netdev gets called * before net_dev_init & also removed a * few lines of code in the process. * Alan Cox : device private ioctl copies fields back. * Alan Cox : Transmit queue code does relevant * stunts to keep the queue safe. * Alan Cox : Fixed double lock. * Alan Cox : Fixed promisc NULL pointer trap * ???????? : Support the full private ioctl range * Alan Cox : Moved ioctl permission check into * drivers * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI * Alan Cox : 100 backlog just doesn't cut it when * you start doing multicast video 8) * Alan Cox : Rewrote net_bh and list manager. * Alan Cox : Fix ETH_P_ALL echoback lengths. * Alan Cox : Took out transmit every packet pass * Saved a few bytes in the ioctl handler * Alan Cox : Network driver sets packet type before * calling netif_rx. Saves a function * call a packet. * Alan Cox : Hashed net_bh() * Richard Kooijman: Timestamp fixes. * Alan Cox : Wrong field in SIOCGIFDSTADDR * Alan Cox : Device lock protection. * Alan Cox : Fixed nasty side effect of device close * changes. * Rudi Cilibrasi : Pass the right thing to * set_mac_address() * Dave Miller : 32bit quantity for the device lock to * make it work out on a Sparc. * Bjorn Ekwall : Added KERNELD hack. * Alan Cox : Cleaned up the backlog initialise. * Craig Metz : SIOCGIFCONF fix if space for under * 1 device. * Thomas Bogendoerfer : Return ENODEV for dev_open, if there * is no device open function. * Andi Kleen : Fix error reporting for SIOCGIFCONF * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF * Cyrus Durgin : Cleaned for KMOD * Adam Sulmicki : Bug Fix : Network Device Unload * A network device unload needs to purge * the backlog queue. * Paul Rusty Russell : SIOCSIFNAME * Pekka Riikonen : Netdev boot-time settings code * Andrew Morton : Make unregister_netdevice wait * indefinitely on dev->refcnt * J Hadi Salim : - Backlog queue sampling * - netif_rx() feedback */ #include <linux/uaccess.h> #include <linux/bitmap.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/isolation.h> #include <linux/sched/mm.h> #include <linux/smpboot.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/ethtool_netlink.h> #include <linux/skbuff.h> #include <linux/kthread.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> #include <net/dsa.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/gro.h> #include <net/netdev_queues.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> #include <net/tcx.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netpoll.h> #include <linux/rcupdate.h> #include <linux/delay.h> #include <net/iw_handler.h> #include <asm/current.h> #include <linux/audit.h> #include <linux/dmaengine.h> #include <linux/err.h> #include <linux/ctype.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> #include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> #include <trace/events/napi.h> #include <trace/events/net.h> #include <trace/events/skb.h> #include <trace/events/qdisc.h> #include <trace/events/xdp.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> #include <linux/hashtable.h> #include <linux/vmalloc.h> #include <linux/if_macvlan.h> #include <linux/errqueue.h> #include <linux/hrtimer.h> #include <linux/netfilter_netdev.h> #include <linux/crash_dump.h> #include <linux/sctp.h> #include <net/udp_tunnel.h> #include <linux/net_namespace.h> #include <linux/indirect_call_wrapper.h> #include <net/devlink.h> #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> #include <net/page_pool/memory_provider.h> #include <net/rps.h> #include <linux/phy_link_topology.h> #include "dev.h" #include "devmem.h" #include "net-sysfs.h" static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static inline void dev_base_seq_inc(struct net *net) { unsigned int val = net->dev_base_seq + 1; WRITE_ONCE(net->dev_base_seq, val ?: 1); } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) { return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } #ifndef CONFIG_PREEMPT_RT static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key); static int __init setup_backlog_napi_threads(char *arg) { static_branch_enable(&use_backlog_threads_key); return 0; } early_param("thread_backlog_napi", setup_backlog_napi_threads); static bool use_backlog_threads(void) { return static_branch_unlikely(&use_backlog_threads_key); } #else static bool use_backlog_threads(void) { return true; } #endif static inline void backlog_lock_irq_save(struct softnet_data *sd, unsigned long *flags) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); else local_irq_save(*flags); } static inline void backlog_lock_irq_disable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_lock_irq(&sd->input_pkt_queue.lock); else local_irq_disable(); } static inline void backlog_unlock_irq_restore(struct softnet_data *sd, unsigned long *flags) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); else local_irq_restore(*flags); } static inline void backlog_unlock_irq_enable(struct softnet_data *sd) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) spin_unlock_irq(&sd->input_pkt_queue.lock); else local_irq_enable(); } static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); if (!name_node) return NULL; INIT_HLIST_NODE(&name_node->hlist); name_node->dev = dev; name_node->name = name; return name_node; } static struct netdev_name_node * netdev_name_node_head_alloc(struct net_device *dev) { struct netdev_name_node *name_node; name_node = netdev_name_node_alloc(dev, dev->name); if (!name_node) return NULL; INIT_LIST_HEAD(&name_node->list); return name_node; } static void netdev_name_node_free(struct netdev_name_node *name_node) { kfree(name_node); } static void netdev_name_node_add(struct net *net, struct netdev_name_node *name_node) { hlist_add_head_rcu(&name_node->hlist, dev_name_hash(net, name_node->name)); } static void netdev_name_node_del(struct netdev_name_node *name_node) { hlist_del_rcu(&name_node->hlist); } static struct netdev_name_node *netdev_name_node_lookup(struct net *net, const char *name) { struct hlist_head *head = dev_name_hash(net, name); struct netdev_name_node *name_node; hlist_for_each_entry(name_node, head, hlist) if (!strcmp(name_node->name, name)) return name_node; return NULL; } static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, const char *name) { struct hlist_head *head = dev_name_hash(net, name); struct netdev_name_node *name_node; hlist_for_each_entry_rcu(name_node, head, hlist) if (!strcmp(name_node->name, name)) return name_node; return NULL; } bool netdev_name_in_use(struct net *net, const char *name) { return netdev_name_node_lookup(net, name); } EXPORT_SYMBOL(netdev_name_in_use); int netdev_name_node_alt_create(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); name_node = netdev_name_node_lookup(net, name); if (name_node) return -EEXIST; name_node = netdev_name_node_alloc(dev, name); if (!name_node) return -ENOMEM; netdev_name_node_add(net, name_node); /* The node that holds dev->name acts as a head of per-device list. */ list_add_tail_rcu(&name_node->list, &dev->name_node->list); return 0; } static void netdev_name_node_alt_free(struct rcu_head *head) { struct netdev_name_node *name_node = container_of(head, struct netdev_name_node, rcu); kfree(name_node->name); netdev_name_node_free(name_node); } static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) { netdev_name_node_del(name_node); list_del(&name_node->list); call_rcu(&name_node->rcu, netdev_name_node_alt_free); } int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); name_node = netdev_name_node_lookup(net, name); if (!name_node) return -ENOENT; /* lookup might have found our primary name or a name belonging * to another device. */ if (name_node == dev->name_node || name_node->dev != dev) return -EINVAL; __netdev_name_node_alt_destroy(name_node); return 0; } static void netdev_name_node_alt_flush(struct net_device *dev) { struct netdev_name_node *name_node, *tmp; list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) { list_del(&name_node->list); netdev_name_node_alt_free(&name_node->rcu); } } /* Device list insertion */ static void list_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); /* We reserved the ifindex, this can't fail */ WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL)); dev_base_seq_inc(net); } /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ static void unlist_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); ASSERT_RTNL(); xa_erase(&net->dev_by_index, dev->ifindex); netdev_for_each_altname(dev, name_node) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); dev_base_seq_inc(dev_net(dev)); } /* * Our notifier list */ static RAW_NOTIFIER_HEAD(netdev_chain); /* * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. */ DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = { .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock), }; EXPORT_PER_CPU_SYMBOL(softnet_data); /* Page_pool has a lockless array/stack to alloc/recycle pages. * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ DEFINE_PER_CPU(struct page_pool *, system_page_pool); #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ static const unsigned short netdev_lock_type[] = { ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; static const char *const netdev_lock_name[] = { "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; static inline unsigned short netdev_lock_pos(unsigned short dev_type) { int i; for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) if (netdev_lock_type[i] == dev_type) return i; /* the last key is used by default */ return ARRAY_SIZE(netdev_lock_type) - 1; } static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { int i; i = netdev_lock_pos(dev_type); lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], netdev_lock_name[i]); } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { int i; i = netdev_lock_pos(dev->type); lockdep_set_class_and_name(&dev->addr_list_lock, &netdev_addr_lock_key[i], netdev_lock_name[i]); } #else static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { } #endif /******************************************************************************* * * Protocol management and registration routines * *******************************************************************************/ /* * Add a protocol ID to the list. Now that the input handler is * smarter we can dispense with all the messy stuff that used to be * here. * * BEWARE!!! Protocol handlers, mangling input packets, * MUST BE last in hash buckets and checking protocol handlers * MUST start from promiscuous ptype_all chain in net_bh. * It is true now, do not change it. * Explanation follows: if protocol handler, mangling packet, will * be the first on list, it is not able to sense, that packet * is cloned and should be copied-on-write, so that it will * change it and subsequent readers will get broken packet. * --ANK (980803) */ static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; else return pt->dev ? &pt->dev->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** * dev_add_pack - add packet handler * @pt: packet type declaration * * Add a protocol handler to the networking stack. The passed &packet_type * is linked into kernel lists and may not be freed until it has been * removed from the kernel lists. * * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new packet type (until the next received packet). */ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); } EXPORT_SYMBOL(dev_add_pack); /** * __dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * The packet type might still be in use by receivers * and must not be freed until after all the CPU's have gone * through a quiescent state. */ void __dev_remove_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); struct packet_type *pt1; spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { if (pt == pt1) { list_del_rcu(&pt->list); goto out; } } pr_warn("dev_remove_pack: %p not found\n", pt); out: spin_unlock(&ptype_lock); } EXPORT_SYMBOL(__dev_remove_pack); /** * dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * This call sleeps to guarantee that no CPU is looking at the packet * type after return. */ void dev_remove_pack(struct packet_type *pt) { __dev_remove_pack(pt); synchronize_net(); } EXPORT_SYMBOL(dev_remove_pack); /******************************************************************************* * * Device Interface Subroutines * *******************************************************************************/ /** * dev_get_iflink - get 'iflink' value of a interface * @dev: targeted interface * * Indicates the ifindex the interface is linked to. * Physical interfaces have the same 'ifindex' and 'iflink' values. */ int dev_get_iflink(const struct net_device *dev) { if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev); return READ_ONCE(dev->ifindex); } EXPORT_SYMBOL(dev_get_iflink); /** * dev_fill_metadata_dst - Retrieve tunnel egress information. * @dev: targeted interface * @skb: The packet. * * For better visibility of tunnel traffic OVS needs to retrieve * egress tunnel information for a packet. Following API allows * user to get this info. */ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info; if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) return -EINVAL; info = skb_tunnel_info_unclone(skb); if (!info) return -ENOMEM; if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) return -EINVAL; return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); } EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) { int k = stack->num_paths++; if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) return NULL; return &stack->path[k]; } int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, struct net_device_path_stack *stack) { const struct net_device *last_dev; struct net_device_path_ctx ctx = { .dev = dev, }; struct net_device_path *path; int ret = 0; memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); stack->num_paths = 0; while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { last_dev = ctx.dev; path = dev_fwd_path(stack); if (!path) return -1; memset(path, 0, sizeof(struct net_device_path)); ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); if (ret < 0) return -1; if (WARN_ON_ONCE(last_dev == ctx.dev)) return -1; } if (!ctx.dev) return ret; path = dev_fwd_path(stack); if (!path) return -1; path->type = DEV_PATH_ETHERNET; path->dev = ctx.dev; return ret; } EXPORT_SYMBOL_GPL(dev_fill_forward_path); /* must be called under rcu_read_lock(), as we dont take a reference */ static struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) if (napi->napi_id == napi_id) return napi; return NULL; } /* must be called under rcu_read_lock(), as we dont take a reference */ static struct napi_struct * netdev_napi_by_id(struct net *net, unsigned int napi_id) { struct napi_struct *napi; napi = napi_by_id(napi_id); if (!napi) return NULL; if (WARN_ON_ONCE(!napi->dev)) return NULL; if (!net_eq(net, dev_net(napi->dev))) return NULL; return napi; } /** * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it * @net: the applicable net namespace * @napi_id: ID of a NAPI of a target device * * Find a NAPI instance with @napi_id. Lock its device. * The device must be in %NETREG_REGISTERED state for lookup to succeed. * netdev_unlock() must be called to release it. * * Return: pointer to NAPI, its device with lock held, NULL if not found. */ struct napi_struct * netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) { struct napi_struct *napi; struct net_device *dev; rcu_read_lock(); napi = netdev_napi_by_id(net, napi_id); if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) { rcu_read_unlock(); return NULL; } dev = napi->dev; dev_hold(dev); rcu_read_unlock(); dev = __netdev_put_lock(dev); if (!dev) return NULL; rcu_read_lock(); napi = netdev_napi_by_id(net, napi_id); if (napi && napi->dev != dev) napi = NULL; rcu_read_unlock(); if (!napi) netdev_unlock(dev); return napi; } /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. Must be called under RTNL semaphore. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ struct net_device *__dev_get_by_name(struct net *net, const char *name) { struct netdev_name_node *node_name; node_name = netdev_name_node_lookup(net, name); return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); /** * dev_get_by_name_rcu - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. */ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { struct netdev_name_node *node_name; node_name = netdev_name_node_lookup_rcu(net, name); return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); /* Deprecated for new users, call netdev_get_by_name() instead */ struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_name); /** * netdev_get_by_name() - find a device by its name * @net: the applicable net namespace * @name: name to find * @tracker: tracking object for the acquired reference * @gfp: allocation flags for the tracker * * Find an interface by name. This can be called from any * context and does its own locking. The returned handle has * the usage count incremented and the caller must use netdev_put() to * release it when it is no longer needed. %NULL is returned if no * matching device is found. */ struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; dev = dev_get_by_name(net, name); if (dev) netdev_tracker_alloc(dev, tracker, gfp); return dev; } EXPORT_SYMBOL(netdev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold the RTNL semaphore. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(__dev_get_by_index); /** * dev_get_by_index_rcu - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry_rcu(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(dev_get_by_index_rcu); /* Deprecated for new users, call netdev_get_by_index() instead */ struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_index); /** * netdev_get_by_index() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * @tracker: tracking object for the acquired reference * @gfp: allocation flags for the tracker * * Search for an interface by index. Returns NULL if the device * is not found or a pointer to the device. The device returned has * had a reference added and the pointer is safe until the user calls * netdev_put() to indicate they have finished with it. */ struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp) { struct net_device *dev; dev = dev_get_by_index(net, ifindex); if (dev) netdev_tracker_alloc(dev, tracker, gfp); return dev; } EXPORT_SYMBOL(netdev_get_by_index); /** * dev_get_by_napi_id - find a device by napi_id * @napi_id: ID of the NAPI struct * * Search for an interface by NAPI ID. Returns %NULL if the device * is not found or a pointer to the device. The device has not had * its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ struct net_device *dev_get_by_napi_id(unsigned int napi_id) { struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); if (!napi_id_valid(napi_id)) return NULL; napi = napi_by_id(napi_id); return napi ? napi->dev : NULL; } /* Release the held reference on the net_device, and if the net_device * is still registered try to lock the instance lock. If device is being * unregistered NULL will be returned (but the reference has been released, * either way!) * * This helper is intended for locking net_device after it has been looked up * using a lockless lookup helper. Lock prevents the instance from going away. */ struct net_device *__netdev_put_lock(struct net_device *dev) { netdev_lock(dev); if (dev->reg_state > NETREG_REGISTERED) { netdev_unlock(dev); dev_put(dev); return NULL; } dev_put(dev); return dev; } /** * netdev_get_by_index_lock() - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. If a valid device * with @ifindex is found it will be returned with netdev->lock held. * netdev_unlock() must be called to release it. * * Return: pointer to a device with lock held, NULL if not found. */ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) { struct net_device *dev; dev = dev_get_by_index(net, ifindex); if (!dev) return NULL; return __netdev_put_lock(dev); } struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index) { if (dev) netdev_unlock(dev); do { rcu_read_lock(); dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); if (!dev) { rcu_read_unlock(); return NULL; } dev_hold(dev); rcu_read_unlock(); dev = __netdev_put_lock(dev); if (dev) return dev; (*index)++; } while (true); } static DEFINE_SEQLOCK(netdev_rename_lock); void netdev_copy_name(struct net_device *dev, char *name) { unsigned int seq; do { seq = read_seqbegin(&netdev_rename_lock); strscpy(name, dev->name, IFNAMSIZ); } while (read_seqretry(&netdev_rename_lock, seq)); } /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @ifindex: the ifindex of the interface to get the name from. */ int netdev_get_name(struct net *net, char *name, int ifindex) { struct net_device *dev; int ret; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { ret = -ENODEV; goto out; } netdev_copy_name(dev, name); ret = 0; out: rcu_read_unlock(); return ret; } static bool dev_addr_cmp(struct net_device *dev, unsigned short type, const char *ha) { return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); } /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. * The caller must hold RCU. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * */ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *ha) { struct net_device *dev; for_each_netdev_rcu(net, dev) if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); /** * dev_getbyhwaddr() - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold * rtnl_lock. * * Context: rtnl_lock() must be held. * Return: pointer to the net_device, or NULL if not found */ struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, const char *ha) { struct net_device *dev; ASSERT_RTNL(); for_each_netdev(net, dev) if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; rcu_read_lock(); for_each_netdev_rcu(net, dev) if (dev->type == type) { dev_hold(dev); ret = dev; break; } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(dev_getfirstbyhwtype); /** * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside * rtnl_lock(), and result refcount is unchanged. */ struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) { struct net_device *dev, *ret; ASSERT_RTNL(); ret = NULL; for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; } } return ret; } EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device * @name: name string * * Network device names need to be valid file names to * allow sysfs to work. We also disallow any kind of * whitespace. */ bool dev_valid_name(const char *name) { if (*name == '\0') return false; if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) return false; if (!strcmp(name, ".") || !strcmp(name, "..")) return false; while (*name) { if (*name == '/' || *name == ':' || isspace(*name)) return false; name++; } return true; } EXPORT_SYMBOL(dev_valid_name); /** * __dev_alloc_name - allocate a name for a device * @net: network namespace to allocate the device name in * @name: name format string * @res: result name string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. */ static int __dev_alloc_name(struct net *net, const char *name, char *res) { int i = 0; const char *p; const int max_netdevices = 8*PAGE_SIZE; unsigned long *inuse; struct net_device *d; char buf[IFNAMSIZ]; /* Verify the string as this thing may have come from the user. * There must be one "%d" and no other "%" characters. */ p = strchr(name, '%'); if (!p || p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; /* Use one page as a bit array of possible slots */ inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC); if (!inuse) return -ENOMEM; for_each_netdev(net, d) { struct netdev_name_node *name_node; netdev_for_each_altname(d, name_node) { if (!sscanf(name_node->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, name_node->name, IFNAMSIZ)) __set_bit(i, inuse); } if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ)) __set_bit(i, inuse); } i = find_first_zero_bit(inuse, max_netdevices); bitmap_free(inuse); if (i == max_netdevices) return -ENFILE; /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ strscpy(buf, name, IFNAMSIZ); snprintf(res, IFNAMSIZ, buf, i); return i; } /* Returns negative errno or allocated unit id (see __dev_alloc_name()) */ static int dev_prep_valid_name(struct net *net, struct net_device *dev, const char *want_name, char *out_name, int dup_errno) { if (!dev_valid_name(want_name)) return -EINVAL; if (strchr(want_name, '%')) return __dev_alloc_name(net, want_name, out_name); if (netdev_name_in_use(net, want_name)) return -dup_errno; if (out_name != want_name) strscpy(out_name, want_name, IFNAMSIZ); return 0; } /** * dev_alloc_name - allocate a name for a device * @dev: device * @name: name format string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. */ int dev_alloc_name(struct net_device *dev, const char *name) { return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE); } EXPORT_SYMBOL(dev_alloc_name); static int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { int ret; ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST); return ret < 0 ? ret : 0; } /** * dev_change_name - change name of a device * @dev: device * @newname: name (or format string) must be at least IFNAMSIZ * * Change name of a device, can pass format strings "eth%d". * for wildcarding. */ int dev_change_name(struct net_device *dev, const char *newname) { struct net *net = dev_net(dev); unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; ASSERT_RTNL_NET(net); if (!strncmp(newname, dev->name, IFNAMSIZ)) return 0; memcpy(oldname, dev->name, IFNAMSIZ); write_seqlock_bh(&netdev_rename_lock); err = dev_get_valid_name(net, dev, newname); write_sequnlock_bh(&netdev_rename_lock); if (err < 0) return err; if (oldname[0] && !strchr(oldname, '%')) netdev_info(dev, "renamed from %s%s\n", oldname, dev->flags & IFF_UP ? " (while UP)" : ""); old_assign_type = dev->name_assign_type; WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); WRITE_ONCE(dev->name_assign_type, old_assign_type); return ret; } netdev_adjacent_rename_links(dev, oldname); netdev_name_node_del(dev->name_node); synchronize_net(); netdev_name_node_add(net, dev->name_node); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); if (ret) { /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); memcpy(oldname, newname, IFNAMSIZ); WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; goto rollback; } else { netdev_err(dev, "name change rollback failed: %d\n", ret); } } return err; } /** * dev_set_alias - change ifalias of a device * @dev: device * @alias: name up to IFALIASZ * @len: limit of bytes to copy from info * * Set ifalias for a device, */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; if (len) { new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); if (!new_alias) return -ENOMEM; memcpy(new_alias->ifalias, alias, len); new_alias->ifalias[len] = 0; } mutex_lock(&ifalias_mutex); new_alias = rcu_replace_pointer(dev->ifalias, new_alias, mutex_is_locked(&ifalias_mutex)); mutex_unlock(&ifalias_mutex); if (new_alias) kfree_rcu(new_alias, rcuhead); return len; } EXPORT_SYMBOL(dev_set_alias); /** * dev_get_alias - get ifalias of a device * @dev: device * @name: buffer to store name of ifalias * @len: size of buffer * * get ifalias for a device. Caller must make sure dev cannot go * away, e.g. rcu read lock or own a reference count to device. */ int dev_get_alias(const struct net_device *dev, char *name, size_t len) { const struct dev_ifalias *alias; int ret = 0; rcu_read_lock(); alias = rcu_dereference(dev->ifalias); if (alias) ret = snprintf(name, len, "%s", alias->ifalias); rcu_read_unlock(); return ret; } /** * netdev_features_change - device changes features * @dev: device to cause notification * * Called to indicate a device has changed features. */ void netdev_features_change(struct net_device *dev) { call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); } EXPORT_SYMBOL(netdev_features_change); /** * netdev_state_change - device changes state * @dev: device to cause notification * * Called to indicate a device has changed state. This function calls * the notifier chains for netdev_chain and sends a NEWLINK message * to the routing socket. */ void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { struct netdev_notifier_change_info change_info = { .info.dev = dev, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); } } EXPORT_SYMBOL(netdev_state_change); /** * __netdev_notify_peers - notify network peers about existence of @dev, * to be called when rtnl lock is already held. * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. */ void __netdev_notify_peers(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); } EXPORT_SYMBOL(__netdev_notify_peers); /** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. */ void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); static int napi_threaded_poll(void *data); static int napi_kthread_create(struct napi_struct *n) { int err = 0; /* Create and wake up the kthread once to put it in * TASK_INTERRUPTIBLE mode to avoid the blocked task * warning and work with loadavg. */ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", n->dev->name, n->napi_id); if (IS_ERR(n->thread)) { err = PTR_ERR(n->thread); pr_err("kthread_run failed with err %d\n", err); n->thread = NULL; } return err; } static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; ASSERT_RTNL(); dev_addr_check(dev); if (!netif_device_present(dev)) { /* may be detached because parent is runtime-suspended */ if (dev->dev.parent) pm_runtime_resume(dev->dev.parent); if (!netif_device_present(dev)) return -ENODEV; } /* Block netpoll from trying to do any rx path servicing. * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ netpoll_poll_disable(dev); ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; set_bit(__LINK_STATE_START, &dev->state); if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); netpoll_poll_enable(dev); if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { netif_set_up(dev, true); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); } return ret; } /** * dev_open - prepare an interface for use. * @dev: device to open * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally * the device is moved into the up state and a %NETDEV_UP message is * sent to the netdev notifier chain. * * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. */ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; ret = __dev_open(dev, extack); if (ret < 0) return ret; rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; } EXPORT_SYMBOL(dev_open); static void __dev_close_many(struct list_head *head) { struct net_device *dev; ASSERT_RTNL(); might_sleep(); list_for_each_entry(dev, head, close_list) { /* Temporarily disable netpoll until the interface is down */ netpoll_poll_disable(dev); call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); /* Synchronize to scheduled poll. We cannot touch poll list, it * can be even on different cpu. So just clear netif_running(). * * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head); list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* * Call the device specific close. This cannot fail. * Only if device is UP * * We allow it to be called even after a DETACH hot-plug * event. */ if (ops->ndo_stop) ops->ndo_stop(dev); netif_set_up(dev, false); netpoll_poll_enable(dev); } } static void __dev_close(struct net_device *dev) { LIST_HEAD(single); list_add(&dev->close_list, &single); __dev_close_many(&single); list_del(&single); } void dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; /* Remove the devices that don't need to be closed */ list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) list_del_init(&dev->close_list); __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_DOWN, dev); if (unlink) list_del_init(&dev->close_list); } } EXPORT_SYMBOL(dev_close_many); /** * dev_close - shutdown an interface. * @dev: device to shutdown * * This function moves an active device into down state. A * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier * chain. */ void dev_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); list_add(&dev->close_list, &single); dev_close_many(&single, true); list_del(&single); } } EXPORT_SYMBOL(dev_close); /** * dev_disable_lro - disable Large Receive Offload on a device * @dev: device * * Disable Large Receive Offload (LRO) on a net device. Must be * called under RTNL. This is needed if received packets may be * forwarded to another interface. */ void dev_disable_lro(struct net_device *dev) { struct net_device *lower_dev; struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); netdev_for_each_lower_dev(dev, lower_dev, iter) dev_disable_lro(lower_dev); } EXPORT_SYMBOL(dev_disable_lro); /** * dev_disable_gro_hw - disable HW Generic Receive Offload on a device * @dev: device * * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be * called under RTNL. This is needed if Generic XDP is installed on * the device. */ static void dev_disable_gro_hw(struct net_device *dev) { dev->wanted_features &= ~NETIF_F_GRO_HW; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_GRO_HW)) netdev_WARN(dev, "failed to disable GRO_HW!\n"); } const char *netdev_cmd_to_name(enum netdev_cmd cmd) { #define N(val) \ case NETDEV_##val: \ return "NETDEV_" __stringify(val); switch (cmd) { N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) N(XDP_FEAT_CHANGE) } #undef N return "UNKNOWN_NETDEV_EVENT"; } EXPORT_SYMBOL_GPL(netdev_cmd_to_name); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { struct netdev_notifier_info info = { .dev = dev, }; return nb->notifier_call(nb, val, &info); } static int call_netdevice_register_notifiers(struct notifier_block *nb, struct net_device *dev) { int err; err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) return err; if (!(dev->flags & IFF_UP)) return 0; call_netdevice_notifier(nb, NETDEV_UP, dev); return 0; } static void call_netdevice_unregister_notifiers(struct notifier_block *nb, struct net_device *dev) { if (dev->flags & IFF_UP) { call_netdevice_notifier(nb, NETDEV_GOING_DOWN, dev); call_netdevice_notifier(nb, NETDEV_DOWN, dev); } call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } static int call_netdevice_register_net_notifiers(struct notifier_block *nb, struct net *net) { struct net_device *dev; int err; for_each_netdev(net, dev) { err = call_netdevice_register_notifiers(nb, dev); if (err) goto rollback; } return 0; rollback: for_each_netdev_continue_reverse(net, dev) call_netdevice_unregister_notifiers(nb, dev); return err; } static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, struct net *net) { struct net_device *dev; for_each_netdev(net, dev) call_netdevice_unregister_notifiers(nb, dev); } static int dev_boot_phase = 1; /** * register_netdevice_notifier - register a network notifier block * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier(struct notifier_block *nb) { struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); /* When RTNL is removed, we need protection for netdev_chain. */ rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { __rtnl_net_lock(net); err = call_netdevice_register_net_notifiers(nb, net); __rtnl_net_unlock(net); if (err) goto rollback; } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; rollback: for_each_net_continue_reverse(net) { __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); __rtnl_net_unlock(net); } raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } EXPORT_SYMBOL(register_netdevice_notifier); /** * unregister_netdevice_notifier - unregister a network notifier block * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier(). The notifier is unlinked into the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. */ int unregister_netdevice_notifier(struct notifier_block *nb) { struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) goto unlock; for_each_net(net) { __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); __rtnl_net_unlock(net); } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier); static int __register_netdevice_notifier_net(struct net *net, struct notifier_block *nb, bool ignore_call_fail) { int err; err = raw_notifier_chain_register(&net->netdev_chain, nb); if (err) return err; if (dev_boot_phase) return 0; err = call_netdevice_register_net_notifiers(nb, net); if (err && !ignore_call_fail) goto chain_unregister; return 0; chain_unregister: raw_notifier_chain_unregister(&net->netdev_chain, nb); return err; } static int __unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; err = raw_notifier_chain_unregister(&net->netdev_chain, nb); if (err) return err; call_netdevice_unregister_net_notifiers(nb, net); return 0; } /** * register_netdevice_notifier_net - register a per-netns network notifier block * @net: network namespace * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(register_netdevice_notifier_net); /** * unregister_netdevice_notifier_net - unregister a per-netns * network notifier block * @net: network namespace * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier_net(). The notifier is unlinked from the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. */ int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; rtnl_net_lock(net); err = __unregister_netdevice_notifier_net(net, nb); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_net); static void __move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, struct notifier_block *nb) { __unregister_netdevice_notifier_net(src_net, nb); __register_netdevice_notifier_net(dst_net, nb, true); } static void rtnl_net_dev_lock(struct net_device *dev) { bool again; do { struct net *net; again = false; /* netns might be being dismantled. */ rcu_read_lock(); net = dev_net_rcu(dev); net_passive_inc(net); rcu_read_unlock(); rtnl_net_lock(net); #ifdef CONFIG_NET_NS /* dev might have been moved to another netns. */ if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) { rtnl_net_unlock(net); net_passive_dec(net); again = true; } #endif } while (again); } static void rtnl_net_dev_unlock(struct net_device *dev) { struct net *net = dev_net(dev); rtnl_net_unlock(net); net_passive_dec(net); } int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { int err; rtnl_net_dev_lock(dev); err = __register_netdevice_notifier_net(dev_net(dev), nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } rtnl_net_dev_unlock(dev); return err; } EXPORT_SYMBOL(register_netdevice_notifier_dev_net); int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { int err; rtnl_net_dev_lock(dev); list_del(&nn->list); err = __unregister_netdevice_notifier_net(dev_net(dev), nb); rtnl_net_dev_unlock(dev); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); static void move_netdevice_notifiers_dev_net(struct net_device *dev, struct net *net) { struct netdev_net_notifier *nn; list_for_each_entry(nn, &dev->net_notifier_list, list) __move_netdevice_notifier_net(dev_net(dev), net, nn->nb); } /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); int ret; ASSERT_RTNL(); /* Run per-netns notifier block chain first, then run the global one. * Hopefully, one day, the global one is going to be removed after * all notifier block registrators get converted to be per-netns. */ ret = raw_notifier_call_chain(&net->netdev_chain, val, info); if (ret & NOTIFY_STOP_MASK) return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } /** * call_netdevice_notifiers_info_robust - call per-netns notifier blocks * for and rollback on error * @val_up: value passed unmodified to notifier function * @val_down: value passed unmodified to the notifier function when * recovering from an error on @val_up * @info: notifier information data * * Call all per-netns network notifier blocks, but not notifier blocks on * the global notifier chain. Parameters and return value are as for * raw_notifier_call_chain_robust(). */ static int call_netdevice_notifiers_info_robust(unsigned long val_up, unsigned long val_down, struct netdev_notifier_info *info) { struct net *net = dev_net(info->dev); ASSERT_RTNL(); return raw_notifier_call_chain_robust(&net->netdev_chain, val_up, val_down, info); } static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_notifier_info info = { .dev = dev, .extack = extack, }; return call_netdevice_notifiers_info(val, &info); } /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); /** * call_netdevice_notifiers_mtu - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * @arg: additional u32 argument passed to the notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ static int call_netdevice_notifiers_mtu(unsigned long val, struct net_device *dev, u32 arg) { struct netdev_notifier_info_ext info = { .info.dev = dev, .ext.mtu = arg, }; BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); return call_netdevice_notifiers_info(val, &info.info); } #ifdef CONFIG_NET_INGRESS static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); void net_inc_ingress_queue(void) { static_branch_inc(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_ingress_queue); void net_dec_ingress_queue(void) { static_branch_dec(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif #ifdef CONFIG_NET_EGRESS static DEFINE_STATIC_KEY_FALSE(egress_needed_key); void net_inc_egress_queue(void) { static_branch_inc(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_egress_queue); void net_dec_egress_queue(void) { static_branch_dec(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif #ifdef CONFIG_NET_CLS_ACT DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); EXPORT_SYMBOL(tcf_sw_enabled_key); #endif DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; static void netstamp_clear(struct work_struct *work) { int deferred = atomic_xchg(&netstamp_needed_deferred, 0); int wanted; wanted = atomic_add_return(deferred, &netstamp_wanted); if (wanted > 0) static_branch_enable(&netstamp_needed_key); else static_branch_disable(&netstamp_needed_key); } static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted = atomic_read(&netstamp_wanted); while (wanted > 0) { if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1)) return; } atomic_inc(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_inc(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted = atomic_read(&netstamp_wanted); while (wanted > 1) { if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1)) return; } atomic_dec(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_dec(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); } #define net_timestamp_check(COND, SKB) \ if (static_branch_unlikely(&netstamp_needed_key)) { \ if ((COND) && !(SKB)->tstamp) \ (SKB)->tstamp = ktime_get_real(); \ } \ bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { return __is_skb_forwardable(dev, skb, true); } EXPORT_SYMBOL_GPL(is_skb_forwardable); static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, bool check_mtu) { int ret = ____dev_forward_skb(dev, skb, check_mtu); if (likely(!ret)) { skb->protocol = eth_type_trans(skb, dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } return ret; } int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb2(dev, skb, true); } EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif * * @dev: destination network device * @skb: buffer to forward * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped, but freed) * * dev_forward_skb can be used for injecting an skb from the * start_xmit function of one device into the receive queue * of another device. * * The receiving device may be in another namespace, so * we have to clear all information in the skb that could * impact namespace isolation. */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); } static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } static inline void deliver_ptype_list_skb(struct sk_buff *skb, struct packet_type **pt, struct net_device *orig_dev, __be16 type, struct list_head *ptype_list) { struct packet_type *ptype, *pt_prev = *pt; list_for_each_entry_rcu(ptype, ptype_list, list) { if (ptype->type != type) continue; if (pt_prev) deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } *pt = pt_prev; } static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) return false; if (ptype->id_match) return ptype->id_match(ptype, skb->sk); else if ((struct sock *)ptype->af_packet_priv == skb->sk) return true; return false; } /** * dev_nit_active - return true if any network interface taps are in use * * @dev: network device to check for the presence of taps */ bool dev_nit_active(struct net_device *dev) { return !list_empty(&net_hotdata.ptype_all) || !list_empty(&dev->ptype_all); } EXPORT_SYMBOL_GPL(dev_nit_active); /* * Support routine. Sends outgoing frames to any network * taps currently in use. */ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct list_head *ptype_list = &net_hotdata.ptype_all; struct packet_type *ptype, *pt_prev = NULL; struct sk_buff *skb2 = NULL; rcu_read_lock(); again: list_for_each_entry_rcu(ptype, ptype_list, list) { if (READ_ONCE(ptype->ignore_outgoing)) continue; /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ if (skb_loop_sk(ptype, skb)) continue; if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; continue; } /* need to clone skb, done only once */ skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) goto out_unlock; net_timestamp_set(skb2); /* skb->nh should be correctly * set by sender, so that the second statement is * just protection against buggy protocols. */ skb_reset_mac_header(skb2); if (skb_network_header(skb2) < skb2->data || skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ntohs(skb2->protocol), dev->name); skb_reset_network_header(skb2); } skb2->transport_header = skb2->network_header; skb2->pkt_type = PACKET_OUTGOING; pt_prev = ptype; } if (ptype_list == &net_hotdata.ptype_all) { ptype_list = &dev->ptype_all; goto again; } out_unlock: if (pt_prev) { if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); else kfree_skb(skb2); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); /** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change * @dev: Network device * @txq: number of queues available * * If real_num_tx_queues is changed the tc mappings may no longer be * valid. To resolve this verify the tc mapping remains valid and if * not NULL the mapping. With no priorities mapping to this * offset/count pair it will no longer be used. In the worst case TC0 * is invalid nothing can be done so disable priority mappings. If is * expected that drivers will fix this mapping if they can before * calling netif_set_real_num_tx_queues. */ static void netif_setup_tc(struct net_device *dev, unsigned int txq) { int i; struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; /* If TC0 is invalidated disable TC mapping */ if (tc->offset + tc->count > txq) { netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); dev->num_tc = 0; return; } /* Invalidated prio to tc mappings set to TC0 */ for (i = 1; i < TC_BITMASK + 1; i++) { int q = netdev_get_prio_tc_map(dev, i); tc = &dev->tc_to_txq[q]; if (tc->offset + tc->count > txq) { netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", i, q); netdev_set_prio_tc_map(dev, i, 0); } } } int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) { if (dev->num_tc) { struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i; /* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i; } /* didn't find it, just return -1 to indicate no match */ return -1; } return 0; } EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS static struct static_key xps_needed __read_mostly; static struct static_key xps_rxqs_needed __read_mostly; static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_dev_maps *old_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; for (pos = map->len; pos--;) { if (map->queues[pos] != index) continue; if (map->len > 1) { map->queues[pos] = map->queues[--map->len]; break; } if (old_maps) RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; } return true; } static bool remove_xps_queue_cpu(struct net_device *dev, struct xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { int num_tc = dev_maps->num_tc; bool active = false; int tci; for (tci = cpu * num_tc; num_tc--; tci++) { int i, j; for (i = count, j = offset; i--; j++) { if (!remove_xps_queue(dev_maps, NULL, tci, j)) break; } active |= i < 0; } return active; } static void reset_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, enum xps_map_type type) { static_key_slow_dec_cpuslocked(&xps_needed); if (type == XPS_RXQS) static_key_slow_dec_cpuslocked(&xps_rxqs_needed); RCU_INIT_POINTER(dev->xps_maps[type], NULL); kfree_rcu(dev_maps, rcu); } static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, u16 offset, u16 count) { struct xps_dev_maps *dev_maps; bool active = false; int i, j; dev_maps = xmap_dereference(dev->xps_maps[type]); if (!dev_maps) return; for (j = 0; j < dev_maps->nr_ids; j++) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) reset_xps_maps(dev, dev_maps, type); if (type == XPS_CPUS) { for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } } static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { if (!static_key_false(&xps_needed)) return; cpus_read_lock(); mutex_lock(&xps_map_mutex); if (static_key_false(&xps_rxqs_needed)) clean_xps_maps(dev, XPS_RXQS, offset, count); clean_xps_maps(dev, XPS_CPUS, offset, count); mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) { netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); } static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; int i, pos; for (pos = 0; map && pos < map->len; pos++) { if (map->queues[pos] != index) continue; return map; } /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; alloc_len = map->alloc_len * 2; } /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's * map */ if (is_rxqs_map) new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); else new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, cpu_to_node(attr_index)); if (!new_map) return NULL; for (i = 0; i < pos; i++) new_map->queues[i] = map->queues[i]; new_map->alloc_len = alloc_len; new_map->len = pos; return new_map; } /* Copy xps maps at a given index */ static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, struct xps_dev_maps *new_dev_maps, int index, int tc, bool skip_tc) { int i, tci = index * dev_maps->num_tc; struct xps_map *map; /* copy maps belonging to foreign traffic classes */ for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc && skip_tc) continue; /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } } /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) { struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; const unsigned long *online_mask = NULL; bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; unsigned int nr_ids; WARN_ON_ONCE(index >= dev->num_tx_queues); if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; if (num_tc < 0) return -EINVAL; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; } mutex_lock(&xps_map_mutex); dev_maps = xmap_dereference(dev->xps_maps[type]); if (type == XPS_RXQS) { maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); nr_ids = nr_cpu_ids; } if (maps_sz < L1_CACHE_BYTES) maps_sz = L1_CACHE_BYTES; /* The old dev_maps could be larger or smaller than the one we're * setting up now, as dev->num_tc or nr_ids could have been updated in * between. We could try to be smart, but let's be safe instead and only * copy foreign traffic classes if the two map sizes match. */ if (dev_maps && dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) copy = true; /* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), j < nr_ids;) { if (!new_dev_maps) { new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { mutex_unlock(&xps_map_mutex); return -ENOMEM; } new_dev_maps->nr_ids = nr_ids; new_dev_maps->num_tc = num_tc; } tci = j * num_tc + tc; map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; if (!dev_maps) { /* Increment static keys at most once per type */ static_key_slow_inc_cpuslocked(&xps_needed); if (type == XPS_RXQS) static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } for (j = 0; j < nr_ids; j++) { bool skip_tc = false; tci = j * num_tc + tc; if (netif_attr_test_mask(j, mask, nr_ids) && netif_attr_test_online(j, online_mask, nr_ids)) { /* add tx-queue to CPU/rx-queue maps */ int pos = 0; skip_tc = true; map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA if (type == XPS_CPUS) { if (numa_node_id == -2) numa_node_id = cpu_to_node(j); else if (numa_node_id != cpu_to_node(j)) numa_node_id = -1; } #endif } if (copy) xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, skip_tc); } rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; for (j = 0; j < dev_maps->nr_ids; j++) { for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) continue; if (copy) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); if (map == new_map) continue; } RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); } } old_dev_maps = dev_maps; out_no_old_maps: dev_maps = new_dev_maps; active = true; out_no_new_maps: if (type == XPS_CPUS) /* update Tx queue numa node */ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); if (!dev_maps) goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ for (j = 0; j < dev_maps->nr_ids; j++) { tci = j * dev_maps->num_tc; for (i = 0; i < dev_maps->num_tc; i++, tci++) { if (i == tc && netif_attr_test_mask(j, mask, dev_maps->nr_ids) && netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) continue; active |= remove_xps_queue(dev_maps, copy ? old_dev_maps : NULL, tci, index); } } if (old_dev_maps) kfree_rcu(old_dev_maps, rcu); /* free map if not active */ if (!active) reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); return 0; error: /* remove any maps that we added */ for (j = 0; j < nr_ids; j++) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) kfree(new_map); } } mutex_unlock(&xps_map_mutex); kfree(new_dev_maps); return -ENOMEM; } EXPORT_SYMBOL_GPL(__netif_set_xps_queue); int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { int ret; cpus_read_lock(); ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); cpus_read_unlock(); return ret; } EXPORT_SYMBOL(netif_set_xps_queue); #endif static void netdev_unbind_all_sb_channels(struct net_device *dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; /* Unbind any subordinate channels */ while (txq-- != &dev->_tx[0]) { if (txq->sb_dev) netdev_unbind_sb_channel(dev, txq->sb_dev); } } void netdev_reset_tc(struct net_device *dev) { #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); /* Reset TC configuration of device */ dev->num_tc = 0; memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); } EXPORT_SYMBOL(netdev_reset_tc); int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) { if (tc >= dev->num_tc) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues(dev, offset, count); #endif dev->tc_to_txq[tc].count = count; dev->tc_to_txq[tc].offset = offset; return 0; } EXPORT_SYMBOL(netdev_set_tc_queue); int netdev_set_num_tc(struct net_device *dev, u8 num_tc) { if (num_tc > TC_MAX_QUEUE) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); dev->num_tc = num_tc; return 0; } EXPORT_SYMBOL(netdev_set_num_tc); void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(sb_dev, 0); #endif memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); while (txq-- != &dev->_tx[0]) { if (txq->sb_dev == sb_dev) txq->sb_dev = NULL; } } EXPORT_SYMBOL(netdev_unbind_sb_channel); int netdev_bind_sb_channel_queue(struct net_device *dev, struct net_device *sb_dev, u8 tc, u16 count, u16 offset) { /* Make certain the sb_dev and dev are already configured */ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) return -EINVAL; /* We cannot hand out queues we don't have */ if ((offset + count) > dev->real_num_tx_queues) return -EINVAL; /* Record the mapping */ sb_dev->tc_to_txq[tc].count = count; sb_dev->tc_to_txq[tc].offset = offset; /* Provide a way for Tx queue to find the tc_to_txq map or * XPS map for itself. */ while (count--) netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; return 0; } EXPORT_SYMBOL(netdev_bind_sb_channel_queue); int netdev_set_sb_channel(struct net_device *dev, u16 channel) { /* Do not use a multiqueue device to represent a subordinate channel */ if (netif_is_multiqueue(dev)) return -ENODEV; /* We allow channels 1 - 32767 to be used for subordinate channels. * Channel 0 is meant to be "native" mode and used only to represent * the main root device. We allow writing 0 to reset the device back * to normal mode after being used as a subordinate channel. */ if (channel > S16_MAX) return -EINVAL; dev->num_tc = -channel; return 0; } EXPORT_SYMBOL(netdev_set_sb_channel); /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. */ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { bool disabling; int rc; disabling = txq < dev->real_num_tx_queues; if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); if (rc) return rc; if (dev->num_tc) netif_setup_tc(dev, txq); net_shaper_set_real_num_tx_queues(dev, txq); dev_qdisc_change_real_num_tx(dev, txq); dev->real_num_tx_queues = txq; if (disabling) { synchronize_net(); qdisc_reset_all_tx_gt(dev, txq); #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, txq); #endif } } else { dev->real_num_tx_queues = txq; } return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); #ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device * @rxq: Actual number of RX queues * * This must be called either with the rtnl_lock held or before * registration of the net device. Returns 0 on success, or a * negative error code. If called before registration, it always * succeeds. */ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { int rc; if (rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); if (rc) return rc; } dev->real_num_rx_queues = rxq; return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif /** * netif_set_real_num_queues - set actual number of RX and TX queues used * @dev: Network device * @txq: Actual number of TX queues * @rxq: Actual number of RX queues * * Set the real number of both TX and RX queues. * Does nothing if the number of queues is already correct. */ int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq) { unsigned int old_rxq = dev->real_num_rx_queues; int err; if (txq < 1 || txq > dev->num_tx_queues || rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; /* Start from increases, so the error path only does decreases - * decreases can't fail. */ if (rxq > dev->real_num_rx_queues) { err = netif_set_real_num_rx_queues(dev, rxq); if (err) return err; } if (txq > dev->real_num_tx_queues) { err = netif_set_real_num_tx_queues(dev, txq); if (err) goto undo_rx; } if (rxq < dev->real_num_rx_queues) WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); if (txq < dev->real_num_tx_queues) WARN_ON(netif_set_real_num_tx_queues(dev, txq)); return 0; undo_rx: WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); return err; } EXPORT_SYMBOL(netif_set_real_num_queues); /** * netif_set_tso_max_size() - set the max size of TSO frames supported * @dev: netdev to update * @size: max skb->len of a TSO frame * * Set the limit on the size of TSO super-frames the device can handle. * Unless explicitly set the stack will assume the value of * %GSO_LEGACY_MAX_SIZE. */ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) { dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); if (size < READ_ONCE(dev->gso_ipv4_max_size)) netif_set_gso_ipv4_max_size(dev, size); } EXPORT_SYMBOL(netif_set_tso_max_size); /** * netif_set_tso_max_segs() - set the max number of segs supported for TSO * @dev: netdev to update * @segs: max number of TCP segments * * Set the limit on the number of TCP segments the device can generate from * a single TSO super-frame. * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. */ void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) { dev->tso_max_segs = segs; if (segs < READ_ONCE(dev->gso_max_segs)) netif_set_gso_max_segs(dev, segs); } EXPORT_SYMBOL(netif_set_tso_max_segs); /** * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper * @to: netdev to update * @from: netdev from which to copy the limits */ void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) { netif_set_tso_max_size(to, from->tso_max_size); netif_set_tso_max_segs(to, from->tso_max_segs); } EXPORT_SYMBOL(netif_inherit_tso_max); /** * netif_get_num_default_rss_queues - default number of RSS queues * * Default value is the number of physical cores if there are only 1 or 2, or * divided by 2 if there are more. */ int netif_get_num_default_rss_queues(void) { cpumask_var_t cpus; int cpu, count = 0; if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) return 1; cpumask_copy(cpus, cpu_online_mask); for_each_cpu(cpu, cpus) { ++count; cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); } free_cpumask_var(cpus); return count > 2 ? DIV_ROUND_UP(count, 2) : count; } EXPORT_SYMBOL(netif_get_num_default_rss_queues); static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } void __netif_schedule(struct Qdisc *q) { if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } EXPORT_SYMBOL(__netif_schedule); struct dev_kfree_skb_cb { enum skb_drop_reason reason; }; static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) { return (struct dev_kfree_skb_cb *)skb->cb; } void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); if (!netif_xmit_stopped(txq)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); } rcu_read_unlock(); } EXPORT_SYMBOL(netif_schedule_queue); void netif_tx_wake_queue(struct netdev_queue *dev_queue) { if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { struct Qdisc *q; rcu_read_lock(); q = rcu_dereference(dev_queue->qdisc); __netif_schedule(q); rcu_read_unlock(); } } EXPORT_SYMBOL(netif_tx_wake_queue); void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason) { unsigned long flags; if (unlikely(!skb)) return; if (likely(refcount_read(&skb->users) == 1)) { smp_rmb(); refcount_set(&skb->users, 0); } else if (likely(!refcount_dec_and_test(&skb->users))) { return; } get_kfree_skb_cb(skb)->reason = reason; local_irq_save(flags); skb->next = __this_cpu_read(softnet_data.completion_queue); __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(dev_kfree_skb_irq_reason); void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason) { if (in_hardirq() || irqs_disabled()) dev_kfree_skb_irq_reason(skb, reason); else kfree_skb_reason(skb, reason); } EXPORT_SYMBOL(dev_kfree_skb_any_reason); /** * netif_device_detach - mark device as removed * @dev: network device * * Mark device as removed from system and therefore no longer available. */ void netif_device_detach(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_stop_all_queues(dev); } } EXPORT_SYMBOL(netif_device_detach); /** * netif_device_attach - mark device as attached * @dev: network device * * Mark device as attached from system and restart if needed. */ void netif_device_attach(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); /* * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. */ static u16 skb_tx_hash(const struct net_device *dev, const struct net_device *sb_dev, struct sk_buff *skb) { u32 hash; u16 qoffset = 0; u16 qcount = dev->real_num_tx_queues; if (dev->num_tc) { u8 tc = netdev_get_prio_tc_map(dev, skb->priority); qoffset = sb_dev->tc_to_txq[tc].offset; qcount = sb_dev->tc_to_txq[tc].count; if (unlikely(!qcount)) { net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", sb_dev->name, qoffset, tc); qoffset = 0; qcount = dev->real_num_tx_queues; } } if (skb_rx_queue_recorded(skb)) { DEBUG_NET_WARN_ON_ONCE(qcount == 0); hash = skb_get_rx_queue(skb); if (hash >= qoffset) hash -= qoffset; while (unlikely(hash >= qcount)) hash -= qcount; return hash + qoffset; } return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features; struct net_device *dev = skb->dev; const char *name = ""; if (!net_ratelimit()) return; if (dev) { if (dev->dev.parent) name = dev_driver_string(dev->dev.parent); else name = netdev_name(dev); } skb_dump(KERN_WARNING, skb, false); WARN(1, "%s: caps=(%pNF, %pNF)\n", name, dev ? &dev->features : &null_features, skb->sk ? &skb->sk->sk_route_caps : &null_features); } /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. */ int skb_checksum_help(struct sk_buff *skb) { __wsum csum; int ret = 0, offset; if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } if (!skb_frags_readable(skb)) { return -EFAULT; } /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. */ if (skb_has_shared_frag(skb)) { ret = __skb_linearize(skb); if (ret) goto out; } offset = skb_checksum_start_offset(skb); ret = -EINVAL; if (unlikely(offset >= skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n", offset, skb_headlen(skb)); goto out; } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) { DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n", offset + sizeof(__sum16), skb_headlen(skb)); goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); if (ret) goto out; *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: return ret; } EXPORT_SYMBOL(skb_checksum_help); int skb_crc32c_csum_help(struct sk_buff *skb) { __le32 crc32c_csum; int ret = 0, offset, start; if (skb->ip_summed != CHECKSUM_PARTIAL) goto out; if (unlikely(skb_is_gso(skb))) goto out; /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. */ if (unlikely(skb_has_shared_frag(skb))) { ret = __skb_linearize(skb); if (ret) goto out; } start = skb_checksum_start_offset(skb); offset = start + offsetof(struct sctphdr, checksum); if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ret = -EINVAL; goto out; } ret = skb_ensure_writable(skb, offset + sizeof(__le32)); if (ret) goto out; crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, skb->len - start, ~(__u32)0, crc32c_csum_stub)); *(__le32 *)(skb->data + offset) = crc32c_csum; skb_reset_csum_not_inet(skb); out: return ret; } EXPORT_SYMBOL(skb_crc32c_csum_help); __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; /* Tunnel gso handlers can set protocol to ethernet. */ if (type == htons(ETH_P_TEB)) { struct ethhdr *eth; if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) return 0; eth = (struct ethhdr *)skb->data; type = eth->h_proto; } return vlan_get_protocol_and_depth(skb, type, depth); } /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { netdev_err(dev, "hw csum failure\n"); skb_dump(KERN_ERR, skb, true); dump_stack(); } void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb); } EXPORT_SYMBOL(netdev_rx_csum_fault); #endif /* XXX: check that highmem exists at all on the given machine. */ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = skb_frag_page(frag); if (page && PageHighMem(page)) return 1; } } #endif return 0; } /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. */ #if IS_ENABLED(CONFIG_NET_MPLS_GSO) static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; } #else static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { return features; } #endif static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t features) { __be16 type; type = skb_network_protocol(skb, NULL); features = net_mpls_features(skb, features, type); if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } if (illegal_highdma(skb->dev, skb)) features &= ~NETIF_F_SG; return features; } netdev_features_t passthru_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return features; } EXPORT_SYMBOL(passthru_features_check); static netdev_features_t dflt_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return vlan_features_check(skb, features); } static netdev_features_t gso_features_check(const struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { u16 gso_segs = skb_shinfo(skb)->gso_segs; if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb))) return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { skb_warn_bad_offload(skb); return features & ~NETIF_F_GSO_MASK; } /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now * and we can pull them back in after we have partially * segmented the frame. */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; /* Make sure to clear the IPv4 ID mangling feature if the * IPv4 header has the potential to be fragmented. */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb); if (!(iph->frag_off & htons(IP_DF))) features &= ~NETIF_F_TSO_MANGLEID; } return features; } netdev_features_t netif_skb_features(struct sk_buff *skb) { struct net_device *dev = skb->dev; netdev_features_t features = dev->features; if (skb_is_gso(skb)) features = gso_features_check(skb, dev, features); /* If encapsulation offload request, verify we are testing * hardware encapsulation features instead of standard * features for the netdev */ if (skb->encapsulation) features &= dev->hw_enc_features; if (skb_vlan_tagged(skb)) features = netdev_intersect_features(features, dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (dev->netdev_ops->ndo_features_check) features &= dev->netdev_ops->ndo_features_check(skb, dev, features); else features &= dflt_features_check(skb, dev, features); return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); static int xmit_one(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { unsigned int len; int rc; if (dev_nit_active(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; trace_net_dev_start_xmit(skb, dev); rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); return rc; } struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, struct netdev_queue *txq, int *ret) { struct sk_buff *skb = first; int rc = NETDEV_TX_OK; while (skb) { struct sk_buff *next = skb->next; skb_mark_not_on_list(skb); rc = xmit_one(skb, dev, txq, next != NULL); if (unlikely(!dev_xmit_complete(rc))) { skb->next = next; goto out; } skb = next; if (netif_tx_queue_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } } out: *ret = rc; return skb; } static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) skb = __vlan_hwaccel_push_inside(skb); return skb; } int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features) { if (unlikely(skb_csum_is_sctp(skb))) return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb); if (features & NETIF_F_HW_CSUM) return 0; if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && skb_network_header_len(skb) != sizeof(struct ipv6hdr) && !ipv6_has_hopopt_jumbo(skb)) goto sw_checksum; switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): return 0; } } sw_checksum: return skb_checksum_help(skb); } EXPORT_SYMBOL(skb_csum_hwoffload_help); static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) goto out_null; skb = sk_validate_xmit_skb(skb, dev); if (unlikely(!skb)) goto out_null; if (netif_needs_gso(skb, features)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); if (IS_ERR(segs)) { goto out_kfree_skb; } else if (segs) { consume_skb(skb); skb = segs; } } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. */ if (skb->ip_summed == CHECKSUM_PARTIAL) { if (skb->encapsulation) skb_set_inner_transport_header(skb, skb_checksum_start_offset(skb)); else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); if (skb_csum_hwoffload_help(skb, features)) goto out_kfree_skb; } } skb = validate_xmit_xfrm(skb, features, again); return skb; out_kfree_skb: kfree_skb(skb); out_null: dev_core_stats_tx_dropped_inc(dev); return NULL; } struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) { struct sk_buff *next, *head = NULL, *tail; for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); /* in case skb won't be segmented, point to itself */ skb->prev = skb; skb = validate_xmit_skb(skb, dev, again); if (!skb) continue; if (!head) head = skb; else tail->next = skb; /* If skb was segmented, skb->prev points to * the last segment. If not, it still contains skb. */ tail = skb->prev; } return head; } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); static void qdisc_pkt_len_init(struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ if (shinfo->gso_size && skb_transport_header_was_set(skb)) { u16 gso_segs = shinfo->gso_segs; unsigned int hdr_len; /* mac layer + network layer */ hdr_len = skb_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { const struct tcphdr *th; struct tcphdr _tcphdr; th = skb_header_pointer(skb, hdr_len, sizeof(_tcphdr), &_tcphdr); if (likely(th)) hdr_len += __tcp_hdrlen(th); } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { struct udphdr _udphdr; if (skb_header_pointer(skb, hdr_len, sizeof(_udphdr), &_udphdr)) hdr_len += sizeof(struct udphdr); } if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) { int payload = skb->len - hdr_len; /* Malicious packet. */ if (payload <= 0) return; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } } static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q, struct sk_buff **to_free, struct netdev_queue *txq) { int rc; rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; if (rc == NET_XMIT_SUCCESS) trace_qdisc_enqueue(q, txq, skb); return rc; } static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); struct sk_buff *to_free = NULL; bool contended; int rc; qdisc_calculate_pkt_len(skb, q); tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP); if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) && qdisc_run_begin(q)) { /* Retest nolock_qdisc_is_empty() within the protection * of q->seqlock to protect from racing with requeuing. */ if (unlikely(!nolock_qdisc_is_empty(q))) { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); qdisc_run_end(q); goto no_lock_out; } qdisc_bstats_cpu_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, NULL, true) && !nolock_qdisc_is_empty(q)) __qdisc_run(q); qdisc_run_end(q); return NET_XMIT_SUCCESS; } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); qdisc_run(q); no_lock_out: if (unlikely(to_free)) kfree_skb_list_reason(to_free, tcf_get_drop_reason(to_free)); return rc; } if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); return NET_XMIT_DROP; } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit * and then other tasks will only enqueue packets. The packets will be * sent after the qdisc owner is scheduled again. To prevent this * scenario the task always serialize on the lock. */ contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); if (unlikely(contended)) spin_lock(&q->busylock); spin_lock(root_lock); if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ qdisc_bstats_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; } __qdisc_run(q); } qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { WRITE_ONCE(q->owner, smp_processor_id()); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; } __qdisc_run(q); qdisc_run_end(q); } } spin_unlock(root_lock); if (unlikely(to_free)) kfree_skb_list_reason(to_free, tcf_get_drop_reason(to_free)); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; } #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { const struct netprio_map *map; const struct sock *sk; unsigned int prioidx; if (skb->priority) return; map = rcu_dereference_bh(skb->dev->priomap); if (!map) return; sk = skb_to_full_sk(skb); if (!sk) return; prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; } #else #define skb_update_prio(skb) #endif /** * dev_loopback_xmit - loop back @skb * @net: network namespace this loopback is happening in * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; if (skb->ip_summed == CHECKSUM_NONE) skb->ip_summed = CHECKSUM_UNNECESSARY; DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); skb_dst_force(skb); netif_rx(skb); return 0; } EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS static struct netdev_queue * netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) { int qm = skb_get_queue_mapping(skb); return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); } #ifndef CONFIG_PREEMPT_RT static bool netdev_xmit_txqueue_skipped(void) { return __this_cpu_read(softnet_data.xmit.skip_txqueue); } void netdev_xmit_skip_txqueue(bool skip) { __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #else static bool netdev_xmit_txqueue_skipped(void) { return current->net_xmit.skip_txqueue; } void netdev_xmit_skip_txqueue(bool skip) { current->net_xmit.skip_txqueue = skip; } EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #endif #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_NET_XGRESS static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, enum skb_drop_reason *drop_reason) { int ret = TC_ACT_UNSPEC; #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); struct tcf_result res; if (!miniq) return ret; /* Global bypass */ if (!static_branch_likely(&tcf_sw_enabled_key)) return ret; /* Block-wise bypass */ if (tcf_block_bypass_sw(miniq->block)) return ret; tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); /* Only tcf related quirks below. */ switch (ret) { case TC_ACT_SHOT: *drop_reason = tcf_get_drop_reason(skb); mini_qdisc_qstats_cpu_drop(miniq); break; case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(res.classid); break; } #endif /* CONFIG_NET_CLS_ACT */ return ret; } static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); void tcx_inc(void) { static_branch_inc(&tcx_needed_key); } void tcx_dec(void) { static_branch_dec(&tcx_needed_key); } static __always_inline enum tcx_action_base tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, const bool needs_mac) { const struct bpf_mprog_fp *fp; const struct bpf_prog *prog; int ret = TCX_NEXT; if (needs_mac) __skb_push(skb, skb->mac_len); bpf_mprog_foreach_prog(entry, fp, prog) { bpf_compute_data_pointers(skb); ret = bpf_prog_run(prog, skb); if (ret != TCX_NEXT) break; } if (needs_mac) __skb_pull(skb, skb->mac_len); return tcx_action_code(skb, ret); } static __always_inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } qdisc_skb_cb(skb)->pkt_len = skb->len; tcx_set_ingress(skb, true); if (static_branch_unlikely(&tcx_needed_key)) { sch_ret = tcx_run(entry, skb, true); if (sch_ret != TC_ACT_UNSPEC) goto ingress_verdict; } sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); ingress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* skb_mac_header check was done by BPF, so we can safely * push the L2 header back before redirecting to another * netdev. */ __skb_push(skb, skb->mac_len); if (skb_do_redirect(skb) == -EAGAIN) { __skb_pull(skb, skb->mac_len); *another = true; break; } *ret = NET_RX_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_RX_DROP; bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); fallthrough; case TC_ACT_CONSUMED: *ret = NET_RX_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; } bpf_net_ctx_clear(bpf_net_ctx); return skb; } static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was * already set by the caller. */ if (static_branch_unlikely(&tcx_needed_key)) { sch_ret = tcx_run(entry, skb, false); if (sch_ret != TC_ACT_UNSPEC) goto egress_verdict; } sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason); egress_verdict: switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_XMIT_DROP; bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); fallthrough; case TC_ACT_CONSUMED: *ret = NET_XMIT_SUCCESS; bpf_net_ctx_clear(bpf_net_ctx); return NULL; } bpf_net_ctx_clear(bpf_net_ctx); return skb; } #else static __always_inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { return skb; } static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { return skb; } #endif /* CONFIG_NET_XGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) { int tc = netdev_get_prio_tc_map(dev, skb->priority); struct xps_map *map; int queue_index = -1; if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) return queue_index; tci *= dev_maps->num_tc; tci += tc; map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { if (map->len == 1) queue_index = map->queues[0]; else queue_index = map->queues[reciprocal_scale( skb_get_hash(skb), map->len)]; if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } return queue_index; } #endif static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; struct sock *sk = skb->sk; int queue_index = -1; if (!static_key_false(&xps_needed)) return -1; rcu_read_lock(); if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); if (dev_maps) { int tci = sk_rx_queue_get(sk); if (tci >= 0) queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } get_cpus_map: if (queue_index < 0) { dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } } rcu_read_unlock(); return queue_index; #else return -1; #endif } u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { return 0; } EXPORT_SYMBOL(dev_pick_tx_zero); u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); sb_dev = sb_dev ? : dev; if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { int new_index = get_xps_queue(dev, sb_dev, skb); if (new_index < 0) new_index = skb_tx_hash(dev, sb_dev, skb); if (queue_index != new_index && sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); queue_index = new_index; } return queue_index; } EXPORT_SYMBOL(netdev_pick_tx); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { int queue_index = 0; #ifdef CONFIG_XPS u32 sender_cpu = skb->sender_cpu - 1; if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) queue_index = ops->ndo_select_queue(dev, skb, sb_dev); else queue_index = netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); } /** * __dev_queue_xmit() - transmit a buffer * @skb: buffer to transmit * @sb_dev: suboordinate device used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling * this function. The function can be called from an interrupt. * * When calling this method, interrupts MUST be enabled. This is because * the BH enable code must have IRQs enabled so that it will not deadlock. * * Regardless of the return value, the skb is consumed, so it is currently * difficult to retry a send to this method. (You can bump the ref count * before sending to hold a reference for retry if you are careful.) * * Return: * * 0 - buffer successfully transmitted * * positive qdisc return code - NET_XMIT_DROP etc. * * negative errno - other errors */ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq = NULL; struct Qdisc *q; int rc = -ENOMEM; bool again = false; skb_reset_mac_header(skb); skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & (SKBTX_SCHED_TSTAMP | SKBTX_BPF))) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); skb_update_prio(skb); qdisc_pkt_len_init(skb); tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { skb = nf_hook_egress(skb, &rc, dev); if (!skb) goto out; } netdev_xmit_skip_txqueue(false); nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; nf_skip_egress(skb, false); if (netdev_xmit_txqueue_skipped()) txq = netdev_tx_queue_mapping(dev, skb); } #endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. */ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); else skb_dst_force(skb); if (!txq) txq = netdev_core_pick_tx(dev, skb, sb_dev); q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); goto out; } /* The device has no queue. Common case for software devices: * loopback, all the sorts of tunnels... * Really, it is unlikely that netif_tx_lock protection is necessary * here. (f.e. loopback and IP tunnels are clean ignoring statistics * counters.) * However, it is possible, that they rely on protection * made by us here. * Check this and shot the lock. It is not prone from deadlocks. *Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ /* Other cpus might concurrently change txq->xmit_lock_owner * to -1 or to their cpu id, but not to our id. */ if (READ_ONCE(txq->xmit_lock_owner) != cpu) { if (dev_xmit_recursion()) goto recursion_alert; skb = validate_xmit_skb(skb, dev, &again); if (!skb) goto out; HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { dev_xmit_recursion_inc(); skb = dev_hard_start_xmit(skb, dev, txq, &rc); dev_xmit_recursion_dec(); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; } } HARD_TX_UNLOCK(dev, txq); net_crit_ratelimited("Virtual device %s asks to queue packet!\n", dev->name); } else { /* Recursion is detected! It is possible, * unfortunately */ recursion_alert: net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", dev->name); } } rc = -ENETDOWN; rcu_read_unlock_bh(); dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); return rc; } EXPORT_SYMBOL(__dev_queue_xmit); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; bool again = false; if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) goto drop; skb = validate_xmit_skb_list(skb, dev, &again); if (skb != orig_skb) goto drop; skb_set_queue_mapping(skb, queue_id); txq = skb_get_tx_queue(dev, skb); local_bh_disable(); dev_xmit_recursion_inc(); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); dev_xmit_recursion_dec(); local_bh_enable(); return ret; drop: dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return NET_XMIT_DROP; } EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ static DEFINE_PER_CPU(struct task_struct *, backlog_napi); int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { struct task_struct *thread; lockdep_assert_irqs_disabled(); if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in * napi_enable()/dev_set_threaded(). * Use READ_ONCE() to guarantee a complete * read on napi->thread. Only call * wake_up_process() when it's not NULL. */ thread = READ_ONCE(napi->thread); if (thread) { if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi)) goto use_local_napi; set_bit(NAPI_STATE_SCHED_THREADED, &napi->state); wake_up_process(thread); return; } } use_local_napi: list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() * we have to raise NET_RX_SOFTIRQ. */ if (!sd->in_net_rx_action) raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { if (next_cpu < nr_cpu_ids) { u32 head; #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; u16 rxq_index; u32 flow_id; int rc; /* Should we steer this flow to a different hardware queue? */ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || !(dev->features & NETIF_F_NTUPLE)) goto out; rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); if (rxq_index == skb_get_rx_queue(skb)) goto out; rxqueue = dev->_rx + rxq_index; flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; flow_id = skb_get_hash(skb) & flow_table->mask; rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; old_rflow = rflow; rflow = &flow_table->flows[flow_id]; WRITE_ONCE(rflow->filter, rc); if (old_rflow->filter == rc) WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: #endif head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); rps_input_queue_tail_save(&rflow->last_qtail, head); } WRITE_ONCE(rflow->cpu, next_cpu); return rflow; } /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. * rcu_read_lock must be held on entry. */ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { const struct rps_sock_flow_table *sock_flow_table; struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; struct rps_map *map; int cpu = -1; u32 tcpu; u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); goto done; } rxqueue += index; } /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ flow_table = rcu_dereference(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); if (!flow_table && !map) goto done; skb_reset_network_header(skb); hash = skb_get_hash(skb); if (!hash) goto done; sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (flow_table && sock_flow_table) { struct rps_dev_flow *rflow; u32 next_cpu; u32 ident; /* First check into global flow table if there is a match. * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). */ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) goto try_rps; next_cpu = ident & net_hotdata.rps_cpu_mask; /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ rflow = &flow_table->flows[hash & flow_table->mask]; tcpu = rflow->cpu; /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. * This guarantees that all previous packets for the flow * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; } } try_rps: if (map) { tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; } } done: return cpu; } #ifdef CONFIG_RFS_ACCEL /** * rps_may_expire_flow - check whether an RFS hardware filter may be removed * @dev: Device on which the filter was set * @rxq_index: RX queue index * @flow_id: Flow ID passed to ndo_rx_flow_steer() * @filter_id: Filter ID returned by ndo_rx_flow_steer() * * Drivers that implement ndo_rx_flow_steer() should periodically call * this function for each installed filter and remove the filters for * which it returns %true. */ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id) { struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - READ_ONCE(rflow->last_qtail)) < (int)(10 * flow_table->mask))) expire = false; } rcu_read_unlock(); return expire; } EXPORT_SYMBOL(rps_may_expire_flow); #endif /* CONFIG_RFS_ACCEL */ /* Called from hardirq (IPI) context */ static void rps_trigger_softirq(void *data) { struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); sd->received_rps++; } #endif /* CONFIG_RPS */ /* Called from hardirq (IPI) context */ static void trigger_rx_softirq(void *data) { struct softnet_data *sd = data; __raise_softirq_irqoff(NET_RX_SOFTIRQ); smp_store_release(&sd->defer_ipi_scheduled, 0); } /* * After we queued a packet into sd->input_pkt_queue, * we need to make sure this queue is serviced soon. * * - If this is another cpu queue, link it to our rps_ipi_list, * and make sure we will process rps_ipi_list from net_rx_action(). * * - If this is our own queue, NAPI schedule our backlog. * Note that this also raises NET_RX_SOFTIRQ. */ static void napi_schedule_rps(struct softnet_data *sd) { struct softnet_data *mysd = this_cpu_ptr(&softnet_data); #ifdef CONFIG_RPS if (sd != mysd) { if (use_backlog_threads()) { __napi_schedule_irqoff(&sd->backlog); return; } sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; /* If not called from net_rx_action() or napi_threaded_poll() * we have to raise NET_RX_SOFTIRQ. */ if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll) __raise_softirq_irqoff(NET_RX_SOFTIRQ); return; } #endif /* CONFIG_RPS */ __napi_schedule_irqoff(&mysd->backlog); } void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) { unsigned long flags; if (use_backlog_threads()) { backlog_lock_irq_save(sd, &flags); if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) __napi_schedule_irqoff(&sd->backlog); backlog_unlock_irq_restore(sd, &flags); } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { smp_call_function_single_async(cpu, &sd->defer_csd); } } #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) { #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit *fl; struct softnet_data *sd; unsigned int old_flow, new_flow; if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; fl->history_head++; fl->history_head &= FLOW_LIMIT_HISTORY - 1; if (likely(fl->buckets[old_flow])) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { fl->count++; rcu_read_unlock(); return true; } } rcu_read_unlock(); #endif return false; } /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). */ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { enum skb_drop_reason reason; struct softnet_data *sd; unsigned long flags; unsigned int qlen; int max_backlog; u32 tail; reason = SKB_DROP_REASON_DEV_READY; if (!netif_running(skb->dev)) goto bad_dev; reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); qlen = skb_queue_len_lockless(&sd->input_pkt_queue); max_backlog = READ_ONCE(net_hotdata.max_backlog); if (unlikely(qlen > max_backlog)) goto cpu_backlog_drop; backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { if (!qlen) { /* Schedule NAPI for backlog device. We can use * non atomic operation as we own the queue lock. */ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) napi_schedule_rps(sd); } __skb_queue_tail(&sd->input_pkt_queue, skb); tail = rps_input_queue_tail_incr(sd); backlog_unlock_irq_restore(sd, &flags); /* save the tail outside of the critical section */ rps_input_queue_tail_save(qtail, tail); return NET_RX_SUCCESS; } backlog_unlock_irq_restore(sd, &flags); cpu_backlog_drop: atomic_inc(&sd->dropped); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; } static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct netdev_rx_queue *rxqueue; rxqueue = dev->_rx; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); return rxqueue; /* Return first rxqueue */ } rxqueue += index; } return rxqueue; } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; u32 metalen, act; int off; /* The XDP program wants to see the packet starting at the MAC * header. */ mac_len = skb->data - skb_mac_header(skb); hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ frame_sz = (void *)skb_end_pointer(skb) - hard_start; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); rxqueue = netif_get_rxqueue(skb); xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); if (skb_is_nonlinear(skb)) { skb_shinfo(skb)->xdp_frags_size = skb->data_len; xdp_buff_set_frags_flag(xdp); } else { xdp_buff_clear_frags_flag(xdp); } orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; act = bpf_prog_run_xdp(xdp_prog, xdp); /* check if bpf_xdp_adjust_head was used */ off = xdp->data - orig_data; if (off) { if (off > 0) __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); skb->mac_header += off; skb_reset_network_header(skb); } /* check if bpf_xdp_adjust_tail was used */ off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); skb->len += off; /* positive on grow, negative on shrink */ } /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. */ if (xdp_buff_has_frags(xdp)) skb->data_len = skb_shinfo(skb)->xdp_frags_size; else skb->data_len = 0; /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || (orig_host != ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); } /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull * before calling us again on redirect path. We do not call do_redirect * as we leave that up to the caller. * * Caller is responsible for managing lifetime of skb (i.e. calling * kfree_skb in response to actions it cannot handle/XDP_DROP). */ switch (act) { case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); break; case XDP_PASS: metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); break; } return act; } static int netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) { struct sk_buff *skb = *pskb; int err, hroom, troom; if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) return 0; /* In case we have to go down the path and also linearize, * then lets do the pskb_expand_head() work just once here. */ hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); troom = skb->tail + skb->data_len - skb->end; err = pskb_expand_head(skb, hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, troom > 0 ? troom + 128 : 0, GFP_ATOMIC); if (err) return err; return skb_linearize(skb); } static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct sk_buff *skb = *pskb; u32 mac_len, act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ if (skb_is_redirected(skb)) return XDP_PASS; /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM * bytes. This is the guarantee that also native XDP provides, * thus we need to do it here as well. */ mac_len = skb->data - skb_mac_header(skb); __skb_push(skb, mac_len); if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { if (netif_skb_check_for_xdp(pskb, xdp_prog)) goto do_drop; } __skb_pull(*pskb, mac_len); act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); switch (act) { case XDP_REDIRECT: case XDP_TX: case XDP_PASS: break; default: bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_DROP: do_drop: kfree_skb(*pskb); break; } return act; } /* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. This also means * that XDP packets are able to starve other packets going through a qdisc, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue. */ void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; bool free_skb = true; int cpu, rc; txq = netdev_core_pick_tx(dev, skb, NULL); cpu = smp_processor_id(); HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_frozen_or_drv_stopped(txq)) { rc = netdev_start_xmit(skb, dev, txq, 0); if (dev_xmit_complete(rc)) free_skb = false; } HARD_TX_UNLOCK(dev, txq); if (free_skb) { trace_xdp_exception(dev, xdp_prog, XDP_TX); dev_core_stats_tx_dropped_inc(dev); kfree_skb(skb); } } static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: err = xdp_do_generic_redirect((*pskb)->dev, *pskb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: generic_xdp_tx(*pskb, xdp_prog); break; } bpf_net_ctx_clear(bpf_net_ctx); return XDP_DROP; } bpf_net_ctx_clear(bpf_net_ctx); } return XDP_PASS; out_redir: bpf_net_ctx_clear(bpf_net_ctx); kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); static int netif_rx_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_rx(skb); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu < 0) cpu = smp_processor_id(); ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); } else #endif { unsigned int qtail; ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } /** * __netif_rx - Slightly optimized version of netif_rx * @skb: buffer to post * * This behaves as netif_rx except that it does not disable bottom halves. * As a result this function may only be invoked from the interrupt context * (either hard or soft interrupt). */ int __netif_rx(struct sk_buff *skb) { int ret; lockdep_assert_once(hardirq_count() | softirq_count()); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); return ret; } EXPORT_SYMBOL(__netif_rx); /** * netif_rx - post buffer to the network code * @skb: buffer to post * * This function receives a packet from a device driver and queues it for * the upper (protocol) levels to process via the backlog NAPI device. It * always succeeds. The buffer may be dropped during processing for * congestion control or by the protocol layers. * The network buffer is passed via the backlog NAPI device. Modern NIC * driver should use NAPI and GRO. * This function can used from interrupt and from process context. The * caller from process context must not disable interrupts before invoking * this function. * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped) * */ int netif_rx(struct sk_buff *skb) { bool need_bh_off = !(hardirq_count() | softirq_count()); int ret; if (need_bh_off) local_bh_disable(); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); if (need_bh_off) local_bh_enable(); return ret; } EXPORT_SYMBOL(netif_rx); static __latent_entropy void net_tx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { struct sk_buff *clist; local_irq_disable(); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_enable(); while (clist) { struct sk_buff *skb = clist; clist = clist->next; WARN_ON(refcount_read(&skb->users)); if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED)) trace_consume_skb(skb, net_tx_action); else trace_kfree_skb(skb, net_tx_action, get_kfree_skb_cb(skb)->reason, NULL); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else __napi_kfree_skb(skb, get_kfree_skb_cb(skb)->reason); } } if (sd->output_queue) { struct Qdisc *head; local_irq_disable(); head = sd->output_queue; sd->output_queue = NULL; sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); rcu_read_lock(); while (head) { struct Qdisc *q = head; spinlock_t *root_lock = NULL; head = head->next_sched; /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); if (!(q->flags & TCQ_F_NOLOCK)) { root_lock = qdisc_lock(q); spin_lock(root_lock); } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { /* There is a synchronize_net() between * STATE_DEACTIVATED flag being set and * qdisc_reset()/some_qdisc_is_busy() in * dev_deactivate(), so we can safely bail out * early here to avoid data race between * qdisc_deactivate() and some_qdisc_is_busy() * for lockless qdisc. */ clear_bit(__QDISC_STATE_SCHED, &q->state); continue; } clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); if (root_lock) spin_unlock(root_lock); } rcu_read_unlock(); } xfrm_dev_backlog(sd); } #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check * * Check if a receive handler is already registered for a given device. * Return true if there one. * * The caller must hold the rtnl_mutex. */ bool netdev_is_rx_handler_busy(struct net_device *dev) { ASSERT_RTNL(); return dev && rtnl_dereference(dev->rx_handler); } EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * * The caller must hold the rtnl_mutex. * * For a general description of rx_handler, see enum rx_handler_result. */ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { if (netdev_is_rx_handler_busy(dev)) return -EBUSY; if (dev->priv_flags & IFF_NO_RX_HANDLER) return -EINVAL; /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); return 0; } EXPORT_SYMBOL_GPL(netdev_rx_handler_register); /** * netdev_rx_handler_unregister - unregister receive handler * @dev: device to unregister a handler from * * Unregister a receive handler from a device. * * The caller must hold the rtnl_mutex. */ void netdev_rx_handler_unregister(struct net_device *dev) { ASSERT_RTNL(); RCU_INIT_POINTER(dev->rx_handler, NULL); /* a reader seeing a non NULL rx_handler in a rcu_read_lock() * section has a guarantee to see a non NULL rx_handler_data * as well. */ synchronize_net(); RCU_INIT_POINTER(dev->rx_handler_data, NULL); } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); /* * Limit the use of PFMEMALLOC reserves to those protocols that implement * the special handling of PFMEMALLOC skbs. */ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_ARP): case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): return true; default: return false; } } static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { if (nf_hook_ingress_active(skb)) { int ingress_retval; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } rcu_read_lock(); ingress_retval = nf_hook_ingress(skb); rcu_read_unlock(); return ingress_retval; } return 0; } static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, struct packet_type **ppt_prev) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct sk_buff *skb = *pskb; struct net_device *orig_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb); trace_netif_receive_skb(skb); orig_dev = skb->dev; skb_reset_network_header(skb); #if !defined(CONFIG_DEBUG_NET) /* We plan to no longer reset the transport header here. * Give some time to fuzzers and dev build to catch bugs * in network stacks. */ if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); #endif skb_reset_mac_len(skb); pt_prev = NULL; another_round: skb->skb_iif = skb->dev->ifindex; __this_cpu_inc(softnet_data.processed); if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret2; migrate_disable(); ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), &skb); migrate_enable(); if (ret2 != XDP_PASS) { ret = NET_RX_DROP; goto out; } } if (eth_type_vlan(skb->protocol)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; } if (skb_skip_tc_classify(skb)) goto skip_classify; if (pfmemalloc) goto skip_taps; list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { bool another = false; nf_skip_egress(skb, true); skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, &another); if (another) goto another_round; if (!skb) goto out; nf_skip_egress(skb, false); if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) goto out; } #endif skb_reset_redirect(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; if (skb_vlan_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto out; } rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; break; case RX_HANDLER_PASS: break; default: BUG(); } } if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { check_vlan_id: if (skb_vlan_tag_get_id(skb)) { /* Vlan id is non 0 and vlan_do_receive() above couldn't * find vlan device. */ skb->pkt_type = PACKET_OTHERHOST; } else if (eth_type_vlan(skb->protocol)) { /* Outer header is 802.1P with vlan 0, inner header is * 802.1Q or 802.1AD and vlan_do_receive() above could * not find vlan dev for vlan id 0. */ __vlan_hwaccel_clear_tag(skb); skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; if (vlan_do_receive(&skb)) /* After stripping off 802.1P header with vlan 0 * vlan dev is found for inner header. */ goto another_round; else if (unlikely(!skb)) goto out; else /* We have stripped outer 802.1P vlan 0 header. * But could not find vlan dev. * check again for vlan id to set OTHERHOST. */ goto check_vlan_id; } /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; /* deliver only exact match when indicated */ if (likely(!deliver_exact)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &orig_dev->ptype_specific); if (unlikely(skb->dev != orig_dev)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &skb->dev->ptype_specific); } if (pt_prev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; *ppt_prev = pt_prev; } else { drop: if (!deliver_exact) dev_core_stats_rx_dropped_inc(skb->dev); else dev_core_stats_rx_nohandler_inc(skb->dev); kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ ret = NET_RX_DROP; } out: /* The invariant here is that if *ppt_prev is not NULL * then skb should also be non-NULL. * * Apparently *ppt_prev assignment above holds this invariant due to * skb dereferencing near it. */ *pskb = skb; return ret; } static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; int ret; ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (pt_prev) ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, skb->dev, pt_prev, orig_dev); return ret; } /** * netif_receive_skb_core - special purpose version of netif_receive_skb * @skb: buffer to process * * More direct receive version of netif_receive_skb(). It should * only be used by callers that have a need to skip RPS and Generic XDP. * Caller must also take care of handling if ``(page_is_)pfmemalloc``. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb_core(struct sk_buff *skb) { int ret; rcu_read_lock(); ret = __netif_receive_skb_one_core(skb, false); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(netif_receive_skb_core); static inline void __netif_receive_skb_list_ptype(struct list_head *head, struct packet_type *pt_prev, struct net_device *orig_dev) { struct sk_buff *skb, *next; if (!pt_prev) return; if (list_empty(head)) return; if (pt_prev->list_func != NULL) INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, ip_list_rcv, head, pt_prev, orig_dev); else list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } } static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) { /* Fast-path assumptions: * - There is no RX handler. * - Only one packet_type matches. * If either of these fails, we will end up doing some per-packet * processing in-line, then handling the 'last ptype' for the whole * sublist. This can't cause out-of-order delivery to any single ptype, * because the 'last ptype' must be constant across the sublist, and all * other ptypes are handled per-packet. */ /* Current (common) ptype of sublist */ struct packet_type *pt_curr = NULL; /* Current (common) orig_dev of sublist */ struct net_device *od_curr = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; skb_list_del_init(skb); __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; if (pt_curr != pt_prev || od_curr != orig_dev) { /* dispatch old sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); /* start new sublist */ INIT_LIST_HEAD(&sublist); pt_curr = pt_prev; od_curr = orig_dev; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); } static int __netif_receive_skb(struct sk_buff *skb) { int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should * - be delivered to SOCK_MEMALLOC sockets only * - stay away from userspace * - have bounded memory usage * * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ noreclaim_flag = memalloc_noreclaim_save(); ret = __netif_receive_skb_one_core(skb, true); memalloc_noreclaim_restore(noreclaim_flag); } else ret = __netif_receive_skb_one_core(skb, false); return ret; } static void __netif_receive_skb_list(struct list_head *head) { unsigned long noreclaim_flag = 0; struct sk_buff *skb, *next; bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */ list_for_each_entry_safe(skb, next, head, list) { if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { struct list_head sublist; /* Handle the previous sublist */ list_cut_before(&sublist, head, &skb->list); if (!list_empty(&sublist)) __netif_receive_skb_list_core(&sublist, pfmemalloc); pfmemalloc = !pfmemalloc; /* See comments in __netif_receive_skb */ if (pfmemalloc) noreclaim_flag = memalloc_noreclaim_save(); else memalloc_noreclaim_restore(noreclaim_flag); } } /* Handle the remaining sublist */ if (!list_empty(head)) __netif_receive_skb_list_core(head, pfmemalloc); /* Restore pflags */ if (pfmemalloc) memalloc_noreclaim_restore(noreclaim_flag); } static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; int ret = 0; switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); if (old) bpf_prog_put(old); if (old && !new) { static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { static_branch_inc(&generic_xdp_needed_key); dev_disable_lro(dev); dev_disable_gro_hw(dev); } break; default: ret = -EINVAL; break; } return ret; } static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); return ret; } } #endif ret = __netif_receive_skb(skb); rcu_read_unlock(); return ret; } void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); } list_splice_init(&sublist, head); rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { list_for_each_entry_safe(skb, next, head, list) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { /* Will be handled, remove from list */ skb_list_del_init(skb); enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } } #endif __netif_receive_skb_list(head); rcu_read_unlock(); } /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process * * netif_receive_skb() is the main receive data processing function. * It always succeeds. The buffer may be dropped during processing * for congestion control or by the protocol layers. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb(struct sk_buff *skb) { int ret; trace_netif_receive_skb_entry(skb); ret = netif_receive_skb_internal(skb); trace_netif_receive_skb_exit(ret); return ret; } EXPORT_SYMBOL(netif_receive_skb); /** * netif_receive_skb_list - process many receive buffers from network * @head: list of skbs to process. * * Since return value of netif_receive_skb() is normally ignored, and * wouldn't be meaningful for a list, this function returns void. * * This function may only be called from softirq context and interrupts * should be enabled. */ void netif_receive_skb_list(struct list_head *head) { struct sk_buff *skb; if (list_empty(head)) return; if (trace_netif_receive_skb_list_entry_enabled()) { list_for_each_entry(skb, head, list) trace_netif_receive_skb_list_entry(skb); } netif_receive_skb_list_internal(head); trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; struct sk_buff_head list; struct softnet_data *sd; __skb_queue_head_init(&list); local_bh_disable(); sd = this_cpu_ptr(&softnet_data); backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } backlog_unlock_irq_enable(sd); local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY); } static bool flush_required(int cpu) { #if IS_ENABLED(CONFIG_RPS) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; backlog_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); backlog_unlock_irq_enable(sd); return do_flush; #endif /* without RPS we can't safely check input_pkt_queue: during a * concurrent remote skb_queue_splice() we can detect as empty both * input_pkt_queue and process_queue even if the latter could end-up * containing a lot of packets. */ return true; } struct flush_backlogs { cpumask_t flush_cpus; struct work_struct w[]; }; static struct flush_backlogs *flush_backlogs_alloc(void) { return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids), GFP_KERNEL); } static struct flush_backlogs *flush_backlogs_fallback; static DEFINE_MUTEX(flush_backlogs_mutex); static void flush_all_backlogs(void) { struct flush_backlogs *ptr = flush_backlogs_alloc(); unsigned int cpu; if (!ptr) { mutex_lock(&flush_backlogs_mutex); ptr = flush_backlogs_fallback; } cpumask_clear(&ptr->flush_cpus); cpus_read_lock(); for_each_online_cpu(cpu) { if (flush_required(cpu)) { INIT_WORK(&ptr->w[cpu], flush_backlog); queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]); __cpumask_set_cpu(cpu, &ptr->flush_cpus); } } /* we can have in flight packet[s] on the cpus we are not flushing, * synchronize_net() in unregister_netdevice_many() will take care of * them. */ for_each_cpu(cpu, &ptr->flush_cpus) flush_work(&ptr->w[cpu]); cpus_read_unlock(); if (ptr != flush_backlogs_fallback) kfree(ptr); else mutex_unlock(&flush_backlogs_mutex); } static void net_rps_send_ipi(struct softnet_data *remsd) { #ifdef CONFIG_RPS while (remsd) { struct softnet_data *next = remsd->rps_ipi_next; if (cpu_online(remsd->cpu)) smp_call_function_single_async(remsd->cpu, &remsd->csd); remsd = next; } #endif } /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. */ static void net_rps_action_and_irq_enable(struct softnet_data *sd) { #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; if (!use_backlog_threads() && remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); /* Send pending IPI's to kick RPS processing on remote cpus. */ net_rps_send_ipi(remsd); } else #endif local_irq_enable(); } static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS return !use_backlog_threads() && sd->rps_ipi_list; #else return false; #endif } static int process_backlog(struct napi_struct *napi, int quota) { struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); bool again = true; int work = 0; /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } napi->weight = READ_ONCE(net_hotdata.dev_rx_weight); while (again) { struct sk_buff *skb; local_lock_nested_bh(&softnet_data.process_queue_bh_lock); while ((skb = __skb_dequeue(&sd->process_queue))) { local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); if (++work >= quota) { rps_input_queue_head_add(sd, work); return work; } local_lock_nested_bh(&softnet_data.process_queue_bh_lock); } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); backlog_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, * and NAPI_STATE_SCHED is the only possible flag set * on backlog. * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ napi->state &= NAPIF_STATE_THREADED; again = false; } else { local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); } backlog_unlock_irq_enable(sd); } if (work) rps_input_queue_head_add(sd, work); return work; } /** * __napi_schedule - schedule for receive * @n: entry to schedule * * The entry's receive function will be scheduled to run. * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { unsigned long flags; local_irq_save(flags); ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); /** * napi_schedule_prep - check if napi can be scheduled * @n: napi context * * Test if NAPI routine is already running, and if not mark * it as running. This is used as a condition variable to * insure only one NAPI poll instance runs. We also make * sure there is no pending NAPI disable. */ bool napi_schedule_prep(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); do { if (unlikely(val & NAPIF_STATE_DISABLE)) return false; new = val | NAPIF_STATE_SCHED; /* Sets STATE_MISSED bit if STATE_SCHED was already set * This was suggested by Alexander Duyck, as compiler * emits better code than : * if (val & NAPIF_STATE_SCHED) * new |= NAPIF_STATE_MISSED; */ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED; } while (!try_cmpxchg(&n->state, &val, new)); return !(val & NAPIF_STATE_SCHED); } EXPORT_SYMBOL(napi_schedule_prep); /** * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * * Variant of __napi_schedule() assuming hard irqs are masked. * * On PREEMPT_RT enabled kernels this maps to __napi_schedule() * because the interrupt disabled assumption might not be true * due to force-threaded interrupts and spinlock substitution. */ void __napi_schedule_irqoff(struct napi_struct *n) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ____napi_schedule(this_cpu_ptr(&softnet_data), n); else __napi_schedule(n); } EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags, val, new, timeout = 0; bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list * just in case its running on a different cpu. * 2) If we are busy polling, do nothing here, we have * the guarantee we will be called later. */ if (unlikely(n->state & (NAPIF_STATE_NPSVC | NAPIF_STATE_IN_BUSY_POLL))) return false; if (work_done) { if (n->gro.bitmask) timeout = napi_get_gro_flush_timeout(n); n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; timeout = napi_get_gro_flush_timeout(n); if (timeout) ret = false; } /* * When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer. */ gro_flush(&n->gro, !!timeout); gro_normal_list(&n->gro); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ local_irq_save(flags); list_del_init(&n->poll_list); local_irq_restore(flags); } WRITE_ONCE(n->list_owner, -1); val = READ_ONCE(n->state); do { WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | NAPIF_STATE_SCHED_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. * This C code was suggested by Alexander Duyck to help gcc. */ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * NAPIF_STATE_SCHED; } while (!try_cmpxchg(&n->state, &val, new)); if (unlikely(val & NAPIF_STATE_MISSED)) { __napi_schedule(n); return false; } if (timeout) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); return ret; } EXPORT_SYMBOL(napi_complete_done); static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ if (!READ_ONCE(sd->defer_list)) return; spin_lock(&sd->defer_lock); skb = sd->defer_list; sd->defer_list = NULL; sd->defer_count = 0; spin_unlock(&sd->defer_lock); while (skb != NULL) { next = skb->next; napi_consume_skb(skb, 1); skb = next; } } #if defined(CONFIG_NET_RX_BUSY_POLL) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { gro_normal_list(&napi->gro); __napi_schedule(napi); return; } /* Flush too old packets. If HZ < 1000, flush all packets */ gro_flush(&napi->gro, HZ >= 1000); gro_normal_list(&napi->gro); clear_bit(NAPI_STATE_SCHED, &napi->state); } enum { NAPI_F_PREFER_BUSY_POLL = 1, NAPI_F_END_ON_RESCHED = 2, }; static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; bool skip_schedule = false; unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was * set in napi_schedule_prep(). * Since we are about to call napi->poll() once more, we can safely * clear NAPI_STATE_MISSED. * * Note: x86 could use a single "lock and ..." instruction * to perform these two clear_bit() */ clear_bit(NAPI_STATE_MISSED, &napi->state); clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); timeout = napi_get_gro_flush_timeout(napi); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); skip_schedule = true; } } /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. */ trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); if (rc == budget) __busy_poll_stop(napi, skip_schedule); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } static void __napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, unsigned flags, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; void *have_poll_lock = NULL; struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); restart: napi_poll = NULL; napi = napi_by_id(napi_id); if (!napi) return; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); for (;;) { int work = 0; local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); /* If multiple threads are competing for this napi, * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) { if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) { if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } work = napi_poll(napi, budget); trace_napi_poll(napi, work, budget); gro_normal_list(&napi->gro); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); skb_defer_free_flush(this_cpu_ptr(&softnet_data)); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { if (flags & NAPI_F_END_ON_RESCHED) break; if (napi_poll) busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); rcu_read_unlock(); cond_resched(); rcu_read_lock(); if (loop_end(loop_end_arg, start_time)) return; goto restart; } cpu_relax(); } if (napi_poll) busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); } void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned flags = NAPI_F_END_ON_RESCHED; if (prefer_busy_poll) flags |= NAPI_F_PREFER_BUSY_POLL; __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0; rcu_read_lock(); __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); void napi_suspend_irqs(unsigned int napi_id) { struct napi_struct *napi; rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { unsigned long timeout = napi_get_irq_suspend_timeout(napi); if (timeout) hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); } rcu_read_unlock(); } void napi_resume_irqs(unsigned int napi_id) { struct napi_struct *napi; rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { /* If irq_suspend_timeout is set to 0 between the call to * napi_suspend_irqs and now, the original value still * determines the safety timeout as intended and napi_watchdog * will resume irq processing. */ if (napi_get_irq_suspend_timeout(napi)) { local_bh_disable(); napi_schedule(napi); local_bh_enable(); } } rcu_read_unlock(); } #endif /* CONFIG_NET_RX_BUSY_POLL */ static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { napi->gro.cached_napi_id = napi_id; WRITE_ONCE(napi->napi_id, napi_id); hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); } static void napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { unsigned long flags; spin_lock_irqsave(&napi_hash_lock, flags); WARN_ON_ONCE(napi_by_id(napi_id)); __napi_hash_add_with_id(napi, napi_id); spin_unlock_irqrestore(&napi_hash_lock, flags); } static void napi_hash_add(struct napi_struct *napi) { unsigned long flags; if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return; spin_lock_irqsave(&napi_hash_lock, flags); /* 0..NR_CPUS range is reserved for sender_cpu use */ do { if (unlikely(!napi_id_valid(++napi_gen_id))) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); __napi_hash_add_with_id(napi, napi_gen_id); spin_unlock_irqrestore(&napi_hash_lock, flags); } /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ static void napi_hash_del(struct napi_struct *napi) { unsigned long flags; spin_lock_irqsave(&napi_hash_lock, flags); hlist_del_init_rcu(&napi->napi_hash_node); spin_unlock_irqrestore(&napi_hash_lock, flags); } static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) { struct napi_struct *napi; napi = container_of(timer, struct napi_struct, timer); /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); } return HRTIMER_NORESTART; } int dev_set_threaded(struct net_device *dev, bool threaded) { struct napi_struct *napi; int err = 0; netdev_assert_locked_or_invisible(dev); if (dev->threaded == threaded) return 0; if (threaded) { list_for_each_entry(napi, &dev->napi_list, dev_list) { if (!napi->thread) { err = napi_kthread_create(napi); if (err) { threaded = false; break; } } } } WRITE_ONCE(dev->threaded, threaded); /* Make sure kthread is created before THREADED bit * is set. */ smp_mb__before_atomic(); /* Setting/unsetting threaded mode on a napi might not immediately * take effect, if the current napi instance is actively being * polled. In this case, the switch between threaded mode and * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. */ list_for_each_entry(napi, &dev->napi_list, dev_list) assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); return err; } EXPORT_SYMBOL(dev_set_threaded); /** * netif_queue_set_napi - Associate queue with the napi * @dev: device to which NAPI and queue belong * @queue_index: Index of queue * @type: queue type as RX or TX * @napi: NAPI context, pass NULL to clear previously set NAPI * * Set queue with its corresponding napi context. This should be done after * registering the NAPI handler for the queue-vector and the queues have been * mapped to the corresponding interrupt vector. */ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi) { struct netdev_rx_queue *rxq; struct netdev_queue *txq; if (WARN_ON_ONCE(napi && !napi->dev)) return; if (dev->reg_state >= NETREG_REGISTERED) ASSERT_RTNL(); switch (type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(dev, queue_index); rxq->napi = napi; return; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(dev, queue_index); txq->napi = napi; return; default: return; } } EXPORT_SYMBOL(netif_queue_set_napi); static void netif_napi_irq_notify(struct irq_affinity_notify *notify, const cpumask_t *mask) { struct napi_struct *napi = container_of(notify, struct napi_struct, notify); #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; int err; #endif if (napi->config && napi->dev->irq_affinity_auto) cpumask_copy(&napi->config->affinity_mask, mask); #ifdef CONFIG_RFS_ACCEL if (napi->dev->rx_cpu_rmap_auto) { err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); if (err) netdev_warn(napi->dev, "RMAP update failed (%d)\n", err); } #endif } #ifdef CONFIG_RFS_ACCEL static void netif_napi_affinity_release(struct kref *ref) { struct napi_struct *napi = container_of(ref, struct napi_struct, notify.kref); struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; netdev_assert_locked(napi->dev); WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)); if (!napi->dev->rx_cpu_rmap_auto) return; rmap->obj[napi->napi_rmap_idx] = NULL; napi->napi_rmap_idx = -1; cpu_rmap_put(rmap); } int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) { if (dev->rx_cpu_rmap_auto) return 0; dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs); if (!dev->rx_cpu_rmap) return -ENOMEM; dev->rx_cpu_rmap_auto = true; return 0; } EXPORT_SYMBOL(netif_enable_cpu_rmap); static void netif_del_cpu_rmap(struct net_device *dev) { struct cpu_rmap *rmap = dev->rx_cpu_rmap; if (!dev->rx_cpu_rmap_auto) return; /* Free the rmap */ cpu_rmap_put(rmap); dev->rx_cpu_rmap = NULL; dev->rx_cpu_rmap_auto = false; } #else static void netif_napi_affinity_release(struct kref *ref) { } int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) { return 0; } EXPORT_SYMBOL(netif_enable_cpu_rmap); static void netif_del_cpu_rmap(struct net_device *dev) { } #endif void netif_set_affinity_auto(struct net_device *dev) { unsigned int i, maxqs, numa; maxqs = max(dev->num_tx_queues, dev->num_rx_queues); numa = dev_to_node(&dev->dev); for (i = 0; i < maxqs; i++) cpumask_set_cpu(cpumask_local_spread(i, numa), &dev->napi_config[i].affinity_mask); dev->irq_affinity_auto = true; } EXPORT_SYMBOL(netif_set_affinity_auto); void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) { int rc; netdev_assert_locked_or_invisible(napi->dev); if (napi->irq == irq) return; /* Remove existing resources */ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) irq_set_affinity_notifier(napi->irq, NULL); napi->irq = irq; if (irq < 0 || (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto)) return; /* Abort for buggy drivers */ if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config)) return; #ifdef CONFIG_RFS_ACCEL if (napi->dev->rx_cpu_rmap_auto) { rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi); if (rc < 0) return; cpu_rmap_get(napi->dev->rx_cpu_rmap); napi->napi_rmap_idx = rc; } #endif /* Use core IRQ notifier */ napi->notify.notify = netif_napi_irq_notify; napi->notify.release = netif_napi_affinity_release; rc = irq_set_affinity_notifier(irq, &napi->notify); if (rc) { netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", rc); goto put_rmap; } set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); return; put_rmap: #ifdef CONFIG_RFS_ACCEL if (napi->dev->rx_cpu_rmap_auto) { cpu_rmap_put(napi->dev->rx_cpu_rmap); napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL; napi->napi_rmap_idx = -1; } #endif napi->notify.notify = NULL; napi->notify.release = NULL; } EXPORT_SYMBOL(netif_napi_set_irq_locked); static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; n->irq_suspend_timeout = n->config->irq_suspend_timeout; if (n->dev->irq_affinity_auto && test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state)) irq_set_affinity(n->irq, &n->config->affinity_mask); /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. */ if (n->config->napi_id) { napi_hash_add_with_id(n, n->config->napi_id); } else { napi_hash_add(n); n->config->napi_id = n->napi_id; } } static void napi_save_config(struct napi_struct *n) { n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; n->config->irq_suspend_timeout = n->irq_suspend_timeout; napi_hash_del(n); } /* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will * inherit an existing ID try to insert it at the right position. */ static void netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) { unsigned int new_id, pos_id; struct list_head *higher; struct napi_struct *pos; new_id = UINT_MAX; if (napi->config && napi->config->napi_id) new_id = napi->config->napi_id; higher = &dev->napi_list; list_for_each_entry(pos, &dev->napi_list, dev_list) { if (napi_id_valid(pos->napi_id)) pos_id = pos->napi_id; else if (pos->config) pos_id = pos->config->napi_id; else pos_id = UINT_MAX; if (pos_id <= new_id) break; higher = &pos->dev_list; } list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } /* Double check that napi_get_frags() allocates skbs with * skb->head being backed by slab, not a page fragment. * This is to make sure bug fixed in 3226b158e67c * ("net: avoid 32 x truesize under-estimation for tiny skbs") * does not accidentally come back. */ static void napi_get_frags_check(struct napi_struct *napi) { struct sk_buff *skb; local_bh_disable(); skb = napi_get_frags(napi); WARN_ON_ONCE(skb && skb->head_frag); napi_free_frags(napi); local_bh_enable(); } void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { netdev_assert_locked(dev); if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; INIT_LIST_HEAD(&napi->poll_list); INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; gro_init(&napi->gro); napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, weight); napi->weight = weight; napi->dev = dev; #ifdef CONFIG_NETPOLL napi->poll_owner = -1; #endif napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); netif_napi_dev_list_add(dev, napi); /* default settings from sysfs are applied to all NAPIs. any per-NAPI * configuration will be loaded in napi_enable */ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that * threaded mode will not be enabled in napi_enable(). */ if (dev->threaded && napi_kthread_create(napi)) dev->threaded = false; netif_napi_set_irq_locked(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight_locked); void napi_disable_locked(struct napi_struct *n) { unsigned long val, new; might_sleep(); netdev_assert_locked(n->dev); set_bit(NAPI_STATE_DISABLE, &n->state); val = READ_ONCE(n->state); do { while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); val = READ_ONCE(n->state); } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); if (n->config) napi_save_config(n); else napi_hash_del(n); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable_locked); /** * napi_disable() - prevent NAPI from scheduling * @n: NAPI context * * Stop NAPI from being scheduled on this context. * Waits till any outstanding processing completes. * Takes netdev_lock() for associated net_device. */ void napi_disable(struct napi_struct *n) { netdev_lock(n->dev); napi_disable_locked(n); netdev_unlock(n->dev); } EXPORT_SYMBOL(napi_disable); void napi_enable_locked(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); if (n->config) napi_restore_config(n); else napi_hash_add(n); do { BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); if (n->dev->threaded && n->thread) new |= NAPIF_STATE_THREADED; } while (!try_cmpxchg(&n->state, &val, new)); } EXPORT_SYMBOL(napi_enable_locked); /** * napi_enable() - enable NAPI scheduling * @n: NAPI context * * Enable scheduling of a NAPI instance. * Must be paired with napi_disable(). * Takes netdev_lock() for associated net_device. */ void napi_enable(struct napi_struct *n) { netdev_lock(n->dev); napi_enable_locked(n); netdev_unlock(n->dev); } EXPORT_SYMBOL(napi_enable); /* Must be called in process context */ void __netif_napi_del_locked(struct napi_struct *napi) { netdev_assert_locked(napi->dev); if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; /* Make sure NAPI is disabled (or was never enabled). */ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) irq_set_affinity_notifier(napi->irq, NULL); if (napi->config) { napi->index = -1; napi->config = NULL; } list_del_rcu(&napi->dev_list); napi_free_frags(napi); gro_cleanup(&napi->gro); if (napi->thread) { kthread_stop(napi->thread); napi->thread = NULL; } } EXPORT_SYMBOL(__netif_napi_del_locked); static int __napi_poll(struct napi_struct *n, bool *repoll) { int work, weight; weight = n->weight; /* This NAPI_STATE_SCHED test is for avoiding a race * with netpoll's poll_napi(). Only the entity which * obtains the lock and sees NAPI_STATE_SCHED set will * actually make the ->poll() call. Therefore we avoid * accidentally calling ->poll() when NAPI is not scheduled. */ work = 0; if (napi_is_scheduled(n)) { work = n->poll(n, weight); trace_napi_poll(n, work, weight); xdp_do_check_flushed(n); } if (unlikely(work > weight)) netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", n->poll, work, weight); if (likely(work < weight)) return work; /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code * still "owns" the NAPI instance and therefore can * move the instance around on the list at-will. */ if (unlikely(napi_disable_pending(n))) { napi_complete(n); return work; } /* The NAPI context has more processing work, but busy-polling * is preferred. Exit early. */ if (napi_prefer_busy_poll(n)) { if (napi_complete_done(n, work)) { /* If timeout is not set, we need to make sure * that the NAPI is re-scheduled. */ napi_schedule(n); } return work; } /* Flush too old packets. If HZ < 1000, flush all packets */ gro_flush(&n->gro, HZ >= 1000); gro_normal_list(&n->gro); /* Some drivers may have called napi_schedule * prior to exhausting their budget. */ if (unlikely(!list_empty(&n->poll_list))) { pr_warn_once("%s: Budget exhausted after napi rescheduled\n", n->dev ? n->dev->name : "backlog"); return work; } *repoll = true; return work; } static int napi_poll(struct napi_struct *n, struct list_head *repoll) { bool do_repoll = false; void *have; int work; list_del_init(&n->poll_list); have = netpoll_poll_lock(n); work = __napi_poll(n, &do_repoll); if (do_repoll) list_add_tail(&n->poll_list, repoll); netpoll_poll_unlock(have); return work; } static int napi_thread_wait(struct napi_struct *napi) { set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { /* Testing SCHED_THREADED bit here to make sure the current * kthread owns this napi and could poll on this napi. * Testing SCHED bit is not enough because SCHED bit might be * set by some other busy poll thread or by napi_disable(). */ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) { WARN_ON(!list_empty(&napi->poll_list)); __set_current_state(TASK_RUNNING); return 0; } schedule(); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return -1; } static void napi_threaded_poll_loop(struct napi_struct *napi) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; unsigned long last_qs = jiffies; for (;;) { bool repoll = false; void *have; local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); sd = this_cpu_ptr(&softnet_data); sd->in_napi_threaded_poll = true; have = netpoll_poll_lock(napi); __napi_poll(napi, &repoll); netpoll_poll_unlock(have); sd->in_napi_threaded_poll = false; barrier(); if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } skb_defer_free_flush(sd); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); if (!repoll) break; rcu_softirq_qs_periodic(last_qs); cond_resched(); } } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; while (!napi_thread_wait(napi)) napi_threaded_poll_loop(napi); return 0; } static __latent_entropy void net_rx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int budget = READ_ONCE(net_hotdata.netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); start: sd->in_net_rx_action = true; local_irq_disable(); list_splice_init(&sd->poll_list, &list); local_irq_enable(); for (;;) { struct napi_struct *n; skb_defer_free_flush(sd); if (list_empty(&list)) { if (list_empty(&repoll)) { sd->in_net_rx_action = false; barrier(); /* We need to check if ____napi_schedule() * had refilled poll_list while * sd->in_net_rx_action was true. */ if (!list_empty(&sd->poll_list)) goto start; if (!sd_has_rps_ipi_waiting(sd)) goto end; } break; } n = list_first_entry(&list, struct napi_struct, poll_list); budget -= napi_poll(n, &repoll); /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { sd->time_squeeze++; break; } } local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); list_splice_tail(&repoll, &list); list_splice(&list, &sd->poll_list); if (!list_empty(&sd->poll_list)) __raise_softirq_irqoff(NET_RX_SOFTIRQ); else sd->in_net_rx_action = false; net_rps_action_and_irq_enable(sd); end: bpf_net_ctx_clear(bpf_net_ctx); } struct netdev_adjacent { struct net_device *dev; netdevice_tracker dev_tracker; /* upper master flag, there can only be one master device per list */ bool master; /* lookup ignore flag */ bool ignore; /* counter for the number of times this device was added to us */ u16 ref_nr; /* private field for the users */ void *private; struct list_head list; struct rcu_head rcu; }; static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, struct list_head *adj_list) { struct netdev_adjacent *adj; list_for_each_entry(adj, adj_list, list) { if (adj->dev == adj_dev) return adj; } return NULL; } static int ____netdev_has_upper_dev(struct net_device *upper_dev, struct netdev_nested_priv *priv) { struct net_device *dev = (struct net_device *)priv->data; return upper_dev == dev; } /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks only immediate upper device, * not through a complete stack of devices. The caller must hold the RTNL lock. */ bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .data = (void *)upper_dev, }; ASSERT_RTNL(); return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, &priv); } EXPORT_SYMBOL(netdev_has_upper_dev); /** * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks the entire upper device chain. * The caller must hold rcu lock. */ bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .data = (void *)upper_dev, }; return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, &priv); } EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); /** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.upper); } EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RTNL lock. */ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get); static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master) && !upper->ignore) return upper->dev; return NULL; } /** * netdev_has_any_lower_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to a lower device and return true in case * it is. The caller must hold the RTNL lock. */ static bool netdev_has_any_lower_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.lower); } void *netdev_adjacent_get_private(struct list_head *adj_list) { struct netdev_adjacent *adj; adj = list_entry(adj_list, struct netdev_adjacent, list); return adj->private; } EXPORT_SYMBOL(netdev_adjacent_get_private); /** * netdev_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * * Gets the next device from the dev's upper list, starting from iter * position. The caller must hold RCU read lock. */ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); static struct net_device *__netdev_next_upper_dev(struct net_device *dev, struct list_head **iter, bool *ignore) { struct netdev_adjacent *upper; upper = list_entry((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; *ignore = upper->ignore; return upper->dev; } static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } static int __netdev_walk_all_upper_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; bool ignore; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { udev = __netdev_next_upper_dev(now, &iter, &ignore); if (!udev) break; if (ignore) continue; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { udev = netdev_next_upper_dev_rcu(now, &iter); if (!udev) break; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); static bool __netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .flags = 0, .data = (void *)upper_dev, }; ASSERT_RTNL(); return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, &priv); } /** * netdev_lower_get_next_private - Get the next ->private from the * lower neighbour list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold either hold the * RTNL lock or its own locking that guarantees that the neighbour lower * list will remain unchanged. */ void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private); /** * netdev_lower_get_next_private_rcu - Get the next ->private from the * lower neighbour list, RCU * variant * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold RCU read lock. */ void *netdev_lower_get_next_private_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); /** * netdev_lower_get_next - Get the next device from the lower neighbour * list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent from the dev's lower neighbour * list, starting from iter position. The caller must hold RTNL lock or * its own locking that guarantees that the neighbour lower * list will remain unchanged. */ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->dev; } EXPORT_SYMBOL(netdev_lower_get_next); static struct net_device *netdev_next_lower_dev(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } static struct net_device *__netdev_next_lower_dev(struct net_device *dev, struct list_head **iter, bool *ignore) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; *ignore = lower->ignore; return lower->dev; } int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); static int __netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; bool ignore; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = __netdev_next_lower_dev(now, &iter, &ignore); if (!ldev) break; if (ignore) continue; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } EXPORT_SYMBOL(netdev_next_lower_dev_rcu); static u8 __netdev_upper_depth(struct net_device *dev) { struct net_device *udev; struct list_head *iter; u8 max_depth = 0; bool ignore; for (iter = &dev->adj_list.upper, udev = __netdev_next_upper_dev(dev, &iter, &ignore); udev; udev = __netdev_next_upper_dev(dev, &iter, &ignore)) { if (ignore) continue; if (max_depth < udev->upper_level) max_depth = udev->upper_level; } return max_depth; } static u8 __netdev_lower_depth(struct net_device *dev) { struct net_device *ldev; struct list_head *iter; u8 max_depth = 0; bool ignore; for (iter = &dev->adj_list.lower, ldev = __netdev_next_lower_dev(dev, &iter, &ignore); ldev; ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) { if (ignore) continue; if (max_depth < ldev->lower_level) max_depth = ldev->lower_level; } return max_depth; } static int __netdev_update_upper_level(struct net_device *dev, struct netdev_nested_priv *__unused) { dev->upper_level = __netdev_upper_depth(dev) + 1; return 0; } #ifdef CONFIG_LOCKDEP static LIST_HEAD(net_unlink_list); static void net_unlink_todo(struct net_device *dev) { if (list_empty(&dev->unlink_list)) list_add_tail(&dev->unlink_list, &net_unlink_list); } #endif static int __netdev_update_lower_level(struct net_device *dev, struct netdev_nested_priv *priv) { dev->lower_level = __netdev_lower_depth(dev) + 1; #ifdef CONFIG_LOCKDEP if (!priv) return 0; if (priv->flags & NESTED_SYNC_IMM) dev->nested_level = dev->lower_level - 1; if (priv->flags & NESTED_SYNC_TODO) net_unlink_todo(dev); #endif return 0; } int netdev_walk_all_lower_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, struct netdev_nested_priv *priv), struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, priv); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev_rcu(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); /** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU * variant * @dev: device * * Gets the first netdev_adjacent->private from the dev's lower neighbour * list. The caller must hold RCU read lock. */ void *netdev_lower_get_first_private_rcu(struct net_device *dev) { struct netdev_adjacent *lower; lower = list_first_or_null_rcu(&dev->adj_list.lower, struct netdev_adjacent, list); if (lower) return lower->private; return NULL; } EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RCU read lock. */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { struct netdev_adjacent *upper; upper = list_first_or_null_rcu(&dev->adj_list.upper, struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int netdev_adjacent_sysfs_add(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", adj_dev->name); return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), linkname); } static void netdev_adjacent_sysfs_del(struct net_device *dev, char *name, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", name); sysfs_remove_link(&(dev->dev.kobj), linkname); } static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { return (dev_list == &dev->adj_list.upper || dev_list == &dev->adj_list.lower) && net_eq(dev_net(dev), dev_net(adj_dev)); } static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list, void *private, bool master) { struct netdev_adjacent *adj; int ret; adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { adj->ref_nr += 1; pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", dev->name, adj_dev->name, adj->ref_nr); return 0; } adj = kmalloc(sizeof(*adj), GFP_KERNEL); if (!adj) return -ENOMEM; adj->dev = adj_dev; adj->master = master; adj->ref_nr = 1; adj->private = private; adj->ignore = false; netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); if (ret) goto free_adj; } /* Ensure that master link is always the first item in list. */ if (master) { ret = sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), "master"); if (ret) goto remove_symlinks; list_add_rcu(&adj->list, dev_list); } else { list_add_tail_rcu(&adj->list, dev_list); } return 0; remove_symlinks: if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: netdev_put(adj_dev, &adj->dev_tracker); kfree(adj); return ret; } static void __netdev_adjacent_dev_remove(struct net_device *dev, struct net_device *adj_dev, u16 ref_nr, struct list_head *dev_list) { struct netdev_adjacent *adj; pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", dev->name, adj_dev->name, ref_nr); adj = __netdev_find_adj(adj_dev, dev_list); if (!adj) { pr_err("Adjacency does not exist for device %s from %s\n", dev->name, adj_dev->name); WARN_ON(1); return; } if (adj->ref_nr > ref_nr) { pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", dev->name, adj_dev->name, ref_nr, adj->ref_nr - ref_nr); adj->ref_nr -= ref_nr; return; } if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); netdev_put(adj_dev, &adj->dev_tracker); kfree_rcu(adj, rcu); } static int __netdev_adjacent_dev_link_lists(struct net_device *dev, struct net_device *upper_dev, struct list_head *up_list, struct list_head *down_list, void *private, bool master) { int ret; ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, master); if (ret) return ret; ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, false); if (ret) { __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); return ret; } return 0; } static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, struct net_device *upper_dev, u16 ref_nr, struct list_head *up_list, struct list_head *down_list) { __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); } static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device *upper_dev, void *private, bool master) { return __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->adj_list.upper, &upper_dev->adj_list.lower, private, master); } static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, struct net_device *upper_dev) { __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower); } static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info, struct netdev_nested_priv *priv, struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, .extack = extack, }, .upper_dev = upper_dev, .master = master, .linking = true, .upper_info = upper_info, }; struct net_device *master_dev; int ret = 0; ASSERT_RTNL(); if (dev == upper_dev) return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ if (__netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) return -EMLINK; if (!master) { if (__netdev_has_upper_dev(dev, upper_dev)) return -EEXIST; } else { master_dev = __netdev_master_upper_dev_get(dev); if (master_dev) return master_dev == upper_dev ? -EEXIST : -EBUSY; } ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) return ret; ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, master); if (ret) return ret; ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) goto rollback; __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, priv); return 0; rollback: __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; } /** * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. * On success the reference counts are adjusted and the function * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL, &priv, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); /** * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device * @upper_priv: upper device private * @upper_info: upper info to be passed down via notifier * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices * might be linked as well. The caller must hold the RTNL lock. * On a failure a negative errno code is returned. On success the reference * counts are adjusted and the function returns zero. */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv, upper_info, &priv, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); static void __netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev, struct netdev_nested_priv *priv) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, }, .upper_dev = upper_dev, .linking = false, }; ASSERT_RTNL(); changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, priv); } /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device * @upper_dev: new upper device * * Removes a link to device which is upper to this one. The caller must hold * the RTNL lock. */ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_TODO, .data = NULL, }; __netdev_upper_dev_unlink(dev, upper_dev, &priv); } EXPORT_SYMBOL(netdev_upper_dev_unlink); static void __netdev_adjacent_dev_set(struct net_device *upper_dev, struct net_device *lower_dev, bool val) { struct netdev_adjacent *adj; adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); if (adj) adj->ignore = val; adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); if (adj) adj->ignore = val; } static void netdev_adjacent_dev_disable(struct net_device *upper_dev, struct net_device *lower_dev) { __netdev_adjacent_dev_set(upper_dev, lower_dev, true); } static void netdev_adjacent_dev_enable(struct net_device *upper_dev, struct net_device *lower_dev) { __netdev_adjacent_dev_set(upper_dev, lower_dev, false); } int netdev_adjacent_change_prepare(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_nested_priv priv = { .flags = 0, .data = NULL, }; int err; if (!new_dev) return 0; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_disable(dev, old_dev); err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, extack); if (err) { if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); return err; } return 0; } EXPORT_SYMBOL(netdev_adjacent_change_prepare); void netdev_adjacent_change_commit(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { struct netdev_nested_priv priv = { .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, .data = NULL, }; if (!new_dev || !old_dev) return; if (new_dev == old_dev) return; netdev_adjacent_dev_enable(dev, old_dev); __netdev_upper_dev_unlink(old_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_commit); void netdev_adjacent_change_abort(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { struct netdev_nested_priv priv = { .flags = 0, .data = NULL, }; if (!new_dev) return; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); __netdev_upper_dev_unlink(new_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_abort); /** * netdev_bonding_info_change - Dispatch event about slave change * @dev: device * @bonding_info: info to dispatch * * Send NETDEV_BONDING_INFO to netdev notifiers with info. * The caller must hold the RTNL lock. */ void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { struct netdev_notifier_bonding_info info = { .info.dev = dev, }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); static int netdev_offload_xstats_enable_l3(struct net_device *dev, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, }; int err; int rc; dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), GFP_KERNEL); if (!dev->offload_xstats_l3) return -ENOMEM; rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, NETDEV_OFFLOAD_XSTATS_DISABLE, &info.info); err = notifier_to_errno(rc); if (err) goto free_stats; return 0; free_stats: kfree(dev->offload_xstats_l3); dev->offload_xstats_l3 = NULL; return err; } int netdev_offload_xstats_enable(struct net_device *dev, enum netdev_offload_xstats_type type, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (netdev_offload_xstats_enabled(dev, type)) return -EALREADY; switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return netdev_offload_xstats_enable_l3(dev, extack); } WARN_ON(1); return -EINVAL; } EXPORT_SYMBOL(netdev_offload_xstats_enable); static void netdev_offload_xstats_disable_l3(struct net_device *dev) { struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, }; call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, &info.info); kfree(dev->offload_xstats_l3); dev->offload_xstats_l3 = NULL; } int netdev_offload_xstats_disable(struct net_device *dev, enum netdev_offload_xstats_type type) { ASSERT_RTNL(); if (!netdev_offload_xstats_enabled(dev, type)) return -EALREADY; switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: netdev_offload_xstats_disable_l3(dev); return 0; } WARN_ON(1); return -EINVAL; } EXPORT_SYMBOL(netdev_offload_xstats_disable); static void netdev_offload_xstats_disable_all(struct net_device *dev) { netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); } static struct rtnl_hw_stats64 * netdev_offload_xstats_get_ptr(const struct net_device *dev, enum netdev_offload_xstats_type type) { switch (type) { case NETDEV_OFFLOAD_XSTATS_TYPE_L3: return dev->offload_xstats_l3; } WARN_ON(1); return NULL; } bool netdev_offload_xstats_enabled(const struct net_device *dev, enum netdev_offload_xstats_type type) { ASSERT_RTNL(); return netdev_offload_xstats_get_ptr(dev, type); } EXPORT_SYMBOL(netdev_offload_xstats_enabled); struct netdev_notifier_offload_xstats_ru { bool used; }; struct netdev_notifier_offload_xstats_rd { struct rtnl_hw_stats64 stats; bool used; }; static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, const struct rtnl_hw_stats64 *src) { dest->rx_packets += src->rx_packets; dest->tx_packets += src->tx_packets; dest->rx_bytes += src->rx_bytes; dest->tx_bytes += src->tx_bytes; dest->rx_errors += src->rx_errors; dest->tx_errors += src->tx_errors; dest->rx_dropped += src->rx_dropped; dest->tx_dropped += src->tx_dropped; dest->multicast += src->multicast; } static int netdev_offload_xstats_get_used(struct net_device *dev, enum netdev_offload_xstats_type type, bool *p_used, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_ru report_used = {}; struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = type, .report_used = &report_used, }; int rc; WARN_ON(!netdev_offload_xstats_enabled(dev, type)); rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, &info.info); *p_used = report_used.used; return notifier_to_errno(rc); } static int netdev_offload_xstats_get_stats(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *p_stats, bool *p_used, struct netlink_ext_ack *extack) { struct netdev_notifier_offload_xstats_rd report_delta = {}; struct netdev_notifier_offload_xstats_info info = { .info.dev = dev, .info.extack = extack, .type = type, .report_delta = &report_delta, }; struct rtnl_hw_stats64 *stats; int rc; stats = netdev_offload_xstats_get_ptr(dev, type); if (WARN_ON(!stats)) return -EINVAL; rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, &info.info); /* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost. */ netdev_hw_stats64_add(stats, &report_delta.stats); if (p_stats) *p_stats = *stats; *p_used = report_delta.used; return notifier_to_errno(rc); } int netdev_offload_xstats_get(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *p_stats, bool *p_used, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (p_stats) return netdev_offload_xstats_get_stats(dev, type, p_stats, p_used, extack); else return netdev_offload_xstats_get_used(dev, type, p_used, extack); } EXPORT_SYMBOL(netdev_offload_xstats_get); void netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, const struct rtnl_hw_stats64 *stats) { report_delta->used = true; netdev_hw_stats64_add(&report_delta->stats, stats); } EXPORT_SYMBOL(netdev_offload_xstats_report_delta); void netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) { report_used->used = true; } EXPORT_SYMBOL(netdev_offload_xstats_report_used); void netdev_offload_xstats_push_delta(struct net_device *dev, enum netdev_offload_xstats_type type, const struct rtnl_hw_stats64 *p_stats) { struct rtnl_hw_stats64 *stats; ASSERT_RTNL(); stats = netdev_offload_xstats_get_ptr(dev, type); if (WARN_ON(!stats)) return; netdev_hw_stats64_add(stats, p_stats); } EXPORT_SYMBOL(netdev_offload_xstats_push_delta); /** * netdev_get_xmit_slave - Get the xmit slave of master device * @dev: device * @skb: The packet * @all_slaves: assume all the slaves are active * * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. * %NULL is returned if no slave is found. */ struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_xmit_slave) return NULL; return ops->ndo_get_xmit_slave(dev, skb, all_slaves); } EXPORT_SYMBOL(netdev_get_xmit_slave); static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, struct sock *sk) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_sk_get_lower_dev) return NULL; return ops->ndo_sk_get_lower_dev(dev, sk); } /** * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket * @dev: device * @sk: the socket * * %NULL is returned if no lower device is found. */ struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, struct sock *sk) { struct net_device *lower; lower = netdev_sk_get_lower_dev(dev, sk); while (lower) { dev = lower; lower = netdev_sk_get_lower_dev(dev, sk); } return dev; } EXPORT_SYMBOL(netdev_sk_get_lowest_dev); static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.lower); } } static void netdev_adjacent_del_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.lower); } } void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); } } void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev) { struct netdev_adjacent *lower; if (!lower_dev) return NULL; lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); if (!lower) return NULL; return lower->private; } EXPORT_SYMBOL(netdev_lower_dev_get_private); /** * netdev_lower_state_changed - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. * The caller must hold the RTNL lock. */ void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info) { struct netdev_notifier_changelowerstate_info changelowerstate_info = { .info.dev = lower_dev, }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_rx_flags) ops->ndo_change_rx_flags(dev, flags); } static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; unsigned int promiscuity, flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); promiscuity = dev->promiscuity + inc; if (promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. */ if (unlikely(inc > 0)) { netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } flags = old_flags & ~IFF_PROMISC; } else { flags = old_flags | IFF_PROMISC; } WRITE_ONCE(dev->promiscuity, promiscuity); if (flags != old_flags) { WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s promiscuous mode\n", dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { current_uid_gid(&uid, &gid); audit_log(audit_context(), GFP_ATOMIC, AUDIT_ANOM_PROMISCUOUS, "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", dev->name, (dev->flags & IFF_PROMISC), (old_flags & IFF_PROMISC), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, uid), from_kgid(&init_user_ns, gid), audit_get_sessionid(current)); } dev_change_rx_flags(dev, IFF_PROMISC); } if (notify) __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); return 0; } /** * dev_set_promiscuity - update promiscuity count on a device * @dev: device * @inc: modifier * * Add or remove promiscuity from a device. While the count in the device * remains above zero the interface remains promiscuous. Once it hits zero * the device reverts back to normal filtering operation. A negative inc * value is used to drop promiscuity on the device. * Return 0 if successful or a negative errno code on error. */ int dev_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); return err; } EXPORT_SYMBOL(dev_set_promiscuity); static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; unsigned int allmulti, flags; ASSERT_RTNL(); allmulti = dev->allmulti + inc; if (allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ if (unlikely(inc > 0)) { netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } flags = old_flags & ~IFF_ALLMULTI; } else { flags = old_flags | IFF_ALLMULTI; } WRITE_ONCE(dev->allmulti, allmulti); if (flags != old_flags) { WRITE_ONCE(dev->flags, flags); netdev_info(dev, "%s allmulticast mode\n", dev->flags & IFF_ALLMULTI ? "entered" : "left"); dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); if (notify) __dev_notify_flags(dev, old_flags, dev->gflags ^ old_gflags, 0, NULL); } return 0; } /** * dev_set_allmulti - update allmulti count on a device * @dev: device * @inc: modifier * * Add or remove reception of all multicast frames to a device. While the * count in the device remains above zero the interface remains listening * to all interfaces. Once it hits zero the device reverts back to normal * filtering operation. A negative @inc value is used to drop the counter * when releasing a resource needing all multicasts. * Return 0 if successful or a negative errno code on error. */ int dev_set_allmulti(struct net_device *dev, int inc) { return __dev_set_allmulti(dev, inc, true); } EXPORT_SYMBOL(dev_set_allmulti); /* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast * filtering it is put in promiscuous mode while unicast addresses * are present. */ void __dev_set_rx_mode(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; /* dev_open will call this function so the list will stay sane. */ if (!(dev->flags&IFF_UP)) return; if (!netif_device_present(dev)) return; if (!(dev->priv_flags & IFF_UNICAST_FLT)) { /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } if (ops->ndo_set_rx_mode) ops->ndo_set_rx_mode(dev); } void dev_set_rx_mode(struct net_device *dev) { netif_addr_lock_bh(dev); __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); } /** * dev_get_flags - get flags reported to userspace * @dev: device * * Get the combination of flag bits exported through APIs to userspace. */ unsigned int dev_get_flags(const struct net_device *dev) { unsigned int flags; flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | (READ_ONCE(dev->gflags) & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { if (netif_oper_up(dev)) flags |= IFF_RUNNING; if (netif_carrier_ok(dev)) flags |= IFF_LOWER_UP; if (netif_dormant(dev)) flags |= IFF_DORMANT; } return flags; } EXPORT_SYMBOL(dev_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; ASSERT_RTNL(); /* * Set the flags on our device. */ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | IFF_AUTOMEDIA)) | (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | IFF_ALLMULTI)); /* * Load in the correct multicast list now the flags have changed. */ if ((old_flags ^ flags) & IFF_MULTICAST) dev_change_rx_flags(dev, IFF_MULTICAST); dev_set_rx_mode(dev); /* * Have we downed the interface. We handle IFF_UP ourselves * according to user attempts to set it, rather than blindly * setting it. */ ret = 0; if ((old_flags ^ flags) & IFF_UP) { if (old_flags & IFF_UP) __dev_close(dev); else ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; if (__dev_set_promiscuity(dev, inc, false) >= 0) if (dev->flags != old_flags) dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI * is important. Some (broken) drivers set IFF_PROMISC, when * IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; __dev_set_allmulti(dev, inc, false); } return ret; } void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int gchanges, u32 portid, const struct nlmsghdr *nlh) { unsigned int changes = dev->flags ^ old_flags; if (gchanges) rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh); if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); else call_netdevice_notifiers(NETDEV_DOWN, dev); } if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { struct netdev_notifier_change_info change_info = { .info = { .dev = dev, }, .flags_changed = changes, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } /** * dev_change_flags - change device settings * @dev: device * @flags: device state flags * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. */ int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } EXPORT_SYMBOL(dev_change_flags); int __dev_set_mtu(struct net_device *dev, int new_mtu) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_mtu) return ops->ndo_change_mtu(dev, new_mtu); /* Pairs with all the lockless reads of dev->mtu in the stack */ WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(__dev_set_mtu); int dev_validate_mtu(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { /* MTU must be positive, and in range */ if (new_mtu < 0 || new_mtu < dev->min_mtu) { NL_SET_ERR_MSG(extack, "mtu less than device minimum"); return -EINVAL; } if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); return -EINVAL; } return 0; } /** * dev_set_mtu_ext - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit * @extack: netlink extended ack * * Change the maximum transfer size of the network device. */ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { int err, orig_mtu; if (new_mtu == dev->mtu) return 0; err = dev_validate_mtu(dev, new_mtu, extack); if (err) return err; if (!netif_device_present(dev)) return -ENODEV; err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); err = notifier_to_errno(err); if (err) return err; orig_mtu = dev->mtu; err = __dev_set_mtu(dev, new_mtu); if (!err) { err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, orig_mtu); err = notifier_to_errno(err); if (err) { /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes. */ __dev_set_mtu(dev, orig_mtu); call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu); } } return err; } int dev_set_mtu(struct net_device *dev, int new_mtu) { struct netlink_ext_ack extack; int err; memset(&extack, 0, sizeof(extack)); err = dev_set_mtu_ext(dev, new_mtu, &extack); if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } EXPORT_SYMBOL(dev_set_mtu); /** * dev_change_tx_queue_len - Change TX queue length of a netdevice * @dev: device * @new_len: new tx queue length */ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) { unsigned int orig_len = dev->tx_queue_len; int res; if (new_len != (unsigned int)new_len) return -ERANGE; if (new_len != orig_len) { WRITE_ONCE(dev->tx_queue_len, new_len); res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); res = notifier_to_errno(res); if (res) goto err_rollback; res = dev_qdisc_change_tx_queue_len(dev); if (res) goto err_rollback; } return 0; err_rollback: netdev_err(dev, "refused to change device tx_queue_len\n"); WRITE_ONCE(dev->tx_queue_len, orig_len); return res; } /** * dev_set_group - Change group this device belongs to * @dev: device * @new_group: group this device should belong to */ void dev_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } /** * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. * @dev: device * @addr: new address * @extack: netlink extended ack */ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack) { struct netdev_notifier_pre_changeaddr_info info = { .info.dev = dev, .info.extack = extack, .dev_addr = addr, }; int rc; rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); return notifier_to_errno(rc); } EXPORT_SYMBOL(dev_pre_changeaddr_notify); /** * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address * @extack: netlink extended ack * * Change the hardware (MAC) address of the device */ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; if (sa->sa_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); if (err) return err; if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) { err = ops->ndo_set_mac_address(dev, sa); if (err) return err; } dev->addr_assign_type = NET_ADDR_SET; call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } EXPORT_SYMBOL(dev_set_mac_address); DECLARE_RWSEM(dev_addr_sem); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) { int ret; down_write(&dev_addr_sem); ret = dev_set_mac_address(dev, sa, extack); up_write(&dev_addr_sem); return ret; } EXPORT_SYMBOL(dev_set_mac_address_user); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { size_t size = sizeof(sa->sa_data_min); struct net_device *dev; int ret = 0; down_read(&dev_addr_sem); rcu_read_lock(); dev = dev_get_by_name_rcu(net, dev_name); if (!dev) { ret = -ENODEV; goto unlock; } if (!dev->addr_len) memset(sa->sa_data, 0, size); else memcpy(sa->sa_data, dev->dev_addr, min_t(size_t, size, dev->addr_len)); sa->sa_family = dev->type; unlock: rcu_read_unlock(); up_read(&dev_addr_sem); return ret; } EXPORT_SYMBOL(dev_get_mac_address); /** * dev_change_carrier - Change device carrier * @dev: device * @new_carrier: new value * * Change device carrier */ int dev_change_carrier(struct net_device *dev, bool new_carrier) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_change_carrier) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; return ops->ndo_change_carrier(dev, new_carrier); } /** * dev_get_phys_port_id - Get device physical port ID * @dev: device * @ppid: port ID * * Get device physical port ID */ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_phys_port_id) return -EOPNOTSUPP; return ops->ndo_get_phys_port_id(dev, ppid); } /** * dev_get_phys_port_name - Get device physical port name * @dev: device * @name: port name * @len: limit of bytes to copy to name * * Get device physical port name */ int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (ops->ndo_get_phys_port_name) { err = ops->ndo_get_phys_port_name(dev, name, len); if (err != -EOPNOTSUPP) return err; } return devlink_compat_phys_port_name_get(dev, name, len); } /** * dev_get_port_parent_id - Get the device's port parent identifier * @dev: network device * @ppid: pointer to a storage for the port's parent identifier * @recurse: allow/disallow recursion to lower devices * * Get the devices's port parent identifier */ int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse) { const struct net_device_ops *ops = dev->netdev_ops; struct netdev_phys_item_id first = { }; struct net_device *lower_dev; struct list_head *iter; int err; if (ops->ndo_get_port_parent_id) { err = ops->ndo_get_port_parent_id(dev, ppid); if (err != -EOPNOTSUPP) return err; } err = devlink_compat_switch_id_get(dev, ppid); if (!recurse || err != -EOPNOTSUPP) return err; netdev_for_each_lower_dev(dev, lower_dev, iter) { err = dev_get_port_parent_id(lower_dev, ppid, true); if (err) break; if (!first.id_len) first = *ppid; else if (memcmp(&first, ppid, sizeof(*ppid))) return -EOPNOTSUPP; } return err; } EXPORT_SYMBOL(dev_get_port_parent_id); /** * netdev_port_same_parent_id - Indicate if two network devices have * the same port parent identifier * @a: first network device * @b: second network device */ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) { struct netdev_phys_item_id a_id = { }; struct netdev_phys_item_id b_id = { }; if (dev_get_port_parent_id(a, &a_id, true) || dev_get_port_parent_id(b, &b_id, true)) return false; return netdev_phys_item_id_same(&a_id, &b_id); } EXPORT_SYMBOL(netdev_port_same_parent_id); /** * dev_change_proto_down - set carrier according to proto_down. * * @dev: device * @proto_down: new value */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { if (!dev->change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; if (proto_down) netif_carrier_off(dev); else netif_carrier_on(dev); WRITE_ONCE(dev->proto_down, proto_down); return 0; } /** * dev_change_proto_down_reason - proto down reason * * @dev: device * @mask: proto down mask * @value: proto down value */ void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value) { u32 proto_down_reason; int b; if (!mask) { proto_down_reason = value; } else { proto_down_reason = dev->proto_down_reason; for_each_set_bit(b, &mask, 32) { if (value & (1 << b)) proto_down_reason |= BIT(b); else proto_down_reason &= ~BIT(b); } } WRITE_ONCE(dev->proto_down_reason, proto_down_reason); } struct bpf_xdp_link { struct bpf_link link; struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ int flags; }; static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags) { if (flags & XDP_FLAGS_HW_MODE) return XDP_MODE_HW; if (flags & XDP_FLAGS_DRV_MODE) return XDP_MODE_DRV; if (flags & XDP_FLAGS_SKB_MODE) return XDP_MODE_SKB; return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB; } static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) { switch (mode) { case XDP_MODE_SKB: return generic_xdp_install; case XDP_MODE_DRV: case XDP_MODE_HW: return dev->netdev_ops->ndo_bpf; default: return NULL; } } static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, enum bpf_xdp_mode mode) { return dev->xdp_state[mode].link; } static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_xdp_link *link = dev_xdp_link(dev, mode); if (link) return link->link.prog; return dev->xdp_state[mode].prog; } u8 dev_xdp_prog_count(struct net_device *dev) { u8 count = 0; int i; for (i = 0; i < __MAX_XDP_MODE; i++) if (dev->xdp_state[i].prog || dev->xdp_state[i].link) count++; return count; } EXPORT_SYMBOL_GPL(dev_xdp_prog_count); u8 dev_xdp_sb_prog_count(struct net_device *dev) { u8 count = 0; int i; for (i = 0; i < __MAX_XDP_MODE; i++) if (dev->xdp_state[i].prog && !dev->xdp_state[i].prog->aux->xdp_has_frags) count++; return count; } int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) { if (!dev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && bpf->command == XDP_SETUP_PROG && bpf->prog && !bpf->prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using tcp-data-split"); return -EBUSY; } if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); return -EBUSY; } return dev->netdev_ops->ndo_bpf(dev, bpf); } EXPORT_SYMBOL_GPL(dev_xdp_propagate); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); return prog ? prog->aux->id : 0; } static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_xdp_link *link) { dev->xdp_state[mode].link = link; dev->xdp_state[mode].prog = NULL; } static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_prog *prog) { dev->xdp_state[mode].link = NULL; dev->xdp_state[mode].prog = prog; } static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { struct netdev_bpf xdp; int err; if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && prog && !prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); return -EBUSY; } if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); return -EBUSY; } memset(&xdp, 0, sizeof(xdp)); xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; xdp.flags = flags; xdp.prog = prog; /* Drivers assume refcnt is already incremented (i.e, prog pointer is * "moved" into driver), so they don't increment it on their own, but * they do decrement refcnt when program is detached or replaced. * Given net_device also owns link/prog, we need to bump refcnt here * to prevent drivers from underflowing it. */ if (prog) bpf_prog_inc(prog); err = bpf_op(dev, &xdp); if (err) { if (prog) bpf_prog_put(prog); return err; } if (mode != XDP_MODE_HW) bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); return 0; } static void dev_xdp_uninstall(struct net_device *dev) { struct bpf_xdp_link *link; struct bpf_prog *prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; ASSERT_RTNL(); for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { prog = dev_xdp_prog(dev, mode); if (!prog) continue; bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) continue; WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); /* auto-detach link from net device */ link = dev_xdp_link(dev, mode); if (link) link->dev = NULL; else bpf_prog_put(prog); dev_xdp_set_link(dev, mode, NULL); } } static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog, u32 flags) { unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; struct net_device *upper; struct list_head *iter; enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err; ASSERT_RTNL(); /* either link or prog attachment, never both */ if (link && (new_prog || old_prog)) return -EINVAL; /* link supports only XDP mode flags */ if (link && (flags & ~XDP_FLAGS_MODES)) { NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); return -EINVAL; } /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ if (num_modes > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); return -EINVAL; } /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ if (!num_modes && dev_xdp_prog_count(dev) > 1) { NL_SET_ERR_MSG(extack, "More than one program loaded, unset mode is ambiguous"); return -EINVAL; } /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); return -EINVAL; } mode = dev_xdp_mode(dev, flags); /* can't replace attached link */ if (dev_xdp_link(dev, mode)) { NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); return -EBUSY; } /* don't allow if an upper device already has a program */ netdev_for_each_upper_dev_rcu(dev, upper, iter) { if (dev_xdp_prog_count(upper) > 0) { NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); return -EEXIST; } } cur_prog = dev_xdp_prog(dev, mode); /* can't replace attached prog with link */ if (link && cur_prog) { NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); return -EBUSY; } if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; } /* put effective new program into new_prog */ if (link) new_prog = link->link.prog; if (new_prog) { bool offload = mode == XDP_MODE_HW; enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB ? XDP_MODE_DRV : XDP_MODE_SKB; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { NL_SET_ERR_MSG(extack, "XDP program already attached"); return -EBUSY; } if (!offload && dev_xdp_prog(dev, other_mode)) { NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } if (!offload && bpf_prog_is_offloaded(new_prog->aux)) { NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported"); return -EINVAL; } if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) { NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL; } if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); return -EINVAL; } } /* don't call drivers if the effective program didn't change */ if (new_prog != cur_prog) { bpf_op = dev_xdp_bpf_op(dev, mode); if (!bpf_op) { NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); return -EOPNOTSUPP; } err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); if (err) return err; } if (link) dev_xdp_set_link(dev, mode, link); else dev_xdp_set_prog(dev, mode, new_prog); if (cur_prog) bpf_prog_put(cur_prog); return 0; } static int dev_xdp_attach_link(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link) { return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); } static int dev_xdp_detach_link(struct net_device *dev, struct netlink_ext_ack *extack, struct bpf_xdp_link *link) { enum bpf_xdp_mode mode; bpf_op_t bpf_op; ASSERT_RTNL(); mode = dev_xdp_mode(dev, link->flags); if (dev_xdp_link(dev, mode) != link) return -EINVAL; bpf_op = dev_xdp_bpf_op(dev, mode); WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); dev_xdp_set_link(dev, mode, NULL); return 0; } static void bpf_xdp_link_release(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); rtnl_lock(); /* if racing with net_device's tear down, xdp_link->dev might be * already NULL, in which case link was already auto-detached */ if (xdp_link->dev) { WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); xdp_link->dev = NULL; } rtnl_unlock(); } static int bpf_xdp_link_detach(struct bpf_link *link) { bpf_xdp_link_release(link); return 0; } static void bpf_xdp_link_dealloc(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); kfree(xdp_link); } static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); u32 ifindex = 0; rtnl_lock(); if (xdp_link->dev) ifindex = xdp_link->dev->ifindex; rtnl_unlock(); seq_printf(seq, "ifindex:\t%u\n", ifindex); } static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); u32 ifindex = 0; rtnl_lock(); if (xdp_link->dev) ifindex = xdp_link->dev->ifindex; rtnl_unlock(); info->xdp.ifindex = ifindex; return 0; } static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err = 0; rtnl_lock(); /* link might have been auto-released already, so fail */ if (!xdp_link->dev) { err = -ENOLINK; goto out_unlock; } if (old_prog && link->prog != old_prog) { err = -EPERM; goto out_unlock; } old_prog = link->prog; if (old_prog->type != new_prog->type || old_prog->expected_attach_type != new_prog->expected_attach_type) { err = -EINVAL; goto out_unlock; } if (old_prog == new_prog) { /* no-op, don't disturb drivers */ bpf_prog_put(new_prog); goto out_unlock; } mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, xdp_link->flags, new_prog); if (err) goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: rtnl_unlock(); return err; } static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, .detach = bpf_xdp_link_detach, .show_fdinfo = bpf_xdp_link_show_fdinfo, .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, }; int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct net *net = current->nsproxy->net_ns; struct bpf_link_primer link_primer; struct netlink_ext_ack extack = {}; struct bpf_xdp_link *link; struct net_device *dev; int err, fd; rtnl_lock(); dev = dev_get_by_index(net, attr->link_create.target_ifindex); if (!dev) { rtnl_unlock(); return -EINVAL; } link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); link->dev = dev; link->flags = attr->link_create.flags; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto unlock; } err = dev_xdp_attach_link(dev, &extack, link); rtnl_unlock(); if (err) { link->dev = NULL; bpf_link_cleanup(&link_primer); trace_bpf_xdp_link_attach_failed(extack._msg); goto out_put_dev; } fd = bpf_link_settle(&link_primer); /* link itself doesn't hold dev's refcnt to not complicate shutdown */ dev_put(dev); return fd; unlock: rtnl_unlock(); out_put_dev: dev_put(dev); return err; } /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device * @extack: netlink extended ack * @fd: new program fd or negative value to clear * @expected_fd: old program fd that userspace expects to replace or clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags) { enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags); struct bpf_prog *new_prog = NULL, *old_prog = NULL; int err; ASSERT_RTNL(); if (fd >= 0) { new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, mode != XDP_MODE_SKB); if (IS_ERR(new_prog)) return PTR_ERR(new_prog); } if (expected_fd >= 0) { old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, mode != XDP_MODE_SKB); if (IS_ERR(old_prog)) { err = PTR_ERR(old_prog); old_prog = NULL; goto err_out; } } err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); err_out: if (err && new_prog) bpf_prog_put(new_prog); if (old_prog) bpf_prog_put(old_prog); return err; } u32 dev_get_min_mp_channel_count(const struct net_device *dev) { int i; ASSERT_RTNL(); for (i = dev->real_num_rx_queues - 1; i >= 0; i--) if (dev->_rx[i].mp_params.mp_priv) /* The channel count is the idx plus 1. */ return i + 1; return 0; } /** * dev_index_reserve() - allocate an ifindex in a namespace * @net: the applicable net namespace * @ifindex: requested ifindex, pass %0 to get one allocated * * Allocate a ifindex for a new device. Caller must either use the ifindex * to store the device (via list_netdevice()) or call dev_index_release() * to give the index up. * * Return: a suitable unique value for a new device interface number or -errno. */ static int dev_index_reserve(struct net *net, u32 ifindex) { int err; if (ifindex > INT_MAX) { DEBUG_NET_WARN_ON_ONCE(1); return -EINVAL; } if (!ifindex) err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL, xa_limit_31b, &net->ifindex, GFP_KERNEL); else err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL); if (err < 0) return err; return ifindex; } static void dev_index_release(struct net *net, int ifindex) { /* Expect only unused indexes, unlist_netdevice() removes the used */ WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } static bool from_cleanup_net(void) { #ifdef CONFIG_NET_NS return current == cleanup_net_task; #else return false; #endif } /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, struct net_device *upper, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(upper->wanted_features & feature) && (features & feature)) { netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", &feature, upper->name); features &= ~feature; } } return features; } static void netdev_sync_lower_features(struct net_device *upper, struct net_device *lower, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(features & feature) && (lower->features & feature)) { netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", &feature, lower->name); lower->wanted_features &= ~feature; __netdev_update_features(lower); if (unlikely(lower->features & feature)) netdev_WARN(upper, "failed to disable %pNF on %s!\n", &feature, lower->name); else netdev_features_change(lower); } } } static bool netdev_has_ip_or_hw_csum(netdev_features_t features) { netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; bool ip_csum = (features & ip_csum_mask) == ip_csum_mask; bool hw_csum = features & NETIF_F_HW_CSUM; return ip_csum || hw_csum; } static netdev_features_t netdev_fix_features(struct net_device *dev, netdev_features_t features) { /* Fix illegal checksum combinations */ if ((features & NETIF_F_HW_CSUM) && (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { netdev_warn(dev, "mixed HW and IP checksum settings.\n"); features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } /* TSO requires that SG is present as well. */ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); features &= ~NETIF_F_ALL_TSO; } if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && !(features & NETIF_F_IP_CSUM)) { netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); features &= ~NETIF_F_TSO; features &= ~NETIF_F_TSO_ECN; } if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && !(features & NETIF_F_IPV6_CSUM)) { netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); features &= ~NETIF_F_TSO6; } /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) features &= ~NETIF_F_TSO_MANGLEID; /* TSO ECN requires that TSO is present as well. */ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; /* Software GSO depends on SG. */ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); features &= ~NETIF_F_GSO; } /* GSO partial features require GSO partial be set */ if ((features & dev->gso_partial_features) && !(features & NETIF_F_GSO_PARTIAL)) { netdev_dbg(dev, "Dropping partially supported GSO features since no GSO partial.\n"); features &= ~dev->gso_partial_features; } if (!(features & NETIF_F_RXCSUM)) { /* NETIF_F_GRO_HW implies doing RXCSUM since every packet * successfully merged by hardware must also have the * checksum verified by hardware. If the user does not * want to enable RXCSUM, logically, we should disable GRO_HW. */ if (features & NETIF_F_GRO_HW) { netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); features &= ~NETIF_F_GRO_HW; } } /* LRO/HW-GRO features cannot be combined with RX-FCS */ if (features & NETIF_F_RXFCS) { if (features & NETIF_F_LRO) { netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); features &= ~NETIF_F_LRO; } if (features & NETIF_F_GRO_HW) { netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); features &= ~NETIF_F_GRO_HW; } } if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); features &= ~NETIF_F_LRO; } if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) { netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); features &= ~NETIF_F_HW_TLS_TX; } if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); features &= ~NETIF_F_HW_TLS_RX; } if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) { netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n"); features &= ~NETIF_F_GSO_UDP_L4; } return features; } int __netdev_update_features(struct net_device *dev) { struct net_device *upper, *lower; netdev_features_t features; struct list_head *iter; int err = -1; ASSERT_RTNL(); features = netdev_get_wanted_features(dev); if (dev->netdev_ops->ndo_fix_features) features = dev->netdev_ops->ndo_fix_features(dev, features); /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); /* some features can't be enabled if they're off on an upper device */ netdev_for_each_upper_dev_rcu(dev, upper, iter) features = netdev_sync_upper_features(dev, upper, features); if (dev->features == features) goto sync_lower; netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", &dev->features, &features); if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); else err = 0; if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); /* return non-0 since some features might have changed and * it's better to fire a spurious notification than miss it */ return -1; } sync_lower: /* some features must be disabled on lower devices when disabled * on an upper device (think: bonding master or bridge) */ netdev_for_each_lower_dev(dev, lower, iter) netdev_sync_lower_features(dev, lower, features); if (!err) { netdev_features_t diff = features ^ dev->features; if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { /* udp_tunnel_{get,drop}_rx_info both need * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the * device, or they won't do anything. * Thus we need to update dev->features * *before* calling udp_tunnel_get_rx_info, * but *after* calling udp_tunnel_drop_rx_info. */ if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { dev->features = features; udp_tunnel_get_rx_info(dev); } else { udp_tunnel_drop_rx_info(dev); } } if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { dev->features = features; err |= vlan_get_rx_ctag_filter_info(dev); } else { vlan_drop_rx_ctag_filter_info(dev); } } if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { if (features & NETIF_F_HW_VLAN_STAG_FILTER) { dev->features = features; err |= vlan_get_rx_stag_filter_info(dev); } else { vlan_drop_rx_stag_filter_info(dev); } } dev->features = features; } return err < 0 ? 0 : 1; } /** * netdev_update_features - recalculate device features * @dev: the device to check * * Recalculate dev->features set and send notifications if it * has changed. Should be called after driver or hardware dependent * conditions might have changed that influence the features. */ void netdev_update_features(struct net_device *dev) { if (__netdev_update_features(dev)) netdev_features_change(dev); } EXPORT_SYMBOL(netdev_update_features); /** * netdev_change_features - recalculate device features * @dev: the device to check * * Recalculate dev->features set and send notifications even * if they have not changed. Should be called instead of * netdev_update_features() if also dev->vlan_features might * have changed to allow the changes to be propagated to stacked * VLAN devices. */ void netdev_change_features(struct net_device *dev) { __netdev_update_features(dev); netdev_features_change(dev); } EXPORT_SYMBOL(netdev_change_features); /** * netif_stacked_transfer_operstate - transfer operstate * @rootdev: the root or lower level device to transfer state from * @dev: the device to transfer operstate to * * Transfer operational state from root to device. This is normally * called when a stacking relationship exists between the root * device and the device(a leaf device). */ void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev) { if (rootdev->operstate == IF_OPER_DORMANT) netif_dormant_on(dev); else netif_dormant_off(dev); if (rootdev->operstate == IF_OPER_TESTING) netif_testing_on(dev); else netif_testing_off(dev); if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else netif_carrier_off(dev); } EXPORT_SYMBOL(netif_stacked_transfer_operstate); static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); int err = 0; BUG_ON(count < 1); rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; dev->_rx = rx; for (i = 0; i < count; i++) { rx[i].dev = dev; /* XDP RX-queue setup */ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); if (err < 0) goto err_rxq_info; } return 0; err_rxq_info: /* Rollback successful reg's and free other resources */ while (i--) xdp_rxq_info_unreg(&rx[i].xdp_rxq); kvfree(dev->_rx); dev->_rx = NULL; return err; } static void netif_free_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ if (!dev->_rx) return; for (i = 0; i < count; i++) xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); kvfree(dev->_rx); } static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; #ifdef CONFIG_BQL dql_init(&queue->dql, HZ); #endif } static void netif_free_tx_queues(struct net_device *dev) { kvfree(dev->_tx); } static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; struct netdev_queue *tx; size_t sz = count * sizeof(*tx); if (count < 1 || count > 0xffff) return -EINVAL; tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); return 0; } void netif_tx_stop_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_stop_queue(txq); } } EXPORT_SYMBOL(netif_tx_stop_all_queues); static int netdev_do_alloc_pcpu_stats(struct net_device *dev) { void __percpu *v; /* Drivers implementing ndo_get_peer_dev must support tstat * accounting, so that skb_do_redirect() can bump the dev's * RX stats upon network namespace switch. */ if (dev->netdev_ops->ndo_get_peer_dev && dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) return -EOPNOTSUPP; switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return 0; case NETDEV_PCPU_STAT_LSTATS: v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); break; case NETDEV_PCPU_STAT_TSTATS: v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); break; case NETDEV_PCPU_STAT_DSTATS: v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); break; default: return -EINVAL; } return v ? 0 : -ENOMEM; } static void netdev_do_free_pcpu_stats(struct net_device *dev) { switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return; case NETDEV_PCPU_STAT_LSTATS: free_percpu(dev->lstats); break; case NETDEV_PCPU_STAT_TSTATS: free_percpu(dev->tstats); break; case NETDEV_PCPU_STAT_DSTATS: free_percpu(dev->dstats); break; } } static void netdev_free_phy_link_topology(struct net_device *dev) { struct phy_link_topology *topo = dev->link_topo; if (IS_ENABLED(CONFIG_PHYLIB) && topo) { xa_destroy(&topo->phys); kfree(topo); dev->link_topo = NULL; } } /** * register_netdevice() - register a network device * @dev: device to register * * Take a prepared network device structure and make it externally accessible. * A %NETDEV_REGISTER message is sent to the netdev notifier chain. * Callers must hold the rtnl lock - you may want register_netdev() * instead of this. */ int register_netdevice(struct net_device *dev) { int ret; struct net *net = dev_net(dev); BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < NETDEV_FEATURE_COUNT); BUG_ON(dev_boot_phase); ASSERT_RTNL(); might_sleep(); /* When net_device's are persistent, this will be fatal. */ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); ret = ethtool_check_ops(dev->ethtool_ops); if (ret) return ret; /* rss ctx ID 0 is reserved for the default context, start from 1 */ xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1); mutex_init(&dev->ethtool->rss_lock); spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; ret = -ENOMEM; dev->name_node = netdev_name_node_head_alloc(dev); if (!dev->name_node) goto out; /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); if (ret) { if (ret > 0) ret = -EIO; goto err_free_name; } } if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_CTAG_FILTER) && (!dev->netdev_ops->ndo_vlan_rx_add_vid || !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); ret = -EINVAL; goto err_uninit; } ret = netdev_do_alloc_pcpu_stats(dev); if (ret) goto err_uninit; ret = dev_index_reserve(net, dev->ifindex); if (ret < 0) goto err_free_pcpu; dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; if (dev->udp_tunnel_nic_info) { dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; } dev->wanted_features = dev->features & dev->hw_features; if (!(dev->flags & IFF_LOOPBACK)) dev->hw_features |= NETIF_F_NOCACHE_COPY; /* If IPv4 TCP segmentation offload is supported we should also * allow the device to enable segmenting the frame with the option * of ignoring a static IP ID value. This doesn't enable the * feature itself but allows the user to enable it later. */ if (dev->hw_features & NETIF_F_TSO) dev->hw_features |= NETIF_F_TSO_MANGLEID; if (dev->vlan_features & NETIF_F_TSO) dev->vlan_features |= NETIF_F_TSO_MANGLEID; if (dev->mpls_features & NETIF_F_TSO) dev->mpls_features |= NETIF_F_TSO_MANGLEID; if (dev->hw_enc_features & NETIF_F_TSO) dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. */ dev->vlan_features |= NETIF_F_HIGHDMA; /* Make NETIF_F_SG inheritable to tunnel devices. */ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; /* Make NETIF_F_SG inheritable to MPLS. */ dev->mpls_features |= NETIF_F_SG; ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) goto err_ifindex_release; ret = netdev_register_kobject(dev); netdev_lock(dev); WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); netdev_unlock(dev); if (ret) goto err_uninit_notify; __netdev_update_features(dev); /* * Default initial state at registry is that the * device is present. */ set_bit(__LINK_STATE_PRESENT, &dev->state); linkwatch_init_dev(dev); dev_init_scheduler(dev); netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); list_netdevice(dev); add_device_randomness(dev->dev_addr, dev->addr_len); /* If the device has permanent device address, driver should * set dev_addr and also addr_assign_type should be set to * NET_ADDR_PERM (default value). */ if (dev->addr_assign_type == NET_ADDR_PERM) memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); if (ret) { /* Expect explicit free_netdev() on failure */ dev->needs_free_netdev = false; unregister_netdevice_queue(dev, NULL); goto out; } /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: return ret; err_uninit_notify: call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); err_ifindex_release: dev_index_release(net, dev->ifindex); err_free_pcpu: netdev_do_free_pcpu_stats(dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (dev->priv_destructor) dev->priv_destructor(dev); err_free_name: netdev_name_node_free(dev->name_node); goto out; } EXPORT_SYMBOL(register_netdevice); /* Initialize the core of a dummy net device. * The setup steps dummy netdevs need which normal netdevs get by going * through register_netdevice(). */ static void init_dummy_netdev(struct net_device *dev) { /* make sure we BUG if trying to hit standard * register/unregister code path */ dev->reg_state = NETREG_DUMMY; /* a dummy interface is started by default */ set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); /* Note : We dont allocate pcpu_refcnt for dummy devices, * because users of this 'device' dont need to change * its refcount. */ } /** * register_netdev - register a network device * @dev: device to register * * Take a completed network device structure and add it to the kernel * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier * chain. 0 is returned on success. A negative errno code is returned * on a failure to set up the device, or if the name is a duplicate. * * This is a wrapper around register_netdevice that takes the rtnl semaphore * and expands the device name if you passed a format string to * alloc_netdev. */ int register_netdev(struct net_device *dev) { struct net *net = dev_net(dev); int err; if (rtnl_net_lock_killable(net)) return -EINTR; err = register_netdevice(dev); rtnl_net_unlock(net); return err; } EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { #ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; #else return refcount_read(&dev->dev_refcnt); #endif } EXPORT_SYMBOL(netdev_refcnt_read); int netdev_unregister_timeout_secs __read_mostly = 10; #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** * netdev_wait_allrefs_any - wait until all references are gone. * @list: list of net_devices to wait on * * This is called when unregistering network devices. * * Any protocol or device that holds a reference should register * for netdevice notification, and cleanup and put back the * reference if they receive an UNREGISTER event. * We can get stuck here if buggy protocols don't correctly * call dev_put. */ static struct net_device *netdev_wait_allrefs_any(struct list_head *list) { unsigned long rebroadcast_time, warning_time; struct net_device *dev; int wait = 0; rebroadcast_time = warning_time = jiffies; list_for_each_entry(dev, list, todo_list) if (netdev_refcnt_read(dev) == 1) return dev; while (true) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ list_for_each_entry(dev, list, todo_list) call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); list_for_each_entry(dev, list, todo_list) if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events * pending on unregister. If this * happens, we simply run the queue * unscheduled, resulting in a noop * for this device. */ linkwatch_run_queue(); break; } __rtnl_unlock(); rebroadcast_time = jiffies; } rcu_barrier(); if (!wait) { wait = WAIT_REFS_MIN_MSECS; } else { msleep(wait); wait = min(wait << 1, WAIT_REFS_MAX_MSECS); } list_for_each_entry(dev, list, todo_list) if (netdev_refcnt_read(dev) == 1) return dev; if (time_after(jiffies, warning_time + READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { list_for_each_entry(dev, list, todo_list) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, netdev_refcnt_read(dev)); ref_tracker_dir_print(&dev->refcnt_tracker, 10); } warning_time = jiffies; } } } /* The sequence is: * * rtnl_lock(); * ... * register_netdevice(x1); * register_netdevice(x2); * ... * unregister_netdevice(y1); * unregister_netdevice(y2); * ... * rtnl_unlock(); * free_netdev(y1); * free_netdev(y2); * * We are invoked by rtnl_unlock(). * This allows us to deal with problems: * 1) We can delete sysfs objects which invoke hotplug * without deadlocking with linkwatch via keventd. * 2) Since we run with the RTNL semaphore not held, we can sleep * safely in order to wait for the netdev refcnt to drop to zero. * * We must not return until all unregister events added during * the interval the lock was held have been completed. */ void netdev_run_todo(void) { struct net_device *dev, *tmp; struct list_head list; int cnt; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; list_replace_init(&net_unlink_list, &unlink_list); while (!list_empty(&unlink_list)) { dev = list_first_entry(&unlink_list, struct net_device, unlink_list); list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } #endif /* Snapshot list, allow later requests */ list_replace_init(&net_todo_list, &list); __rtnl_unlock(); /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); list_for_each_entry_safe(dev, tmp, &list, todo_list) { if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { netdev_WARN(dev, "run_todo but not unregistering\n"); list_del(&dev->todo_list); continue; } netdev_lock(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); netdev_unlock(dev); linkwatch_sync_dev(dev); } cnt = 0; while (!list_empty(&list)) { dev = netdev_wait_allrefs_any(&list); list_del(&dev->todo_list); /* paranoia */ BUG_ON(netdev_refcnt_read(dev) != 1); BUG_ON(!list_empty(&dev->ptype_all)); BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); netdev_do_free_pcpu_stats(dev); if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) free_netdev(dev); cnt++; /* Free network device */ kobject_put(&dev->dev.kobj); } if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count)) wake_up(&netdev_unregistering_wq); } /* Collate per-cpu network dstats statistics * * Read per-cpu network statistics from dev->dstats and populate the related * fields in @s. */ static void dev_fetch_dstats(struct rtnl_link_stats64 *s, const struct pcpu_dstats __percpu *dstats) { int cpu; for_each_possible_cpu(cpu) { u64 rx_packets, rx_bytes, rx_drops; u64 tx_packets, tx_bytes, tx_drops; const struct pcpu_dstats *stats; unsigned int start; stats = per_cpu_ptr(dstats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); rx_packets = u64_stats_read(&stats->rx_packets); rx_bytes = u64_stats_read(&stats->rx_bytes); rx_drops = u64_stats_read(&stats->rx_drops); tx_packets = u64_stats_read(&stats->tx_packets); tx_bytes = u64_stats_read(&stats->tx_bytes); tx_drops = u64_stats_read(&stats->tx_drops); } while (u64_stats_fetch_retry(&stats->syncp, start)); s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; s->rx_dropped += rx_drops; s->tx_packets += tx_packets; s->tx_bytes += tx_bytes; s->tx_dropped += tx_drops; } } /* ndo_get_stats64 implementation for dtstats-based accounting. * * Populate @s from dev->stats and dev->dstats. This is used internally by the * core for NETDEV_PCPU_STAT_DSTAT-type stats collection. */ static void dev_get_dstats64(const struct net_device *dev, struct rtnl_link_stats64 *s) { netdev_stats_to_stats64(s, &dev->stats); dev_fetch_dstats(s, dev->dstats); } /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has * all the same fields in the same order as net_device_stats, with only * the type differing, but rtnl_link_stats64 may have additional fields * at the end for newer counters. */ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); const atomic_long_t *src = (atomic_long_t *)netdev_stats; u64 *dst = (u64 *)stats64; BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) dst[i] = (unsigned long)atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); } EXPORT_SYMBOL(netdev_stats_to_stats64); static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc( struct net_device *dev) { struct net_device_core_stats __percpu *p; p = alloc_percpu_gfp(struct net_device_core_stats, GFP_ATOMIC | __GFP_NOWARN); if (p && cmpxchg(&dev->core_stats, NULL, p)) free_percpu(p); /* This READ_ONCE() pairs with the cmpxchg() above */ return READ_ONCE(dev->core_stats); } noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset) { /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); unsigned long __percpu *field; if (unlikely(!p)) { p = netdev_core_stats_alloc(dev); if (!p) return; } field = (unsigned long __percpu *)((void __percpu *)p + offset); this_cpu_inc(*field); } EXPORT_SYMBOL_GPL(netdev_core_stats_inc); /** * dev_get_stats - get network device statistics * @dev: device to get statistics from * @storage: place to store stats * * Get network statistics from device. Return @storage. * The device driver may provide its own method by setting * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; * otherwise the internal statistics structure is used. */ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage) { const struct net_device_ops *ops = dev->netdev_ops; const struct net_device_core_stats __percpu *p; /* * IPv{4,6} and udp tunnels share common stat helpers and use * different stat type (NETDEV_PCPU_STAT_TSTATS vs * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent. */ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) != offsetof(struct pcpu_dstats, rx_bytes)); BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) != offsetof(struct pcpu_dstats, rx_packets)); BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) != offsetof(struct pcpu_dstats, tx_bytes)); BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) != offsetof(struct pcpu_dstats, tx_packets)); if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { dev_get_tstats64(dev, storage); } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) { dev_get_dstats64(dev, storage); } else { netdev_stats_to_stats64(storage, &dev->stats); } /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ p = READ_ONCE(dev->core_stats); if (p) { const struct net_device_core_stats *core_stats; int i; for_each_possible_cpu(i) { core_stats = per_cpu_ptr(p, i); storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); } } return storage; } EXPORT_SYMBOL(dev_get_stats); /** * dev_fetch_sw_netstats - get per-cpu network device statistics * @s: place to store stats * @netstats: per-cpu network stats to read from * * Read per-cpu network statistics and populate the related fields in @s. */ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats) { int cpu; for_each_possible_cpu(cpu) { u64 rx_packets, rx_bytes, tx_packets, tx_bytes; const struct pcpu_sw_netstats *stats; unsigned int start; stats = per_cpu_ptr(netstats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); rx_packets = u64_stats_read(&stats->rx_packets); rx_bytes = u64_stats_read(&stats->rx_bytes); tx_packets = u64_stats_read(&stats->tx_packets); tx_bytes = u64_stats_read(&stats->tx_bytes); } while (u64_stats_fetch_retry(&stats->syncp, start)); s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; s->tx_packets += tx_packets; s->tx_bytes += tx_bytes; } } EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); /** * dev_get_tstats64 - ndo_get_stats64 implementation * @dev: device to get statistics from * @s: place to store stats * * Populate @s from dev->stats and dev->tstats. Can be used as * ndo_get_stats64() callback. */ void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) { netdev_stats_to_stats64(s, &dev->stats); dev_fetch_sw_netstats(s, dev->tstats); } EXPORT_SYMBOL_GPL(dev_get_tstats64); struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); #ifdef CONFIG_NET_CLS_ACT if (queue) return queue; queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc); rcu_assign_pointer(dev->ingress_queue, queue); #endif return queue; } static const struct ethtool_ops default_ethtool_ops; void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops) { if (dev->ethtool_ops == &default_ethtool_ops) dev->ethtool_ops = ops; } EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); /** * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default * @dev: netdev to enable the IRQ coalescing on * * Sets a conservative default for SW IRQ coalescing. Users can use * sysfs attributes to override the default values. */ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) { WARN_ON(dev->reg_state == NETREG_REGISTERED); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { netdev_set_gro_flush_timeout(dev, 20000); netdev_set_defer_hard_irqs(dev, 1); } } EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @name_assign_type: origin of device name * @setup: callback to initialize device * @txqs: the number of TX subqueues to allocate * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { struct net_device *dev; size_t napi_config_sz; unsigned int maxqs; BUG_ON(strlen(name) >= sizeof(dev->name)); if (txqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); return NULL; } if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } maxqs = max(txqs, rxqs); dev = kvzalloc(struct_size(dev, priv, sizeof_priv), GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!dev) return NULL; dev->priv_len = sizeof_priv; ref_tracker_dir_init(&dev->refcnt_tracker, 128, name); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; __dev_hold(dev); #else refcount_set(&dev->dev_refcnt, 1); #endif if (dev_addr_init(dev)) goto free_pcpu; dev_mc_init(dev); dev_uc_init(dev); dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP dev->nested_level = 0; INIT_LIST_HEAD(&dev->unlink_list); #endif INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->adj_list.upper); INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); INIT_LIST_HEAD(&dev->net_notifier_list); #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif mutex_init(&dev->lock); dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); if (!dev->tx_queue_len) { dev->priv_flags |= IFF_NO_QUEUE; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; } dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) goto free_all; dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT); if (!dev->ethtool) goto free_all; dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); if (!dev->cfg) goto free_all; dev->cfg_pending = dev->cfg; napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); if (!dev->napi_config) goto free_all; strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; nf_hook_netdev_init(dev); return dev; free_all: free_netdev(dev); return NULL; free_pcpu: #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); free_dev: #endif kvfree(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); static void netdev_napi_exit(struct net_device *dev) { if (!list_empty(&dev->napi_list)) { struct napi_struct *p, *n; netdev_lock(dev); list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) __netif_napi_del_locked(p); netdev_unlock(dev); synchronize_net(); } kvfree(dev->napi_config); } /** * free_netdev - free network device * @dev: device * * This function does the last stage of destroying an allocated device * interface. The reference to the device object is released. If this * is the last reference then it will be freed.Must be called in process * context. */ void free_netdev(struct net_device *dev) { might_sleep(); /* When called immediately after register_netdevice() failed the unwind * handling may still be dismantling the device. Handle that case by * deferring the free. */ if (dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); dev->needs_free_netdev = true; return; } WARN_ON(dev->cfg != dev->cfg_pending); kfree(dev->cfg); kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); /* Flush device addresses */ dev_addr_flush(dev); netdev_napi_exit(dev); netif_del_cpu_rmap(dev); ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; #endif free_percpu(dev->core_stats); dev->core_stats = NULL; free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; netdev_free_phy_link_topology(dev); mutex_destroy(&dev->lock); /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED || dev->reg_state == NETREG_DUMMY) { kvfree(dev); return; } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); WRITE_ONCE(dev->reg_state, NETREG_RELEASED); /* will free via device release */ put_device(&dev->dev); } EXPORT_SYMBOL(free_netdev); /** * alloc_netdev_dummy - Allocate and initialize a dummy net device. * @sizeof_priv: size of private data to allocate space for * * Return: the allocated net_device on success, NULL otherwise */ struct net_device *alloc_netdev_dummy(int sizeof_priv) { return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, init_dummy_netdev); } EXPORT_SYMBOL_GPL(alloc_netdev_dummy); /** * synchronize_net - Synchronize with packet receive processing * * Wait for packets currently being received to be done. * Does not block later packets from starting. */ void synchronize_net(void) { might_sleep(); if (from_cleanup_net() || rtnl_is_locked()) synchronize_rcu_expedited(); else synchronize_rcu(); } EXPORT_SYMBOL(synchronize_net); static void netdev_rss_contexts_free(struct net_device *dev) { struct ethtool_rxfh_context *ctx; unsigned long context; mutex_lock(&dev->ethtool->rss_lock); xa_for_each(&dev->ethtool->rss_ctx, context, ctx) { struct ethtool_rxfh_param rxfh; rxfh.indir = ethtool_rxfh_context_indir(ctx); rxfh.key = ethtool_rxfh_context_key(ctx); rxfh.hfunc = ctx->hfunc; rxfh.input_xfrm = ctx->input_xfrm; rxfh.rss_context = context; rxfh.rss_delete = true; xa_erase(&dev->ethtool->rss_ctx, context); if (dev->ethtool_ops->create_rxfh_context) dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL); else dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL); kfree(ctx); } xa_destroy(&dev->ethtool->rss_ctx); mutex_unlock(&dev->ethtool->rss_lock); } /** * unregister_netdevice_queue - remove device from the kernel * @dev: device * @head: list * * This function shuts down a device interface and removes it * from the kernel tables. * If head not NULL, device is queued to be unregistered later. * * Callers must hold the rtnl semaphore. You may want * unregister_netdev() instead of this. */ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) { ASSERT_RTNL(); if (head) { list_move_tail(&dev->unreg_list, head); } else { LIST_HEAD(single); list_add(&dev->unreg_list, &single); unregister_netdevice_many(&single); } } EXPORT_SYMBOL(unregister_netdevice_queue); static void dev_memory_provider_uninstall(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->real_num_rx_queues; i++) { struct netdev_rx_queue *rxq = &dev->_rx[i]; struct pp_memory_provider_params *p = &rxq->mp_params; if (p->mp_ops && p->mp_ops->uninstall) p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); } } void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { struct net_device *dev, *tmp; LIST_HEAD(close_head); int cnt = 0; BUG_ON(dev_boot_phase); ASSERT_RTNL(); if (list_empty(head)) return; list_for_each_entry_safe(dev, tmp, head, unreg_list) { /* Some devices call without registering * for initialization unwind. Remove those * devices and proceed with the remaining. */ if (dev->reg_state == NETREG_UNINITIALIZED) { pr_debug("unregister_netdevice: device %s/%p never was registered\n", dev->name, dev); WARN_ON(1); list_del(&dev->unreg_list); continue; } dev->dismantle = true; BUG_ON(dev->reg_state != NETREG_REGISTERED); } /* If device is running, close it first. */ list_for_each_entry(dev, head, unreg_list) list_add_tail(&dev->close_list, &close_head); dev_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ unlist_netdevice(dev); netdev_lock(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); netdev_unlock(dev); } flush_all_backlogs(); synchronize_net(); list_for_each_entry(dev, head, unreg_list) { struct sk_buff *skb = NULL; /* Shutdown queueing discipline. */ dev_shutdown(dev); dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); dev_memory_provider_uninstall(dev); netdev_offload_xstats_disable_all(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, portid, nlh); /* * Flush the unicast and multicast chains */ dev_uc_flush(dev); dev_mc_flush(dev); netdev_name_node_alt_flush(dev); netdev_name_node_free(dev->name_node); netdev_rss_contexts_free(dev); call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); mutex_destroy(&dev->ethtool->rss_lock); net_shaper_flush_netdev(dev); if (skb) rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); WARN_ON(netdev_has_any_lower_dev(dev)); /* Remove entries from kobject tree */ netdev_unregister_kobject(dev); #ifdef CONFIG_XPS /* Remove XPS queueing entries */ netif_reset_xps_queues_gt(dev, 0); #endif } synchronize_net(); list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); net_set_todo(dev); cnt++; } atomic_add(cnt, &dev_unreg_count); list_del(head); } /** * unregister_netdevice_many - unregister many devices * @head: list of devices * * Note: As most callers use a stack allocated list_head, * we force a list_del() to make sure stack won't be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { unregister_netdevice_many_notify(head, 0, NULL); } EXPORT_SYMBOL(unregister_netdevice_many); /** * unregister_netdev - remove device from the kernel * @dev: device * * This function shuts down a device interface and removes it * from the kernel tables. * * This is just a wrapper for unregister_netdevice that takes * the rtnl semaphore. In general you want to use this and not * unregister_netdevice. */ void unregister_netdev(struct net_device *dev) { rtnl_net_dev_lock(dev); unregister_netdevice(dev); rtnl_net_dev_unlock(dev); } EXPORT_SYMBOL(unregister_netdev); /** * __dev_change_net_namespace - move device to different nethost namespace * @dev: device * @net: network namespace * @pat: If not NULL name pattern to try if the current device name * is already taken in the destination network namespace. * @new_ifindex: If not zero, specifies device index in the target * namespace. * * This function shuts down a device interface and moves it * to a new network namespace. On success 0 is returned, on * a failure a netagive errno code is returned. * * Callers must hold the rtnl semaphore. */ int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, int new_ifindex) { struct netdev_name_node *name_node; struct net *net_old = dev_net(dev); char new_name[IFNAMSIZ] = {}; int err, new_nsid; ASSERT_RTNL(); /* Don't allow namespace local devices to be moved. */ err = -EINVAL; if (dev->netns_local) goto out; /* Ensure the device has been registered */ if (dev->reg_state != NETREG_REGISTERED) goto out; /* Get out if there is nothing todo */ err = 0; if (net_eq(net_old, net)) goto out; /* Pick the destination device name, and ensure * we can use it in the destination network namespace. */ err = -EEXIST; if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) goto out; err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); if (err < 0) goto out; } /* Check that none of the altnames conflicts. */ err = -EEXIST; netdev_for_each_altname(dev, name_node) if (netdev_name_in_use(net, name_node->name)) goto out; /* Check that new_ifindex isn't used yet. */ if (new_ifindex) { err = dev_index_reserve(net, new_ifindex); if (err < 0) goto out; } else { /* If there is an ifindex conflict assign a new one */ err = dev_index_reserve(net, dev->ifindex); if (err == -EBUSY) err = dev_index_reserve(net, 0); if (err < 0) goto out; new_ifindex = err; } /* * And now a mini version of register_netdevice unregister_netdevice. */ /* If device is running close it first. */ dev_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); synchronize_net(); /* Shutdown queueing discipline. */ dev_shutdown(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. * * Note that dev->reg_state stays at NETREG_REGISTERED. * This is wanted because this way 8021q and macvlan know * the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); /* * Flush the unicast and multicast chains */ dev_uc_flush(dev); dev_mc_flush(dev); /* Send a netdev-removed uevent to the old namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); netdev_adjacent_del_links(dev); /* Move per-net netdevice notifiers that are following the netdevice */ move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ dev_net_set(dev, net); dev->ifindex = new_ifindex; if (new_name[0]) { /* Rename the netdev to prepared name */ write_seqlock_bh(&netdev_rename_lock); strscpy(dev->name, new_name, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); } /* Fixup kobjects */ dev_set_uevent_suppress(&dev->dev, 1); err = device_rename(&dev->dev, dev->name); dev_set_uevent_suppress(&dev->dev, 0); WARN_ON(err); /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); netdev_adjacent_add_links(dev); /* Adapt owner in case owning user namespace of target network * namespace is different from the original one. */ err = netdev_change_owner(dev, net_old, net); WARN_ON(err); /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); synchronize_net(); err = 0; out: return err; } EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { struct sk_buff **list_skb; struct sk_buff *skb; unsigned int cpu; struct softnet_data *sd, *oldsd, *remsd = NULL; local_irq_disable(); cpu = smp_processor_id(); sd = &per_cpu(softnet_data, cpu); oldsd = &per_cpu(softnet_data, oldcpu); /* Find end of our completion_queue. */ list_skb = &sd->completion_queue; while (*list_skb) list_skb = &(*list_skb)->next; /* Append completion queue from offline CPU. */ *list_skb = oldsd->completion_queue; oldsd->completion_queue = NULL; /* Append output queue from offline CPU. */ if (oldsd->output_queue) { *sd->output_queue_tailp = oldsd->output_queue; sd->output_queue_tailp = oldsd->output_queue_tailp; oldsd->output_queue = NULL; oldsd->output_queue_tailp = &oldsd->output_queue; } /* Append NAPI poll list from offline CPU, with one exception : * process_backlog() must be called by cpu owning percpu backlog. * We properly handle process_queue & input_pkt_queue later. */ while (!list_empty(&oldsd->poll_list)) { struct napi_struct *napi = list_first_entry(&oldsd->poll_list, struct napi_struct, poll_list); list_del_init(&napi->poll_list); if (napi->poll == process_backlog) napi->state &= NAPIF_STATE_THREADED; else ____napi_schedule(sd, napi); } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); if (!use_backlog_threads()) { #ifdef CONFIG_RPS remsd = oldsd->rps_ipi_list; oldsd->rps_ipi_list = NULL; #endif /* send out pending IPI's on offline CPU */ net_rps_send_ipi(remsd); } /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); rps_input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); rps_input_queue_head_incr(oldsd); } return 0; } /** * netdev_increment_features - increment feature set by one * @all: current feature set * @one: new feature set * @mask: mask feature set * * Computes a new feature set after adding a device with feature set * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set. */ netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { if (mask & NETIF_F_HW_CSUM) mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ if (all & NETIF_F_HW_CSUM) all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); return all; } EXPORT_SYMBOL(netdev_increment_features); static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); return hash; } /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask)); INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) goto err_name; net->dev_index_head = netdev_create_hash(); if (net->dev_index_head == NULL) goto err_idx; xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1); RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); return 0; err_idx: kfree(net->dev_name_head); err_name: return -ENOMEM; } /** * netdev_drivername - network driver for the device * @dev: network device * * Determine network driver for device. */ const char *netdev_drivername(const struct net_device *dev) { const struct device_driver *driver; const struct device *parent; const char *empty = ""; parent = dev->dev.parent; if (!parent) return empty; driver = parent->driver; if (driver && driver->name) return driver->name; return empty; } static void __netdev_printk(const char *level, const struct net_device *dev, struct va_format *vaf) { if (dev && dev->dev.parent) { dev_printk_emit(level[1] - '0', dev->dev.parent, "%s %s %s%s: %pV", dev_driver_string(dev->dev.parent), dev_name(dev->dev.parent), netdev_name(dev), netdev_reg_state(dev), vaf); } else if (dev) { printk("%s%s%s: %pV", level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { printk("%s(NULL net_device): %pV", level, vaf); } } void netdev_printk(const char *level, const struct net_device *dev, const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; __netdev_printk(level, dev, &vaf); va_end(args); } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ void func(const struct net_device *dev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_netdev_printk_level(netdev_emerg, KERN_EMERG); define_netdev_printk_level(netdev_alert, KERN_ALERT); define_netdev_printk_level(netdev_crit, KERN_CRIT); define_netdev_printk_level(netdev_err, KERN_ERR); define_netdev_printk_level(netdev_warn, KERN_WARNING); define_netdev_printk_level(netdev_notice, KERN_NOTICE); define_netdev_printk_level(netdev_info, KERN_INFO); static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); xa_destroy(&net->dev_by_index); if (net != &init_net) WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } static struct pernet_operations __net_initdata netdev_net_ops = { .init = netdev_init, .exit = netdev_exit, }; static void __net_exit default_device_exit_net(struct net *net) { struct netdev_name_node *name_node, *tmp; struct net_device *dev, *aux; /* * Push all migratable network devices back to the * initial network namespace */ ASSERT_RTNL(); for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ if (dev->netns_local) continue; /* Leave virtual devices for the generic cleanup */ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) continue; /* Push remaining network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (netdev_name_in_use(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); netdev_for_each_altname_safe(dev, name_node, tmp) if (netdev_name_in_use(&init_net, name_node->name)) __netdev_name_node_alt_destroy(name_node); err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); BUG(); } } } static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network * namespace. Do this in the reverse order of registration. * Do this across as many network namespaces as possible to * improve batching efficiency. */ struct net_device *dev; struct net *net; LIST_HEAD(dev_kill_list); rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { default_device_exit_net(net); cond_resched(); } list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); } } unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); } static struct pernet_operations __net_initdata default_device_ops = { .exit_batch = default_device_exit_batch, }; static void __init net_dev_struct_check(void) { /* TX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq); #ifdef CONFIG_XPS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps); #endif #ifdef CONFIG_NETFILTER_EGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress); #endif #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress); #endif CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160); /* TXRX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46); /* RX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net); #ifdef CONFIG_NETPOLL CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo); #endif #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); #endif CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92); } /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not * present) and leaves us with a valid list of present and active devices. * */ /* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */ #define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE) static int net_page_pool_create(int cpuid) { #if IS_ENABLED(CONFIG_PAGE_POOL) struct page_pool_params page_pool_params = { .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, .flags = PP_FLAG_SYSTEM_POOL, .nid = cpu_to_mem(cpuid), }; struct page_pool *pp_ptr; int err; pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); if (IS_ERR(pp_ptr)) return -ENOMEM; err = xdp_reg_page_pool(pp_ptr); if (err) { page_pool_destroy(pp_ptr); return err; } per_cpu(system_page_pool, cpuid) = pp_ptr; #endif return 0; } static int backlog_napi_should_run(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); struct napi_struct *napi = &sd->backlog; return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state); } static void run_backlog_napi(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); napi_threaded_poll_loop(&sd->backlog); } static void backlog_napi_setup(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); struct napi_struct *napi = &sd->backlog; napi->thread = this_cpu_read(backlog_napi); set_bit(NAPI_STATE_THREADED, &napi->state); } static struct smp_hotplug_thread backlog_threads = { .store = &backlog_napi, .thread_should_run = backlog_napi_should_run, .thread_fn = run_backlog_napi, .thread_comm = "backlog_napi/%u", .setup = backlog_napi_setup, }; /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. */ static int __init net_dev_init(void) { int i, rc = -ENOMEM; BUG_ON(!dev_boot_phase); net_dev_struct_check(); if (dev_proc_init()) goto out; if (netdev_kobject_init()) goto out; for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); if (register_pernet_subsys(&netdev_net_ops)) goto out; /* * Initialise the packet receive queues. */ flush_backlogs_fallback = flush_backlogs_alloc(); if (!flush_backlogs_fallback) goto out; for_each_possible_cpu(i) { struct softnet_data *sd = &per_cpu(softnet_data, i); skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); #ifdef CONFIG_XFRM_OFFLOAD skb_queue_head_init(&sd->xfrm_backlog); #endif INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); spin_lock_init(&sd->defer_lock); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; INIT_LIST_HEAD(&sd->backlog.poll_list); if (net_page_pool_create(i)) goto out; } if (use_backlog_threads()) smpboot_register_percpu_thread(&backlog_threads); dev_boot_phase = 0; /* The loopback device is special if any other network devices * is present in a network namespace the loopback device must * be present. Since we now dynamically allocate and free the * loopback device ensure this invariant is maintained by * keeping the loopback device as the first device on the * list of network devices. Ensuring the loopback devices * is the first device that appears and the last network device * that disappears. */ if (register_pernet_device(&loopback_net_ops)) goto out; if (register_pernet_device(&default_device_ops)) goto out; open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", NULL, dev_cpu_dead); WARN_ON(rc < 0); rc = 0; /* avoid static key IPIs to isolated CPUs */ if (housekeeping_enabled(HK_TYPE_MISC)) net_enable_timestamp(); out: if (rc < 0) { for_each_possible_cpu(i) { struct page_pool *pp_ptr; pp_ptr = per_cpu(system_page_pool, i); if (!pp_ptr) continue; xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); per_cpu(system_page_pool, i) = NULL; } } return rc; } subsys_initcall(net_dev_init); |
158 158 159 286 284 159 188 188 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 | // SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which manage clock event devices. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/clockchips.h> #include <linux/hrtimer.h> #include <linux/init.h> #include <linux/module.h> #include <linux/smp.h> #include <linux/device.h> #include "tick-internal.h" /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); /* Protection for unbind operations */ static DEFINE_MUTEX(clockevents_mutex); struct ce_unbind { struct clock_event_device *ce; int res; }; static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, bool ismax) { u64 clc = (u64) latch << evt->shift; u64 rnd; if (WARN_ON(!evt->mult)) evt->mult = 1; rnd = (u64) evt->mult - 1; /* * Upper bound sanity check. If the backwards conversion is * not equal latch, we know that the above shift overflowed. */ if ((clc >> evt->shift) != (u64)latch) clc = ~0ULL; /* * Scaled math oddities: * * For mult <= (1 << shift) we can safely add mult - 1 to * prevent integer rounding loss. So the backwards conversion * from nsec to device ticks will be correct. * * For mult > (1 << shift), i.e. device frequency is > 1GHz we * need to be careful. Adding mult - 1 will result in a value * which when converted back to device ticks can be larger * than latch by up to (mult - 1) >> shift. For the min_delta * calculation we still want to apply this in order to stay * above the minimum device ticks limit. For the upper limit * we would end up with a latch value larger than the upper * limit of the device, so we omit the add to stay below the * device upper boundary. * * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); /* Deltas less than 1usec are pointless noise */ return clc > 1000 ? clc : 1000; } /** * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds * @latch: value to convert * @evt: pointer to clock event device descriptor * * Math helper, returns latch value converted to nanoseconds (bound checked) */ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) { return cev_delta2ns(latch, evt, false); } EXPORT_SYMBOL_GPL(clockevent_delta2ns); static int __clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; /* Transition with new state-specific callbacks */ switch (state) { case CLOCK_EVT_STATE_DETACHED: /* The clockevent device is getting replaced. Shut it down. */ case CLOCK_EVT_STATE_SHUTDOWN: if (dev->set_state_shutdown) return dev->set_state_shutdown(dev); return 0; case CLOCK_EVT_STATE_PERIODIC: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) return -ENOSYS; if (dev->set_state_periodic) return dev->set_state_periodic(dev); return 0; case CLOCK_EVT_STATE_ONESHOT: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return -ENOSYS; if (dev->set_state_oneshot) return dev->set_state_oneshot(dev); return 0; case CLOCK_EVT_STATE_ONESHOT_STOPPED: /* Core internal bug */ if (WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev))) return -EINVAL; if (dev->set_state_oneshot_stopped) return dev->set_state_oneshot_stopped(dev); else return -ENOSYS; default: return -ENOSYS; } } /** * clockevents_switch_state - set the operating state of a clock event device * @dev: device to modify * @state: new state * * Must be called with interrupts disabled ! */ void clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { if (clockevent_get_state(dev) != state) { if (__clockevents_switch_state(dev, state)) return; clockevent_set_state(dev, state); /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ if (clockevent_state_oneshot(dev)) { if (WARN_ON(!dev->mult)) dev->mult = 1; } } } /** * clockevents_shutdown - shutdown the device and clear next_event * @dev: device to shutdown */ void clockevents_shutdown(struct clock_event_device *dev) { clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event = KTIME_MAX; } /** * clockevents_tick_resume - Resume the tick device before using it again * @dev: device to resume */ int clockevents_tick_resume(struct clock_event_device *dev) { int ret = 0; if (dev->tick_resume) ret = dev->tick_resume(dev); return ret; } #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST /* Limit min_delta to a jiffy */ #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) /** * clockevents_increase_min_delta - raise minimum delta of a clock event device * @dev: device to increase the minimum delta * * Returns 0 on success, -ETIME when the minimum delta reached the limit. */ static int clockevents_increase_min_delta(struct clock_event_device *dev) { /* Nothing to do if we already reached the limit */ if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { printk_deferred(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); dev->next_event = KTIME_MAX; return -ETIME; } if (dev->min_delta_ns < 5000) dev->min_delta_ns = 5000; else dev->min_delta_ns += dev->min_delta_ns >> 1; if (dev->min_delta_ns > MIN_DELTA_LIMIT) dev->min_delta_ns = MIN_DELTA_LIMIT; printk_deferred(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", dev->name ? dev->name : "?", (unsigned long long) dev->min_delta_ns); return 0; } /** * clockevents_program_min_delta - Set clock event device to the minimum delay. * @dev: device to program * * Returns 0 on success, -ETIME when the retry loop failed. */ static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; int64_t delta; int i; for (i = 0;;) { delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); if (clockevent_state_shutdown(dev)) return 0; dev->retries++; clc = ((unsigned long long) delta * dev->mult) >> dev->shift; if (dev->set_next_event((unsigned long) clc, dev) == 0) return 0; if (++i > 2) { /* * We tried 3 times to program the device with the * given min_delta_ns. Try to increase the minimum * delta, if that fails as well get out of here. */ if (clockevents_increase_min_delta(dev)) return -ETIME; i = 0; } } } #else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ /** * clockevents_program_min_delta - Set clock event device to the minimum delay. * @dev: device to program * * Returns 0 on success, -ETIME when the retry loop failed. */ static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; int64_t delta = 0; int i; for (i = 0; i < 10; i++) { delta += dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); if (clockevent_state_shutdown(dev)) return 0; dev->retries++; clc = ((unsigned long long) delta * dev->mult) >> dev->shift; if (dev->set_next_event((unsigned long) clc, dev) == 0) return 0; } return -ETIME; } #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ /** * clockevents_program_event - Reprogram the clock event device. * @dev: device to program * @expires: absolute expiry time (monotonic clock) * @force: program minimum delay if expires can not be set * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force) { unsigned long long clc; int64_t delta; int rc; if (WARN_ON_ONCE(expires < 0)) return -ETIME; dev->next_event = expires; if (clockevent_state_shutdown(dev)) return 0; /* We must be in ONESHOT state here */ WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev)); /* Shortcut for clockevent devices that can deal with ktime. */ if (dev->features & CLOCK_EVT_FEAT_KTIME) return dev->set_next_ktime(expires, dev); delta = ktime_to_ns(ktime_sub(expires, ktime_get())); if (delta <= 0) return force ? clockevents_program_min_delta(dev) : -ETIME; delta = min(delta, (int64_t) dev->max_delta_ns); delta = max(delta, (int64_t) dev->min_delta_ns); clc = ((unsigned long long) delta * dev->mult) >> dev->shift; rc = dev->set_next_event((unsigned long) clc, dev); return (rc && force) ? clockevents_program_min_delta(dev) : rc; } /* * Called after a clockevent has been added which might * have replaced a current regular or broadcast device. A * released normal device might be a suitable replacement * for the current broadcast device. Similarly a released * broadcast device might be a suitable replacement for a * normal device. */ static void clockevents_notify_released(void) { struct clock_event_device *dev; /* * Keep iterating as long as tick_check_new_device() * replaces a device. */ while (!list_empty(&clockevents_released)) { dev = list_entry(clockevents_released.next, struct clock_event_device, list); list_move(&dev->list, &clockevent_devices); tick_check_new_device(dev); } } /* * Try to install a replacement clock event device */ static int clockevents_replace(struct clock_event_device *ced) { struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { if (dev == ced || !clockevent_state_detached(dev)) continue; if (!tick_check_replacement(newdev, dev)) continue; if (!try_module_get(dev->owner)) continue; if (newdev) module_put(newdev->owner); newdev = dev; } if (newdev) { tick_install_replacement(newdev); list_del_init(&ced->list); } return newdev ? 0 : -EBUSY; } /* * Called with clockevents_mutex and clockevents_lock held */ static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. Device is unused */ if (clockevent_state_detached(ced)) { list_del_init(&ced->list); return 0; } return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; } /* * SMP function call to unbind a device */ static void __clockevents_unbind(void *arg) { struct ce_unbind *cu = arg; int res; raw_spin_lock(&clockevents_lock); res = __clockevents_try_unbind(cu->ce, smp_processor_id()); if (res == -EAGAIN) res = clockevents_replace(cu->ce); cu->res = res; raw_spin_unlock(&clockevents_lock); } /* * Issues smp function call to unbind a per cpu device. Called with * clockevents_mutex held. */ static int clockevents_unbind(struct clock_event_device *ced, int cpu) { struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); return cu.res; } /* * Unbind a clockevents device. */ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) { int ret; mutex_lock(&clockevents_mutex); ret = clockevents_unbind(ced, cpu); mutex_unlock(&clockevents_mutex); return ret; } EXPORT_SYMBOL_GPL(clockevents_unbind_device); /** * clockevents_register_device - register a clock event device * @dev: device to register */ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; /* Initialize state to DETACHED */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); } if (dev->cpumask == cpu_all_mask) { WARN(1, "%s cpumask == cpu_all_mask, using cpu_possible_mask instead\n", dev->name); dev->cpumask = cpu_possible_mask; } raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); static void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return; /* * Calculate the maximum number of seconds we can sleep. Limit * to 10 minutes for hardware which can program more than * 32bit ticks so we still get reasonable conversion values. */ sec = dev->max_delta_ticks; do_div(sec, freq); if (!sec) sec = 1; else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) sec = 600; clockevents_calc_mult_shift(dev, freq, sec); dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); } /** * clockevents_config_and_register - Configure and register a clock event device * @dev: device to register * @freq: The clock frequency * @min_delta: The minimum clock ticks to program in oneshot mode * @max_delta: The maximum clock ticks to program in oneshot mode * * min/max_delta can be 0 for devices which do not support oneshot mode. */ void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta) { dev->min_delta_ticks = min_delta; dev->max_delta_ticks = max_delta; clockevents_config(dev, freq); clockevents_register_device(dev); } EXPORT_SYMBOL_GPL(clockevents_config_and_register); int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); if (clockevent_state_oneshot(dev)) return clockevents_program_event(dev, dev->next_event, false); if (clockevent_state_periodic(dev)) return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } /** * clockevents_update_freq - Update frequency and reprogram a clock event device. * @dev: device to modify * @freq: new device frequency * * Reconfigure and reprogram a clock event device in oneshot * mode. Must be called on the cpu for which the device delivers per * cpu timer events. If called for the broadcast device the core takes * care of serialization. * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) { unsigned long flags; int ret; local_irq_save(flags); ret = tick_broadcast_update_freq(dev, freq); if (ret == -ENODEV) ret = __clockevents_update_freq(dev, freq); local_irq_restore(flags); return ret; } /* * Noop handler when we shut down an event device */ void clockevents_handle_noop(struct clock_event_device *dev) { } /** * clockevents_exchange_device - release and request clock devices * @old: device to release (can be NULL) * @new: device to request (can be NULL) * * Called from various tick functions with clockevents_lock held and * interrupts disabled. */ void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new) { /* * Caller releases a clock event device. We queue it into the * released list and do a notify add later. */ if (old) { module_put(old->owner); clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); list_move(&old->list, &clockevents_released); } if (new) { BUG_ON(!clockevent_state_detached(new)); clockevents_shutdown(new); } } /** * clockevents_suspend - suspend clock devices */ void clockevents_suspend(void) { struct clock_event_device *dev; list_for_each_entry_reverse(dev, &clockevent_devices, list) if (dev->suspend && !clockevent_state_detached(dev)) dev->suspend(dev); } /** * clockevents_resume - resume clock devices */ void clockevents_resume(void) { struct clock_event_device *dev; list_for_each_entry(dev, &clockevent_devices, list) if (dev->resume && !clockevent_state_detached(dev)) dev->resume(dev); } #ifdef CONFIG_HOTPLUG_CPU /** * tick_offline_cpu - Shutdown all clock events related * to this CPU and take it out of the * broadcast mechanism. * @cpu: The outgoing CPU * * Called by the dying CPU during teardown. */ void tick_offline_cpu(unsigned int cpu) { struct clock_event_device *dev, *tmp; raw_spin_lock(&clockevents_lock); tick_broadcast_offline(cpu); tick_shutdown(cpu); /* * Unregister the clock event devices which were * released above. */ list_for_each_entry_safe(dev, tmp, &clockevents_released, list) list_del(&dev->list); /* * Now check whether the CPU has left unused per cpu devices */ list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { if (cpumask_test_cpu(cpu, dev->cpumask) && cpumask_weight(dev->cpumask) == 1 && !tick_is_broadcast_device(dev)) { BUG_ON(!clockevent_state_detached(dev)); list_del(&dev->list); } } raw_spin_unlock(&clockevents_lock); } #endif #ifdef CONFIG_SYSFS static const struct bus_type clockevents_subsys = { .name = "clockevents", .dev_name = "clockevent", }; static DEFINE_PER_CPU(struct device, tick_percpu_dev); static struct tick_device *tick_get_tick_dev(struct device *dev); static ssize_t current_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tick_device *td; ssize_t count = 0; raw_spin_lock_irq(&clockevents_lock); td = tick_get_tick_dev(dev); if (td && td->evtdev) count = sysfs_emit(buf, "%s\n", td->evtdev->name); raw_spin_unlock_irq(&clockevents_lock); return count; } static DEVICE_ATTR_RO(current_device); /* We don't support the abomination of removable broadcast devices */ static ssize_t unbind_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { char name[CS_NAME_LEN]; ssize_t ret = sysfs_get_uname(buf, name, count); struct clock_event_device *ce = NULL, *iter; if (ret < 0) return ret; ret = -ENODEV; mutex_lock(&clockevents_mutex); raw_spin_lock_irq(&clockevents_lock); list_for_each_entry(iter, &clockevent_devices, list) { if (!strcmp(iter->name, name)) { ret = __clockevents_try_unbind(iter, dev->id); ce = iter; break; } } raw_spin_unlock_irq(&clockevents_lock); /* * We hold clockevents_mutex, so ce can't go away */ if (ret == -EAGAIN) ret = clockevents_unbind(ce, dev->id); mutex_unlock(&clockevents_mutex); return ret ? ret : count; } static DEVICE_ATTR_WO(unbind_device); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static struct device tick_bc_dev = { .init_name = "broadcast", .id = 0, .bus = &clockevents_subsys, }; static struct tick_device *tick_get_tick_dev(struct device *dev) { return dev == &tick_bc_dev ? tick_get_broadcast_device() : &per_cpu(tick_cpu_device, dev->id); } static __init int tick_broadcast_init_sysfs(void) { int err = device_register(&tick_bc_dev); if (!err) err = device_create_file(&tick_bc_dev, &dev_attr_current_device); return err; } #else static struct tick_device *tick_get_tick_dev(struct device *dev) { return &per_cpu(tick_cpu_device, dev->id); } static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif static int __init tick_init_sysfs(void) { int cpu; for_each_possible_cpu(cpu) { struct device *dev = &per_cpu(tick_percpu_dev, cpu); int err; dev->id = cpu; dev->bus = &clockevents_subsys; err = device_register(dev); if (!err) err = device_create_file(dev, &dev_attr_current_device); if (!err) err = device_create_file(dev, &dev_attr_unbind_device); if (err) return err; } return tick_broadcast_init_sysfs(); } static int __init clockevents_init_sysfs(void) { int err = subsys_system_register(&clockevents_subsys, NULL); if (!err) err = tick_init_sysfs(); return err; } device_initcall(clockevents_init_sysfs); #endif /* SYSFS */ |
1738 1738 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 | // SPDX-License-Identifier: GPL-2.0-only /* * ACPI device specific properties support. * * Copyright (C) 2014 - 2023, Intel Corporation * All rights reserved. * * Authors: Mika Westerberg <mika.westerberg@linux.intel.com> * Darren Hart <dvhart@linux.intel.com> * Rafael J. Wysocki <rafael.j.wysocki@intel.com> * Sakari Ailus <sakari.ailus@linux.intel.com> */ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/acpi.h> #include <linux/device.h> #include <linux/export.h> #include "internal.h" static int acpi_data_get_property_array(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj); /* * The GUIDs here are made equivalent to each other in order to avoid extra * complexity in the properties handling code, with the caveat that the * kernel will accept certain combinations of GUID and properties that are * not defined without a warning. For instance if any of the properties * from different GUID appear in a property list of another, it will be * accepted by the kernel. Firmware validation tools should catch these. * * References: * * [1] UEFI DSD Guide. * https://github.com/UEFI/DSD-Guide/blob/main/src/dsd-guide.adoc */ static const guid_t prp_guids[] = { /* ACPI _DSD device properties GUID [1]: daffd814-6eba-4d8c-8a91-bc9bbf4aa301 */ GUID_INIT(0xdaffd814, 0x6eba, 0x4d8c, 0x8a, 0x91, 0xbc, 0x9b, 0xbf, 0x4a, 0xa3, 0x01), /* Hotplug in D3 GUID: 6211e2c0-58a3-4af3-90e1-927a4e0c55a4 */ GUID_INIT(0x6211e2c0, 0x58a3, 0x4af3, 0x90, 0xe1, 0x92, 0x7a, 0x4e, 0x0c, 0x55, 0xa4), /* External facing port GUID: efcc06cc-73ac-4bc3-bff0-76143807c389 */ GUID_INIT(0xefcc06cc, 0x73ac, 0x4bc3, 0xbf, 0xf0, 0x76, 0x14, 0x38, 0x07, 0xc3, 0x89), /* Thunderbolt GUID for IMR_VALID: c44d002f-69f9-4e7d-a904-a7baabdf43f7 */ GUID_INIT(0xc44d002f, 0x69f9, 0x4e7d, 0xa9, 0x04, 0xa7, 0xba, 0xab, 0xdf, 0x43, 0xf7), /* Thunderbolt GUID for WAKE_SUPPORTED: 6c501103-c189-4296-ba72-9bf5a26ebe5d */ GUID_INIT(0x6c501103, 0xc189, 0x4296, 0xba, 0x72, 0x9b, 0xf5, 0xa2, 0x6e, 0xbe, 0x5d), /* Storage device needs D3 GUID: 5025030f-842f-4ab4-a561-99a5189762d0 */ GUID_INIT(0x5025030f, 0x842f, 0x4ab4, 0xa5, 0x61, 0x99, 0xa5, 0x18, 0x97, 0x62, 0xd0), }; /* ACPI _DSD data subnodes GUID [1]: dbb8e3e6-5886-4ba6-8795-1319f52a966b */ static const guid_t ads_guid = GUID_INIT(0xdbb8e3e6, 0x5886, 0x4ba6, 0x87, 0x95, 0x13, 0x19, 0xf5, 0x2a, 0x96, 0x6b); /* ACPI _DSD data buffer GUID [1]: edb12dd0-363d-4085-a3d2-49522ca160c4 */ static const guid_t buffer_prop_guid = GUID_INIT(0xedb12dd0, 0x363d, 0x4085, 0xa3, 0xd2, 0x49, 0x52, 0x2c, 0xa1, 0x60, 0xc4); static bool acpi_enumerate_nondev_subnodes(acpi_handle scope, union acpi_object *desc, struct acpi_device_data *data, struct fwnode_handle *parent); static bool acpi_extract_properties(acpi_handle handle, union acpi_object *desc, struct acpi_device_data *data); static bool acpi_nondev_subnode_extract(union acpi_object *desc, acpi_handle handle, const union acpi_object *link, struct list_head *list, struct fwnode_handle *parent) { struct acpi_data_node *dn; bool result; if (acpi_graph_ignore_port(handle)) return false; dn = kzalloc(sizeof(*dn), GFP_KERNEL); if (!dn) return false; dn->name = link->package.elements[0].string.pointer; fwnode_init(&dn->fwnode, &acpi_data_fwnode_ops); dn->parent = parent; INIT_LIST_HEAD(&dn->data.properties); INIT_LIST_HEAD(&dn->data.subnodes); result = acpi_extract_properties(handle, desc, &dn->data); if (handle) { acpi_handle scope; acpi_status status; /* * The scope for the subnode object lookup is the one of the * namespace node (device) containing the object that has * returned the package. That is, it's the scope of that * object's parent. */ status = acpi_get_parent(handle, &scope); if (ACPI_SUCCESS(status) && acpi_enumerate_nondev_subnodes(scope, desc, &dn->data, &dn->fwnode)) result = true; } else if (acpi_enumerate_nondev_subnodes(NULL, desc, &dn->data, &dn->fwnode)) { result = true; } if (result) { dn->handle = handle; dn->data.pointer = desc; list_add_tail(&dn->sibling, list); return true; } kfree(dn); acpi_handle_debug(handle, "Invalid properties/subnodes data, skipping\n"); return false; } static bool acpi_nondev_subnode_data_ok(acpi_handle handle, const union acpi_object *link, struct list_head *list, struct fwnode_handle *parent) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; acpi_status status; status = acpi_evaluate_object_typed(handle, NULL, NULL, &buf, ACPI_TYPE_PACKAGE); if (ACPI_FAILURE(status)) return false; if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list, parent)) return true; ACPI_FREE(buf.pointer); return false; } static bool acpi_nondev_subnode_ok(acpi_handle scope, const union acpi_object *link, struct list_head *list, struct fwnode_handle *parent) { acpi_handle handle; acpi_status status; if (!scope) return false; status = acpi_get_handle(scope, link->package.elements[1].string.pointer, &handle); if (ACPI_FAILURE(status)) return false; return acpi_nondev_subnode_data_ok(handle, link, list, parent); } static bool acpi_add_nondev_subnodes(acpi_handle scope, union acpi_object *links, struct list_head *list, struct fwnode_handle *parent) { bool ret = false; int i; for (i = 0; i < links->package.count; i++) { union acpi_object *link, *desc; acpi_handle handle; bool result; link = &links->package.elements[i]; /* Only two elements allowed. */ if (link->package.count != 2) continue; /* The first one must be a string. */ if (link->package.elements[0].type != ACPI_TYPE_STRING) continue; /* The second one may be a string, a reference or a package. */ switch (link->package.elements[1].type) { case ACPI_TYPE_STRING: result = acpi_nondev_subnode_ok(scope, link, list, parent); break; case ACPI_TYPE_LOCAL_REFERENCE: handle = link->package.elements[1].reference.handle; result = acpi_nondev_subnode_data_ok(handle, link, list, parent); break; case ACPI_TYPE_PACKAGE: desc = &link->package.elements[1]; result = acpi_nondev_subnode_extract(desc, NULL, link, list, parent); break; default: result = false; break; } ret = ret || result; } return ret; } static bool acpi_enumerate_nondev_subnodes(acpi_handle scope, union acpi_object *desc, struct acpi_device_data *data, struct fwnode_handle *parent) { int i; /* Look for the ACPI data subnodes GUID. */ for (i = 0; i < desc->package.count; i += 2) { const union acpi_object *guid; union acpi_object *links; guid = &desc->package.elements[i]; links = &desc->package.elements[i + 1]; /* * The first element must be a GUID and the second one must be * a package. */ if (guid->type != ACPI_TYPE_BUFFER || guid->buffer.length != 16 || links->type != ACPI_TYPE_PACKAGE) break; if (!guid_equal((guid_t *)guid->buffer.pointer, &ads_guid)) continue; return acpi_add_nondev_subnodes(scope, links, &data->subnodes, parent); } return false; } static bool acpi_property_value_ok(const union acpi_object *value) { int j; /* * The value must be an integer, a string, a reference, or a package * whose every element must be an integer, a string, or a reference. */ switch (value->type) { case ACPI_TYPE_INTEGER: case ACPI_TYPE_STRING: case ACPI_TYPE_LOCAL_REFERENCE: return true; case ACPI_TYPE_PACKAGE: for (j = 0; j < value->package.count; j++) switch (value->package.elements[j].type) { case ACPI_TYPE_INTEGER: case ACPI_TYPE_STRING: case ACPI_TYPE_LOCAL_REFERENCE: continue; default: return false; } return true; } return false; } static bool acpi_properties_format_valid(const union acpi_object *properties) { int i; for (i = 0; i < properties->package.count; i++) { const union acpi_object *property; property = &properties->package.elements[i]; /* * Only two elements allowed, the first one must be a string and * the second one has to satisfy certain conditions. */ if (property->package.count != 2 || property->package.elements[0].type != ACPI_TYPE_STRING || !acpi_property_value_ok(&property->package.elements[1])) return false; } return true; } static void acpi_init_of_compatible(struct acpi_device *adev) { const union acpi_object *of_compatible; int ret; ret = acpi_data_get_property_array(&adev->data, "compatible", ACPI_TYPE_STRING, &of_compatible); if (ret) { ret = acpi_dev_get_property(adev, "compatible", ACPI_TYPE_STRING, &of_compatible); if (ret) { struct acpi_device *parent; parent = acpi_dev_parent(adev); if (parent && parent->flags.of_compatible_ok) goto out; return; } } adev->data.of_compatible = of_compatible; out: adev->flags.of_compatible_ok = 1; } static bool acpi_is_property_guid(const guid_t *guid) { int i; for (i = 0; i < ARRAY_SIZE(prp_guids); i++) { if (guid_equal(guid, &prp_guids[i])) return true; } return false; } struct acpi_device_properties * acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, union acpi_object *properties) { struct acpi_device_properties *props; props = kzalloc(sizeof(*props), GFP_KERNEL); if (props) { INIT_LIST_HEAD(&props->list); props->guid = guid; props->properties = properties; list_add_tail(&props->list, &data->properties); } return props; } static void acpi_nondev_subnode_tag(acpi_handle handle, void *context) { } static void acpi_untie_nondev_subnodes(struct acpi_device_data *data) { struct acpi_data_node *dn; list_for_each_entry(dn, &data->subnodes, sibling) { acpi_detach_data(dn->handle, acpi_nondev_subnode_tag); acpi_untie_nondev_subnodes(&dn->data); } } static bool acpi_tie_nondev_subnodes(struct acpi_device_data *data) { struct acpi_data_node *dn; list_for_each_entry(dn, &data->subnodes, sibling) { acpi_status status; bool ret; status = acpi_attach_data(dn->handle, acpi_nondev_subnode_tag, dn); if (ACPI_FAILURE(status) && status != AE_ALREADY_EXISTS) { acpi_handle_err(dn->handle, "Can't tag data node\n"); return false; } ret = acpi_tie_nondev_subnodes(&dn->data); if (!ret) return ret; } return true; } static void acpi_data_add_buffer_props(acpi_handle handle, struct acpi_device_data *data, union acpi_object *properties) { struct acpi_device_properties *props; union acpi_object *package; size_t alloc_size; unsigned int i; u32 *count; if (check_mul_overflow((size_t)properties->package.count, sizeof(*package) + sizeof(void *), &alloc_size) || check_add_overflow(sizeof(*props) + sizeof(*package), alloc_size, &alloc_size)) { acpi_handle_warn(handle, "can't allocate memory for %u buffer props", properties->package.count); return; } props = kvzalloc(alloc_size, GFP_KERNEL); if (!props) return; props->guid = &buffer_prop_guid; props->bufs = (void *)(props + 1); props->properties = (void *)(props->bufs + properties->package.count); /* Outer package */ package = props->properties; package->type = ACPI_TYPE_PACKAGE; package->package.elements = package + 1; count = &package->package.count; *count = 0; /* Inner packages */ package++; for (i = 0; i < properties->package.count; i++) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; union acpi_object *property = &properties->package.elements[i]; union acpi_object *prop, *obj, *buf_obj; acpi_status status; if (property->type != ACPI_TYPE_PACKAGE || property->package.count != 2) { acpi_handle_warn(handle, "buffer property %u has %u entries\n", i, property->package.count); continue; } prop = &property->package.elements[0]; obj = &property->package.elements[1]; if (prop->type != ACPI_TYPE_STRING || obj->type != ACPI_TYPE_STRING) { acpi_handle_warn(handle, "wrong object types %u and %u\n", prop->type, obj->type); continue; } status = acpi_evaluate_object_typed(handle, obj->string.pointer, NULL, &buf, ACPI_TYPE_BUFFER); if (ACPI_FAILURE(status)) { acpi_handle_warn(handle, "can't evaluate \"%*pE\" as buffer\n", obj->string.length, obj->string.pointer); continue; } package->type = ACPI_TYPE_PACKAGE; package->package.elements = prop; package->package.count = 2; buf_obj = buf.pointer; /* Replace the string object with a buffer object */ obj->type = ACPI_TYPE_BUFFER; obj->buffer.length = buf_obj->buffer.length; obj->buffer.pointer = buf_obj->buffer.pointer; props->bufs[i] = buf.pointer; package++; (*count)++; } if (*count) list_add(&props->list, &data->properties); else kvfree(props); } static bool acpi_extract_properties(acpi_handle scope, union acpi_object *desc, struct acpi_device_data *data) { int i; if (desc->package.count % 2) return false; /* Look for the device properties GUID. */ for (i = 0; i < desc->package.count; i += 2) { const union acpi_object *guid; union acpi_object *properties; guid = &desc->package.elements[i]; properties = &desc->package.elements[i + 1]; /* * The first element must be a GUID and the second one must be * a package. */ if (guid->type != ACPI_TYPE_BUFFER || guid->buffer.length != 16 || properties->type != ACPI_TYPE_PACKAGE) break; if (guid_equal((guid_t *)guid->buffer.pointer, &buffer_prop_guid)) { acpi_data_add_buffer_props(scope, data, properties); continue; } if (!acpi_is_property_guid((guid_t *)guid->buffer.pointer)) continue; /* * We found the matching GUID. Now validate the format of the * package immediately following it. */ if (!acpi_properties_format_valid(properties)) continue; acpi_data_add_props(data, (const guid_t *)guid->buffer.pointer, properties); } return !list_empty(&data->properties); } void acpi_init_properties(struct acpi_device *adev) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; struct acpi_hardware_id *hwid; acpi_status status; bool acpi_of = false; INIT_LIST_HEAD(&adev->data.properties); INIT_LIST_HEAD(&adev->data.subnodes); if (!adev->handle) return; /* * Check if ACPI_DT_NAMESPACE_HID is present and inthat case we fill in * Device Tree compatible properties for this device. */ list_for_each_entry(hwid, &adev->pnp.ids, list) { if (!strcmp(hwid->id, ACPI_DT_NAMESPACE_HID)) { acpi_of = true; break; } } status = acpi_evaluate_object_typed(adev->handle, "_DSD", NULL, &buf, ACPI_TYPE_PACKAGE); if (ACPI_FAILURE(status)) goto out; if (acpi_extract_properties(adev->handle, buf.pointer, &adev->data)) { adev->data.pointer = buf.pointer; if (acpi_of) acpi_init_of_compatible(adev); } if (acpi_enumerate_nondev_subnodes(adev->handle, buf.pointer, &adev->data, acpi_fwnode_handle(adev))) adev->data.pointer = buf.pointer; if (!adev->data.pointer) { acpi_handle_debug(adev->handle, "Invalid _DSD data, skipping\n"); ACPI_FREE(buf.pointer); } else { if (!acpi_tie_nondev_subnodes(&adev->data)) acpi_untie_nondev_subnodes(&adev->data); } out: if (acpi_of && !adev->flags.of_compatible_ok) acpi_handle_info(adev->handle, ACPI_DT_NAMESPACE_HID " requires 'compatible' property\n"); if (!adev->data.pointer) acpi_extract_apple_properties(adev); } static void acpi_free_device_properties(struct list_head *list) { struct acpi_device_properties *props, *tmp; list_for_each_entry_safe(props, tmp, list, list) { u32 i; list_del(&props->list); /* Buffer data properties were separately allocated */ if (props->bufs) for (i = 0; i < props->properties->package.count; i++) ACPI_FREE(props->bufs[i]); kvfree(props); } } static void acpi_destroy_nondev_subnodes(struct list_head *list) { struct acpi_data_node *dn, *next; if (list_empty(list)) return; list_for_each_entry_safe_reverse(dn, next, list, sibling) { acpi_destroy_nondev_subnodes(&dn->data.subnodes); wait_for_completion(&dn->kobj_done); list_del(&dn->sibling); ACPI_FREE((void *)dn->data.pointer); acpi_free_device_properties(&dn->data.properties); kfree(dn); } } void acpi_free_properties(struct acpi_device *adev) { acpi_untie_nondev_subnodes(&adev->data); acpi_destroy_nondev_subnodes(&adev->data.subnodes); ACPI_FREE((void *)adev->data.pointer); adev->data.of_compatible = NULL; adev->data.pointer = NULL; acpi_free_device_properties(&adev->data.properties); } /** * acpi_data_get_property - return an ACPI property with given name * @data: ACPI device deta object to get the property from * @name: Name of the property * @type: Expected property type * @obj: Location to store the property value (if not %NULL) * * Look up a property with @name and store a pointer to the resulting ACPI * object at the location pointed to by @obj if found. * * Callers must not attempt to free the returned objects. These objects will be * freed by the ACPI core automatically during the removal of @data. * * Return: %0 if property with @name has been found (success), * %-EINVAL if the arguments are invalid, * %-EINVAL if the property doesn't exist, * %-EPROTO if the property value type doesn't match @type. */ static int acpi_data_get_property(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj) { const struct acpi_device_properties *props; if (!data || !name) return -EINVAL; if (!data->pointer || list_empty(&data->properties)) return -EINVAL; list_for_each_entry(props, &data->properties, list) { const union acpi_object *properties; unsigned int i; properties = props->properties; for (i = 0; i < properties->package.count; i++) { const union acpi_object *propname, *propvalue; const union acpi_object *property; property = &properties->package.elements[i]; propname = &property->package.elements[0]; propvalue = &property->package.elements[1]; if (!strcmp(name, propname->string.pointer)) { if (type != ACPI_TYPE_ANY && propvalue->type != type) return -EPROTO; if (obj) *obj = propvalue; return 0; } } } return -EINVAL; } /** * acpi_dev_get_property - return an ACPI property with given name. * @adev: ACPI device to get the property from. * @name: Name of the property. * @type: Expected property type. * @obj: Location to store the property value (if not %NULL). */ int acpi_dev_get_property(const struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj) { return adev ? acpi_data_get_property(&adev->data, name, type, obj) : -EINVAL; } EXPORT_SYMBOL_GPL(acpi_dev_get_property); static const struct acpi_device_data * acpi_device_data_of_node(const struct fwnode_handle *fwnode) { if (is_acpi_device_node(fwnode)) { const struct acpi_device *adev = to_acpi_device_node(fwnode); return &adev->data; } if (is_acpi_data_node(fwnode)) { const struct acpi_data_node *dn = to_acpi_data_node(fwnode); return &dn->data; } return NULL; } /** * acpi_node_prop_get - return an ACPI property with given name. * @fwnode: Firmware node to get the property from. * @propname: Name of the property. * @valptr: Location to store a pointer to the property value (if not %NULL). */ int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr) { return acpi_data_get_property(acpi_device_data_of_node(fwnode), propname, ACPI_TYPE_ANY, (const union acpi_object **)valptr); } /** * acpi_data_get_property_array - return an ACPI array property with given name * @data: ACPI data object to get the property from * @name: Name of the property * @type: Expected type of array elements * @obj: Location to store a pointer to the property value (if not NULL) * * Look up an array property with @name and store a pointer to the resulting * ACPI object at the location pointed to by @obj if found. * * Callers must not attempt to free the returned objects. Those objects will be * freed by the ACPI core automatically during the removal of @data. * * Return: %0 if array property (package) with @name has been found (success), * %-EINVAL if the arguments are invalid, * %-EINVAL if the property doesn't exist, * %-EPROTO if the property is not a package or the type of its elements * doesn't match @type. */ static int acpi_data_get_property_array(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj) { const union acpi_object *prop; int ret, i; ret = acpi_data_get_property(data, name, ACPI_TYPE_PACKAGE, &prop); if (ret) return ret; if (type != ACPI_TYPE_ANY) { /* Check that all elements are of correct type. */ for (i = 0; i < prop->package.count; i++) if (prop->package.elements[i].type != type) return -EPROTO; } if (obj) *obj = prop; return 0; } static struct fwnode_handle * acpi_fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { struct fwnode_handle *child; fwnode_for_each_child_node(fwnode, child) { if (is_acpi_data_node(child)) { if (acpi_data_node_match(child, childname)) return child; continue; } if (!strncmp(acpi_device_bid(to_acpi_device_node(child)), childname, ACPI_NAMESEG_SIZE)) return child; } return NULL; } static int acpi_get_ref_args(struct fwnode_reference_args *args, struct fwnode_handle *ref_fwnode, const union acpi_object **element, const union acpi_object *end, size_t num_args) { u32 nargs = 0, i; /* * Assume the following integer elements are all args. Stop counting on * the first reference (possibly represented as a string) or end of the * package arguments. In case of neither reference, nor integer, return * an error, we can't parse it. */ for (i = 0; (*element) + i < end && i < num_args; i++) { acpi_object_type type = (*element)[i].type; if (type == ACPI_TYPE_LOCAL_REFERENCE || type == ACPI_TYPE_STRING) break; if (type == ACPI_TYPE_INTEGER) nargs++; else return -EINVAL; } if (nargs > NR_FWNODE_REFERENCE_ARGS) return -EINVAL; if (args) { args->fwnode = ref_fwnode; args->nargs = nargs; for (i = 0; i < nargs; i++) args->args[i] = (*element)[i].integer.value; } (*element) += nargs; return 0; } static struct fwnode_handle *acpi_parse_string_ref(const struct fwnode_handle *fwnode, const char *refstring) { acpi_handle scope, handle; struct acpi_data_node *dn; struct acpi_device *device; acpi_status status; if (is_acpi_device_node(fwnode)) { scope = to_acpi_device_node(fwnode)->handle; } else if (is_acpi_data_node(fwnode)) { scope = to_acpi_data_node(fwnode)->handle; } else { pr_debug("Bad node type for node %pfw\n", fwnode); return NULL; } status = acpi_get_handle(scope, refstring, &handle); if (ACPI_FAILURE(status)) { acpi_handle_debug(scope, "Unable to get an ACPI handle for %s\n", refstring); return NULL; } device = acpi_fetch_acpi_dev(handle); if (device) return acpi_fwnode_handle(device); status = acpi_get_data_full(handle, acpi_nondev_subnode_tag, (void **)&dn, NULL); if (ACPI_FAILURE(status) || !dn) { acpi_handle_debug(handle, "Subnode not found\n"); return NULL; } return &dn->fwnode; } /** * __acpi_node_get_property_reference - returns handle to the referenced object * @fwnode: Firmware node to get the property from * @propname: Name of the property * @index: Index of the reference to return * @num_args: Maximum number of arguments after each reference * @args: Location to store the returned reference with optional arguments * (may be NULL) * * Find property with @name, verifify that it is a package containing at least * one object reference and if so, store the ACPI device object pointer to the * target object in @args->adev. If the reference includes arguments, store * them in the @args->args[] array. * * If there's more than one reference in the property value package, @index is * used to select the one to return. * * It is possible to leave holes in the property value set like in the * example below: * * Package () { * "cs-gpios", * Package () { * ^GPIO, 19, 0, 0, * ^GPIO, 20, 0, 0, * 0, * ^GPIO, 21, 0, 0, * } * } * * Calling this function with index %2 or index %3 return %-ENOENT. If the * property does not contain any more values %-ENOENT is returned. The NULL * entry must be single integer and preferably contain value %0. * * Return: %0 on success, negative error code on failure. */ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *propname, size_t index, size_t num_args, struct fwnode_reference_args *args) { const union acpi_object *element, *end; const union acpi_object *obj; const struct acpi_device_data *data; struct fwnode_handle *ref_fwnode; struct acpi_device *device; int ret, idx = 0; data = acpi_device_data_of_node(fwnode); if (!data) return -ENOENT; ret = acpi_data_get_property(data, propname, ACPI_TYPE_ANY, &obj); if (ret) return ret == -EINVAL ? -ENOENT : -EINVAL; switch (obj->type) { case ACPI_TYPE_LOCAL_REFERENCE: /* Plain single reference without arguments. */ if (index) return -ENOENT; device = acpi_fetch_acpi_dev(obj->reference.handle); if (!device) return -EINVAL; if (!args) return 0; args->fwnode = acpi_fwnode_handle(device); args->nargs = 0; return 0; case ACPI_TYPE_STRING: if (index) return -ENOENT; ref_fwnode = acpi_parse_string_ref(fwnode, obj->string.pointer); if (!ref_fwnode) return -EINVAL; args->fwnode = ref_fwnode; args->nargs = 0; return 0; case ACPI_TYPE_PACKAGE: /* * If it is not a single reference, then it is a package of * references, followed by number of ints as follows: * * Package () { REF, INT, REF, INT, INT } * * Here, REF may be either a local reference or a string. The * index argument is then used to determine which reference the * caller wants (along with the arguments). */ break; default: return -EINVAL; } if (index >= obj->package.count) return -ENOENT; element = obj->package.elements; end = element + obj->package.count; while (element < end) { switch (element->type) { case ACPI_TYPE_LOCAL_REFERENCE: device = acpi_fetch_acpi_dev(element->reference.handle); if (!device) return -EINVAL; element++; ret = acpi_get_ref_args(idx == index ? args : NULL, acpi_fwnode_handle(device), &element, end, num_args); if (ret < 0) return ret; if (idx == index) return 0; break; case ACPI_TYPE_STRING: ref_fwnode = acpi_parse_string_ref(fwnode, element->string.pointer); if (!ref_fwnode) return -EINVAL; element++; ret = acpi_get_ref_args(idx == index ? args : NULL, ref_fwnode, &element, end, num_args); if (ret < 0) return ret; if (idx == index) return 0; break; case ACPI_TYPE_INTEGER: if (idx == index) return -ENOENT; element++; break; default: return -EINVAL; } idx++; } return -ENOENT; } EXPORT_SYMBOL_GPL(__acpi_node_get_property_reference); static int acpi_data_prop_read_single(const struct acpi_device_data *data, const char *propname, enum dev_prop_type proptype, void *val) { const union acpi_object *obj; int ret = 0; if (proptype >= DEV_PROP_U8 && proptype <= DEV_PROP_U64) ret = acpi_data_get_property(data, propname, ACPI_TYPE_INTEGER, &obj); else if (proptype == DEV_PROP_STRING) ret = acpi_data_get_property(data, propname, ACPI_TYPE_STRING, &obj); if (ret) return ret; switch (proptype) { case DEV_PROP_U8: if (obj->integer.value > U8_MAX) return -EOVERFLOW; if (val) *(u8 *)val = obj->integer.value; break; case DEV_PROP_U16: if (obj->integer.value > U16_MAX) return -EOVERFLOW; if (val) *(u16 *)val = obj->integer.value; break; case DEV_PROP_U32: if (obj->integer.value > U32_MAX) return -EOVERFLOW; if (val) *(u32 *)val = obj->integer.value; break; case DEV_PROP_U64: if (val) *(u64 *)val = obj->integer.value; break; case DEV_PROP_STRING: if (val) *(char **)val = obj->string.pointer; return 1; default: return -EINVAL; } /* When no storage provided return number of available values */ return val ? 0 : 1; } #define acpi_copy_property_array_uint(items, val, nval) \ ({ \ typeof(items) __items = items; \ typeof(val) __val = val; \ typeof(nval) __nval = nval; \ size_t i; \ int ret = 0; \ \ for (i = 0; i < __nval; i++) { \ if (__items->type == ACPI_TYPE_BUFFER) { \ __val[i] = __items->buffer.pointer[i]; \ continue; \ } \ if (__items[i].type != ACPI_TYPE_INTEGER) { \ ret = -EPROTO; \ break; \ } \ if (__items[i].integer.value > _Generic(__val, \ u8 *: U8_MAX, \ u16 *: U16_MAX, \ u32 *: U32_MAX, \ u64 *: U64_MAX)) { \ ret = -EOVERFLOW; \ break; \ } \ \ __val[i] = __items[i].integer.value; \ } \ ret; \ }) static int acpi_copy_property_array_string(const union acpi_object *items, char **val, size_t nval) { int i; for (i = 0; i < nval; i++) { if (items[i].type != ACPI_TYPE_STRING) return -EPROTO; val[i] = items[i].string.pointer; } return nval; } static int acpi_data_prop_read(const struct acpi_device_data *data, const char *propname, enum dev_prop_type proptype, void *val, size_t nval) { const union acpi_object *obj; const union acpi_object *items; int ret; if (nval == 1 || !val) { ret = acpi_data_prop_read_single(data, propname, proptype, val); /* * The overflow error means that the property is there and it is * single-value, but its type does not match, so return. */ if (ret >= 0 || ret == -EOVERFLOW) return ret; /* * Reading this property as a single-value one failed, but its * value may still be represented as one-element array, so * continue. */ } ret = acpi_data_get_property_array(data, propname, ACPI_TYPE_ANY, &obj); if (ret && proptype >= DEV_PROP_U8 && proptype <= DEV_PROP_U64) ret = acpi_data_get_property(data, propname, ACPI_TYPE_BUFFER, &obj); if (ret) return ret; if (!val) { if (obj->type == ACPI_TYPE_BUFFER) return obj->buffer.length; return obj->package.count; } switch (proptype) { case DEV_PROP_STRING: break; default: if (obj->type == ACPI_TYPE_BUFFER) { if (nval > obj->buffer.length) return -EOVERFLOW; } else { if (nval > obj->package.count) return -EOVERFLOW; } break; } if (obj->type == ACPI_TYPE_BUFFER) { if (proptype != DEV_PROP_U8) return -EPROTO; items = obj; } else { items = obj->package.elements; } switch (proptype) { case DEV_PROP_U8: ret = acpi_copy_property_array_uint(items, (u8 *)val, nval); break; case DEV_PROP_U16: ret = acpi_copy_property_array_uint(items, (u16 *)val, nval); break; case DEV_PROP_U32: ret = acpi_copy_property_array_uint(items, (u32 *)val, nval); break; case DEV_PROP_U64: ret = acpi_copy_property_array_uint(items, (u64 *)val, nval); break; case DEV_PROP_STRING: nval = min_t(u32, nval, obj->package.count); if (nval == 0) return -ENODATA; ret = acpi_copy_property_array_string(items, (char **)val, nval); break; default: ret = -EINVAL; break; } return ret; } /** * acpi_node_prop_read - retrieve the value of an ACPI property with given name. * @fwnode: Firmware node to get the property from. * @propname: Name of the property. * @proptype: Expected property type. * @val: Location to store the property value (if not %NULL). * @nval: Size of the array pointed to by @val. * * If @val is %NULL, return the number of array elements comprising the value * of the property. Otherwise, read at most @nval values to the array at the * location pointed to by @val. */ static int acpi_node_prop_read(const struct fwnode_handle *fwnode, const char *propname, enum dev_prop_type proptype, void *val, size_t nval) { return acpi_data_prop_read(acpi_device_data_of_node(fwnode), propname, proptype, val, nval); } static int stop_on_next(struct acpi_device *adev, void *data) { struct acpi_device **ret_p = data; if (!*ret_p) { *ret_p = adev; return 1; } /* Skip until the "previous" object is found. */ if (*ret_p == adev) *ret_p = NULL; return 0; } /** * acpi_get_next_subnode - Return the next child node handle for a fwnode * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the device's child nodes or a null handle. */ struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { struct acpi_device *adev = to_acpi_device_node(fwnode); if ((!child || is_acpi_device_node(child)) && adev) { struct acpi_device *child_adev = to_acpi_device_node(child); acpi_dev_for_each_child(adev, stop_on_next, &child_adev); if (child_adev) return acpi_fwnode_handle(child_adev); child = NULL; } if (!child || is_acpi_data_node(child)) { const struct acpi_data_node *data = to_acpi_data_node(fwnode); const struct list_head *head; struct list_head *next; struct acpi_data_node *dn; /* * We can have a combination of device and data nodes, e.g. with * hierarchical _DSD properties. Make sure the adev pointer is * restored before going through data nodes, otherwise we will * be looking for data_nodes below the last device found instead * of the common fwnode shared by device_nodes and data_nodes. */ adev = to_acpi_device_node(fwnode); if (adev) head = &adev->data.subnodes; else if (data) head = &data->data.subnodes; else return NULL; if (list_empty(head)) return NULL; if (child) { dn = to_acpi_data_node(child); next = dn->sibling.next; if (next == head) return NULL; dn = list_entry(next, struct acpi_data_node, sibling); } else { dn = list_first_entry(head, struct acpi_data_node, sibling); } return &dn->fwnode; } return NULL; } /** * acpi_node_get_parent - Return parent fwnode of this fwnode * @fwnode: Firmware node whose parent to get * * Returns parent node of an ACPI device or data firmware node or %NULL if * not available. */ static struct fwnode_handle * acpi_node_get_parent(const struct fwnode_handle *fwnode) { if (is_acpi_data_node(fwnode)) { /* All data nodes have parent pointer so just return that */ return to_acpi_data_node(fwnode)->parent; } if (is_acpi_device_node(fwnode)) { struct acpi_device *parent; parent = acpi_dev_parent(to_acpi_device_node(fwnode)); if (parent) return acpi_fwnode_handle(parent); } return NULL; } /* * Return true if the node is an ACPI graph node. Called on either ports * or endpoints. */ static bool is_acpi_graph_node(struct fwnode_handle *fwnode, const char *str) { unsigned int len = strlen(str); const char *name; if (!len || !is_acpi_data_node(fwnode)) return false; name = to_acpi_data_node(fwnode)->name; return (fwnode_property_present(fwnode, "reg") && !strncmp(name, str, len) && name[len] == '@') || fwnode_property_present(fwnode, str); } /** * acpi_graph_get_next_endpoint - Get next endpoint ACPI firmware node * @fwnode: Pointer to the parent firmware node * @prev: Previous endpoint node or %NULL to get the first * * Looks up next endpoint ACPI firmware node below a given @fwnode. Returns * %NULL if there is no next endpoint or in case of error. In case of success * the next endpoint is returned. */ static struct fwnode_handle *acpi_graph_get_next_endpoint( const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { struct fwnode_handle *port = NULL; struct fwnode_handle *endpoint; if (!prev) { do { port = fwnode_get_next_child_node(fwnode, port); /* * The names of the port nodes begin with "port@" * followed by the number of the port node and they also * have a "reg" property that also has the number of the * port node. For compatibility reasons a node is also * recognised as a port node from the "port" property. */ if (is_acpi_graph_node(port, "port")) break; } while (port); } else { port = fwnode_get_parent(prev); } if (!port) return NULL; endpoint = fwnode_get_next_child_node(port, prev); while (!endpoint) { port = fwnode_get_next_child_node(fwnode, port); if (!port) break; if (is_acpi_graph_node(port, "port")) endpoint = fwnode_get_next_child_node(port, NULL); } /* * The names of the endpoint nodes begin with "endpoint@" followed by * the number of the endpoint node and they also have a "reg" property * that also has the number of the endpoint node. For compatibility * reasons a node is also recognised as an endpoint node from the * "endpoint" property. */ if (!is_acpi_graph_node(endpoint, "endpoint")) return NULL; return endpoint; } /** * acpi_graph_get_child_prop_value - Return a child with a given property value * @fwnode: device fwnode * @prop_name: The name of the property to look for * @val: the desired property value * * Return the port node corresponding to a given port number. Returns * the child node on success, NULL otherwise. */ static struct fwnode_handle *acpi_graph_get_child_prop_value( const struct fwnode_handle *fwnode, const char *prop_name, unsigned int val) { struct fwnode_handle *child; fwnode_for_each_child_node(fwnode, child) { u32 nr; if (fwnode_property_read_u32(child, prop_name, &nr)) continue; if (val == nr) return child; } return NULL; } /** * acpi_graph_get_remote_endpoint - Parses and returns remote end of an endpoint * @__fwnode: Endpoint firmware node pointing to a remote device * * Returns the remote endpoint corresponding to @__fwnode. NULL on error. */ static struct fwnode_handle * acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode) { struct fwnode_handle *fwnode; unsigned int port_nr, endpoint_nr; struct fwnode_reference_args args; int ret; memset(&args, 0, sizeof(args)); ret = acpi_node_get_property_reference(__fwnode, "remote-endpoint", 0, &args); if (ret) return NULL; /* Direct endpoint reference? */ if (!is_acpi_device_node(args.fwnode)) return args.nargs ? NULL : args.fwnode; /* * Always require two arguments with the reference: port and * endpoint indices. */ if (args.nargs != 2) return NULL; fwnode = args.fwnode; port_nr = args.args[0]; endpoint_nr = args.args[1]; fwnode = acpi_graph_get_child_prop_value(fwnode, "port", port_nr); return acpi_graph_get_child_prop_value(fwnode, "endpoint", endpoint_nr); } static bool acpi_fwnode_device_is_available(const struct fwnode_handle *fwnode) { if (!is_acpi_device_node(fwnode)) return true; return acpi_device_is_present(to_acpi_device_node(fwnode)); } static const void * acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode, const struct device *dev) { return acpi_device_get_match_data(dev); } static bool acpi_fwnode_device_dma_supported(const struct fwnode_handle *fwnode) { return acpi_dma_supported(to_acpi_device_node(fwnode)); } static enum dev_dma_attr acpi_fwnode_device_get_dma_attr(const struct fwnode_handle *fwnode) { return acpi_get_dma_attr(to_acpi_device_node(fwnode)); } static bool acpi_fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { return !acpi_node_prop_get(fwnode, propname, NULL); } static int acpi_fwnode_property_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { enum dev_prop_type type; switch (elem_size) { case sizeof(u8): type = DEV_PROP_U8; break; case sizeof(u16): type = DEV_PROP_U16; break; case sizeof(u32): type = DEV_PROP_U32; break; case sizeof(u64): type = DEV_PROP_U64; break; default: return -ENXIO; } return acpi_node_prop_read(fwnode, propname, type, val, nval); } static int acpi_fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { return acpi_node_prop_read(fwnode, propname, DEV_PROP_STRING, val, nval); } static int acpi_fwnode_get_reference_args(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, unsigned int args_count, unsigned int index, struct fwnode_reference_args *args) { return __acpi_node_get_property_reference(fwnode, prop, index, args_count, args); } static const char *acpi_fwnode_get_name(const struct fwnode_handle *fwnode) { const struct acpi_device *adev; struct fwnode_handle *parent; /* Is this the root node? */ parent = fwnode_get_parent(fwnode); if (!parent) return "\\"; fwnode_handle_put(parent); if (is_acpi_data_node(fwnode)) { const struct acpi_data_node *dn = to_acpi_data_node(fwnode); return dn->name; } adev = to_acpi_device_node(fwnode); if (WARN_ON(!adev)) return NULL; return acpi_device_bid(adev); } static const char * acpi_fwnode_get_name_prefix(const struct fwnode_handle *fwnode) { struct fwnode_handle *parent; /* Is this the root node? */ parent = fwnode_get_parent(fwnode); if (!parent) return ""; /* Is this 2nd node from the root? */ parent = fwnode_get_next_parent(parent); if (!parent) return ""; fwnode_handle_put(parent); /* ACPI device or data node. */ return "."; } static struct fwnode_handle * acpi_fwnode_get_parent(struct fwnode_handle *fwnode) { return acpi_node_get_parent(fwnode); } static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { struct fwnode_handle *port_fwnode = fwnode_get_parent(fwnode); endpoint->local_fwnode = fwnode; if (fwnode_property_read_u32(port_fwnode, "reg", &endpoint->port)) fwnode_property_read_u32(port_fwnode, "port", &endpoint->port); if (fwnode_property_read_u32(fwnode, "reg", &endpoint->id)) fwnode_property_read_u32(fwnode, "endpoint", &endpoint->id); return 0; } static int acpi_fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index) { struct resource res; int ret; ret = acpi_irq_get(ACPI_HANDLE_FWNODE(fwnode), index, &res); if (ret) return ret; return res.start; } #define DECLARE_ACPI_FWNODE_OPS(ops) \ const struct fwnode_operations ops = { \ .device_is_available = acpi_fwnode_device_is_available, \ .device_get_match_data = acpi_fwnode_device_get_match_data, \ .device_dma_supported = \ acpi_fwnode_device_dma_supported, \ .device_get_dma_attr = acpi_fwnode_device_get_dma_attr, \ .property_present = acpi_fwnode_property_present, \ .property_read_bool = acpi_fwnode_property_present, \ .property_read_int_array = \ acpi_fwnode_property_read_int_array, \ .property_read_string_array = \ acpi_fwnode_property_read_string_array, \ .get_parent = acpi_node_get_parent, \ .get_next_child_node = acpi_get_next_subnode, \ .get_named_child_node = acpi_fwnode_get_named_child_node, \ .get_name = acpi_fwnode_get_name, \ .get_name_prefix = acpi_fwnode_get_name_prefix, \ .get_reference_args = acpi_fwnode_get_reference_args, \ .graph_get_next_endpoint = \ acpi_graph_get_next_endpoint, \ .graph_get_remote_endpoint = \ acpi_graph_get_remote_endpoint, \ .graph_get_port_parent = acpi_fwnode_get_parent, \ .graph_parse_endpoint = acpi_fwnode_graph_parse_endpoint, \ .irq_get = acpi_fwnode_irq_get, \ }; \ EXPORT_SYMBOL_GPL(ops) DECLARE_ACPI_FWNODE_OPS(acpi_device_fwnode_ops); DECLARE_ACPI_FWNODE_OPS(acpi_data_fwnode_ops); const struct fwnode_operations acpi_static_fwnode_ops; bool is_acpi_device_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_device_fwnode_ops; } EXPORT_SYMBOL(is_acpi_device_node); bool is_acpi_data_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops; } EXPORT_SYMBOL(is_acpi_data_node); |
7 7 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | // SPDX-License-Identifier: GPL-2.0-only /* * Netlink interface for IEEE 802.15.4 stack * * Copyright 2007, 2008 Siemens AG * * Written by: * Sergey Lapin <slapin@ossfans.org> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Maxim Osipov <maxim.osipov@siemens.com> */ #include <linux/kernel.h> #include <linux/gfp.h> #include <net/genetlink.h> #include <linux/nl802154.h> #include "ieee802154.h" static unsigned int ieee802154_seq_num; static DEFINE_SPINLOCK(ieee802154_seq_lock); /* Requests to userspace */ struct sk_buff *ieee802154_nl_create(int flags, u8 req) { void *hdr; struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); unsigned long f; if (!msg) return NULL; spin_lock_irqsave(&ieee802154_seq_lock, f); hdr = genlmsg_put(msg, 0, ieee802154_seq_num++, &nl802154_family, flags, req); spin_unlock_irqrestore(&ieee802154_seq_lock, f); if (!hdr) { nlmsg_free(msg); return NULL; } return msg; } int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group) { struct nlmsghdr *nlh = nlmsg_hdr(msg); void *hdr = genlmsg_data(nlmsg_data(nlh)); genlmsg_end(msg, hdr); return genlmsg_multicast(&nl802154_family, msg, 0, group, GFP_ATOMIC); } struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info, int flags, u8 req) { void *hdr; struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return NULL; hdr = genlmsg_put_reply(msg, info, &nl802154_family, flags, req); if (!hdr) { nlmsg_free(msg); return NULL; } return msg; } int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info) { struct nlmsghdr *nlh = nlmsg_hdr(msg); void *hdr = genlmsg_data(nlmsg_data(nlh)); genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); } static const struct genl_small_ops ieee802154_ops[] = { /* see nl-phy.c */ IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy, ieee802154_dump_phy), IEEE802154_OP(IEEE802154_ADD_IFACE, ieee802154_add_iface), IEEE802154_OP(IEEE802154_DEL_IFACE, ieee802154_del_iface), /* see nl-mac.c */ IEEE802154_OP(IEEE802154_ASSOCIATE_REQ, ieee802154_associate_req), IEEE802154_OP(IEEE802154_ASSOCIATE_RESP, ieee802154_associate_resp), IEEE802154_OP(IEEE802154_DISASSOCIATE_REQ, ieee802154_disassociate_req), IEEE802154_OP(IEEE802154_SCAN_REQ, ieee802154_scan_req), IEEE802154_OP(IEEE802154_START_REQ, ieee802154_start_req), IEEE802154_DUMP(IEEE802154_LIST_IFACE, ieee802154_list_iface, ieee802154_dump_iface), IEEE802154_OP(IEEE802154_SET_MACPARAMS, ieee802154_set_macparams), IEEE802154_OP(IEEE802154_LLSEC_GETPARAMS, ieee802154_llsec_getparams), IEEE802154_OP(IEEE802154_LLSEC_SETPARAMS, ieee802154_llsec_setparams), IEEE802154_DUMP(IEEE802154_LLSEC_LIST_KEY, NULL, ieee802154_llsec_dump_keys), IEEE802154_OP(IEEE802154_LLSEC_ADD_KEY, ieee802154_llsec_add_key), IEEE802154_OP(IEEE802154_LLSEC_DEL_KEY, ieee802154_llsec_del_key), IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEV, NULL, ieee802154_llsec_dump_devs), IEEE802154_OP(IEEE802154_LLSEC_ADD_DEV, ieee802154_llsec_add_dev), IEEE802154_OP(IEEE802154_LLSEC_DEL_DEV, ieee802154_llsec_del_dev), IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEVKEY, NULL, ieee802154_llsec_dump_devkeys), IEEE802154_OP(IEEE802154_LLSEC_ADD_DEVKEY, ieee802154_llsec_add_devkey), IEEE802154_OP(IEEE802154_LLSEC_DEL_DEVKEY, ieee802154_llsec_del_devkey), IEEE802154_DUMP(IEEE802154_LLSEC_LIST_SECLEVEL, NULL, ieee802154_llsec_dump_seclevels), IEEE802154_OP(IEEE802154_LLSEC_ADD_SECLEVEL, ieee802154_llsec_add_seclevel), IEEE802154_OP(IEEE802154_LLSEC_DEL_SECLEVEL, ieee802154_llsec_del_seclevel), }; static const struct genl_multicast_group ieee802154_mcgrps[] = { [IEEE802154_COORD_MCGRP] = { .name = IEEE802154_MCAST_COORD_NAME, }, [IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, }, }; struct genl_family nl802154_family __ro_after_init = { .hdrsize = 0, .name = IEEE802154_NL_NAME, .version = 1, .maxattr = IEEE802154_ATTR_MAX, .policy = ieee802154_policy, .module = THIS_MODULE, .small_ops = ieee802154_ops, .n_small_ops = ARRAY_SIZE(ieee802154_ops), .resv_start_op = IEEE802154_LLSEC_DEL_SECLEVEL + 1, .mcgrps = ieee802154_mcgrps, .n_mcgrps = ARRAY_SIZE(ieee802154_mcgrps), }; int __init ieee802154_nl_init(void) { return genl_register_family(&nl802154_family); } void ieee802154_nl_exit(void) { genl_unregister_family(&nl802154_family); } |
8 8 4 6 2 2 4 4 4 2 2 2 2 2 4 4 4 2 4 4 31 2 17 19 10 14 5 3 2 3 1 1 9 6 1 2 7 1 1 8 3 3 2 3 2 4 2 1 1 5 5 3 2 2 2 2 1 2 3 1 2 2 2 2 2 2 2 2 2 2 2 8 1 4 1 3 3 3 2 1 2 2 2 2 6 2 1 1 3 3 1 1 1 2 28 1 14 15 4 2 2 1 3 1 2 2 11 1 1 3 5 1 7 2 1 1 1 1 2 3 2 2 3 2 3 1 1 3 1 2 2 2 2 1 1 1 1 2 3 2 2 3 3 2 3 2 1 2 1 2 33 1 4 1 6 6 4 5 6 1 1 8 1 1 7 2 3 1 5 5 1 6 1 2 3 5 5 5 1 2 4 1 4 1 1 3 1 2 7 1 6 2 4 1 2 2 8 1 4 7 2 1 1 6 1 1 1 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 | // SPDX-License-Identifier: GPL-2.0 /* * cfg80211 - wext compat code * * This is temporary code until all wireless functionality is migrated * into cfg80211, when that happens all the exports here go away and * we directly assign the wireless handlers of wireless interfaces. * * Copyright 2008-2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2019-2023 Intel Corporation */ #include <linux/export.h> #include <linux/wireless.h> #include <linux/nl80211.h> #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/slab.h> #include <net/iw_handler.h> #include <net/cfg80211.h> #include <net/cfg80211-wext.h> #include "wext-compat.h" #include "core.h" #include "rdev-ops.h" int cfg80211_wext_giwname(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { strcpy(wrqu->name, "IEEE 802.11"); return 0; } int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { __u32 *mode = &wrqu->mode; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; struct vif_params vifparams; enum nl80211_iftype type; rdev = wiphy_to_rdev(wdev->wiphy); switch (*mode) { case IW_MODE_INFRA: type = NL80211_IFTYPE_STATION; break; case IW_MODE_ADHOC: type = NL80211_IFTYPE_ADHOC; break; case IW_MODE_MONITOR: type = NL80211_IFTYPE_MONITOR; break; default: return -EINVAL; } if (type == wdev->iftype) return 0; memset(&vifparams, 0, sizeof(vifparams)); guard(wiphy)(wdev->wiphy); return cfg80211_change_iface(rdev, dev, type, &vifparams); } int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { __u32 *mode = &wrqu->mode; struct wireless_dev *wdev = dev->ieee80211_ptr; if (!wdev) return -EOPNOTSUPP; switch (wdev->iftype) { case NL80211_IFTYPE_AP: *mode = IW_MODE_MASTER; break; case NL80211_IFTYPE_STATION: *mode = IW_MODE_INFRA; break; case NL80211_IFTYPE_ADHOC: *mode = IW_MODE_ADHOC; break; case NL80211_IFTYPE_MONITOR: *mode = IW_MODE_MONITOR; break; case NL80211_IFTYPE_WDS: *mode = IW_MODE_REPEAT; break; case NL80211_IFTYPE_AP_VLAN: *mode = IW_MODE_SECOND; /* FIXME */ break; default: *mode = IW_MODE_AUTO; break; } return 0; } int cfg80211_wext_giwrange(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_range *range = (struct iw_range *) extra; enum nl80211_band band; int i, c = 0; if (!wdev) return -EOPNOTSUPP; data->length = sizeof(struct iw_range); memset(range, 0, sizeof(struct iw_range)); range->we_version_compiled = WIRELESS_EXT; range->we_version_source = 21; range->retry_capa = IW_RETRY_LIMIT; range->retry_flags = IW_RETRY_LIMIT; range->min_retry = 0; range->max_retry = 255; range->min_rts = 0; range->max_rts = 2347; range->min_frag = 256; range->max_frag = 2346; range->max_encoding_tokens = 4; range->max_qual.updated = IW_QUAL_NOISE_INVALID; switch (wdev->wiphy->signal_type) { case CFG80211_SIGNAL_TYPE_NONE: break; case CFG80211_SIGNAL_TYPE_MBM: range->max_qual.level = (u8)-110; range->max_qual.qual = 70; range->avg_qual.qual = 35; range->max_qual.updated |= IW_QUAL_DBM; range->max_qual.updated |= IW_QUAL_QUAL_UPDATED; range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED; break; case CFG80211_SIGNAL_TYPE_UNSPEC: range->max_qual.level = 100; range->max_qual.qual = 100; range->avg_qual.qual = 50; range->max_qual.updated |= IW_QUAL_QUAL_UPDATED; range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED; break; } range->avg_qual.level = range->max_qual.level / 2; range->avg_qual.noise = range->max_qual.noise / 2; range->avg_qual.updated = range->max_qual.updated; for (i = 0; i < wdev->wiphy->n_cipher_suites; i++) { switch (wdev->wiphy->cipher_suites[i]) { case WLAN_CIPHER_SUITE_TKIP: range->enc_capa |= (IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_WPA); break; case WLAN_CIPHER_SUITE_CCMP: range->enc_capa |= (IW_ENC_CAPA_CIPHER_CCMP | IW_ENC_CAPA_WPA2); break; case WLAN_CIPHER_SUITE_WEP40: range->encoding_size[range->num_encoding_sizes++] = WLAN_KEY_LEN_WEP40; break; case WLAN_CIPHER_SUITE_WEP104: range->encoding_size[range->num_encoding_sizes++] = WLAN_KEY_LEN_WEP104; break; } } for (band = 0; band < NUM_NL80211_BANDS; band ++) { struct ieee80211_supported_band *sband; sband = wdev->wiphy->bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) { struct ieee80211_channel *chan = &sband->channels[i]; if (!(chan->flags & IEEE80211_CHAN_DISABLED)) { range->freq[c].i = ieee80211_frequency_to_channel( chan->center_freq); range->freq[c].m = chan->center_freq; range->freq[c].e = 6; c++; } } } range->num_channels = c; range->num_frequency = c; IW_EVENT_CAPA_SET_KERNEL(range->event_capa); IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP); IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN); if (wdev->wiphy->max_scan_ssids > 0) range->scan_capa |= IW_SCAN_CAPA_ESSID; return 0; } /** * cfg80211_wext_freq - get wext frequency for non-"auto" * @freq: the wext freq encoding * * Returns: a frequency, or a negative error code, or 0 for auto. */ int cfg80211_wext_freq(struct iw_freq *freq) { /* * Parse frequency - return 0 for auto and * -EINVAL for impossible things. */ if (freq->e == 0) { enum nl80211_band band = NL80211_BAND_2GHZ; if (freq->m < 0) return 0; if (freq->m > 14) band = NL80211_BAND_5GHZ; return ieee80211_channel_to_frequency(freq->m, band); } else { int i, div = 1000000; for (i = 0; i < freq->e; i++) div /= 10; if (div <= 0) return -EINVAL; return freq->m / div; } } int cfg80211_wext_siwrts(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *rts = &wrqu->rts; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 orts = wdev->wiphy->rts_threshold; int err; guard(wiphy)(&rdev->wiphy); if (rts->disabled || !rts->fixed) wdev->wiphy->rts_threshold = (u32) -1; else if (rts->value < 0) return -EINVAL; else wdev->wiphy->rts_threshold = rts->value; err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD); if (err) wdev->wiphy->rts_threshold = orts; return err; } int cfg80211_wext_giwrts(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *rts = &wrqu->rts; struct wireless_dev *wdev = dev->ieee80211_ptr; rts->value = wdev->wiphy->rts_threshold; rts->disabled = rts->value == (u32) -1; rts->fixed = 1; return 0; } int cfg80211_wext_siwfrag(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *frag = &wrqu->frag; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 ofrag = wdev->wiphy->frag_threshold; int err; guard(wiphy)(&rdev->wiphy); if (frag->disabled || !frag->fixed) { wdev->wiphy->frag_threshold = (u32) -1; } else if (frag->value < 256) { return -EINVAL; } else { /* Fragment length must be even, so strip LSB. */ wdev->wiphy->frag_threshold = frag->value & ~0x1; } err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD); if (err) wdev->wiphy->frag_threshold = ofrag; return err; } int cfg80211_wext_giwfrag(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *frag = &wrqu->frag; struct wireless_dev *wdev = dev->ieee80211_ptr; frag->value = wdev->wiphy->frag_threshold; frag->disabled = frag->value == (u32) -1; frag->fixed = 1; return 0; } static int cfg80211_wext_siwretry(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *retry = &wrqu->retry; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 changed = 0; u8 olong = wdev->wiphy->retry_long; u8 oshort = wdev->wiphy->retry_short; int err; if (retry->disabled || retry->value < 1 || retry->value > 255 || (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT) return -EINVAL; guard(wiphy)(&rdev->wiphy); if (retry->flags & IW_RETRY_LONG) { wdev->wiphy->retry_long = retry->value; changed |= WIPHY_PARAM_RETRY_LONG; } else if (retry->flags & IW_RETRY_SHORT) { wdev->wiphy->retry_short = retry->value; changed |= WIPHY_PARAM_RETRY_SHORT; } else { wdev->wiphy->retry_short = retry->value; wdev->wiphy->retry_long = retry->value; changed |= WIPHY_PARAM_RETRY_LONG; changed |= WIPHY_PARAM_RETRY_SHORT; } err = rdev_set_wiphy_params(rdev, changed); if (err) { wdev->wiphy->retry_short = oshort; wdev->wiphy->retry_long = olong; } return err; } int cfg80211_wext_giwretry(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *retry = &wrqu->retry; struct wireless_dev *wdev = dev->ieee80211_ptr; retry->disabled = 0; if (retry->flags == 0 || (retry->flags & IW_RETRY_SHORT)) { /* * First return short value, iwconfig will ask long value * later if needed */ retry->flags |= IW_RETRY_LIMIT | IW_RETRY_SHORT; retry->value = wdev->wiphy->retry_short; if (wdev->wiphy->retry_long == wdev->wiphy->retry_short) retry->flags |= IW_RETRY_LONG; return 0; } if (retry->flags & IW_RETRY_LONG) { retry->flags = IW_RETRY_LIMIT | IW_RETRY_LONG; retry->value = wdev->wiphy->retry_long; } return 0; } static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, struct net_device *dev, bool pairwise, const u8 *addr, bool remove, bool tx_key, int idx, struct key_params *params) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err, i; bool rejoin = false; if (wdev->valid_links) return -EINVAL; if (pairwise && !addr) return -EINVAL; /* * In many cases we won't actually need this, but it's better * to do it first in case the allocation fails. Don't use wext. */ if (!wdev->wext.keys) { wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys), GFP_KERNEL); if (!wdev->wext.keys) return -ENOMEM; for (i = 0; i < 4; i++) wdev->wext.keys->params[i].key = wdev->wext.keys->data[i]; } if (wdev->iftype != NL80211_IFTYPE_ADHOC && wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC) { if (!wdev->connected) return -ENOLINK; if (!rdev->ops->set_default_mgmt_key) return -EOPNOTSUPP; if (idx < 4 || idx > 5) return -EINVAL; } else if (idx < 0 || idx > 3) return -EINVAL; if (remove) { err = 0; if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) { /* * If removing the current TX key, we will need to * join a new IBSS without the privacy bit clear. */ if (idx == wdev->wext.default_key && wdev->iftype == NL80211_IFTYPE_ADHOC) { cfg80211_leave_ibss(rdev, wdev->netdev, true); rejoin = true; } if (!pairwise && addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) err = -ENOENT; else err = rdev_del_key(rdev, dev, -1, idx, pairwise, addr); } wdev->wext.connect.privacy = false; /* * Applications using wireless extensions expect to be * able to delete keys that don't exist, so allow that. */ if (err == -ENOENT) err = 0; if (!err) { if (!addr && idx < 4) { memset(wdev->wext.keys->data[idx], 0, sizeof(wdev->wext.keys->data[idx])); wdev->wext.keys->params[idx].key_len = 0; wdev->wext.keys->params[idx].cipher = 0; } if (idx == wdev->wext.default_key) wdev->wext.default_key = -1; else if (idx == wdev->wext.default_mgmt_key) wdev->wext.default_mgmt_key = -1; } if (!err && rejoin) err = cfg80211_ibss_wext_join(rdev, wdev); return err; } if (addr) tx_key = false; if (cfg80211_validate_key_settings(rdev, params, idx, pairwise, addr)) return -EINVAL; err = 0; if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) err = rdev_add_key(rdev, dev, -1, idx, pairwise, addr, params); else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 && params->cipher != WLAN_CIPHER_SUITE_WEP104) return -EINVAL; if (err) return err; /* * We only need to store WEP keys, since they're the only keys that * can be set before a connection is established and persist after * disconnecting. */ if (!addr && (params->cipher == WLAN_CIPHER_SUITE_WEP40 || params->cipher == WLAN_CIPHER_SUITE_WEP104)) { wdev->wext.keys->params[idx] = *params; memcpy(wdev->wext.keys->data[idx], params->key, params->key_len); wdev->wext.keys->params[idx].key = wdev->wext.keys->data[idx]; } if ((params->cipher == WLAN_CIPHER_SUITE_WEP40 || params->cipher == WLAN_CIPHER_SUITE_WEP104) && (tx_key || (!addr && wdev->wext.default_key == -1))) { if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) { /* * If we are getting a new TX key from not having * had one before we need to join a new IBSS with * the privacy bit set. */ if (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->wext.default_key == -1) { cfg80211_leave_ibss(rdev, wdev->netdev, true); rejoin = true; } err = rdev_set_default_key(rdev, dev, -1, idx, true, true); } if (!err) { wdev->wext.default_key = idx; if (rejoin) err = cfg80211_ibss_wext_join(rdev, wdev); } return err; } if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC && (tx_key || (!addr && wdev->wext.default_mgmt_key == -1))) { if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) err = rdev_set_default_mgmt_key(rdev, dev, -1, idx); if (!err) wdev->wext.default_mgmt_key = idx; return err; } return 0; } static int cfg80211_wext_siwencode(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *keybuf) { struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct key_params params; bool remove = false; int idx; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; /* no use -- only MFP (set_default_mgmt_key) is optional */ if (!rdev->ops->del_key || !rdev->ops->add_key || !rdev->ops->set_default_key) return -EOPNOTSUPP; guard(wiphy)(&rdev->wiphy); if (wdev->valid_links) return -EOPNOTSUPP; idx = erq->flags & IW_ENCODE_INDEX; if (idx == 0) { idx = wdev->wext.default_key; if (idx < 0) idx = 0; } else if (idx < 1 || idx > 4) { return -EINVAL; } else { idx--; } if (erq->flags & IW_ENCODE_DISABLED) remove = true; else if (erq->length == 0) { /* No key data - just set the default TX key index */ int err = 0; if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) err = rdev_set_default_key(rdev, dev, -1, idx, true, true); if (!err) wdev->wext.default_key = idx; return err; } memset(¶ms, 0, sizeof(params)); params.key = keybuf; params.key_len = erq->length; if (erq->length == 5) params.cipher = WLAN_CIPHER_SUITE_WEP40; else if (erq->length == 13) params.cipher = WLAN_CIPHER_SUITE_WEP104; else if (!remove) return -EINVAL; return cfg80211_set_encryption(rdev, dev, false, NULL, remove, wdev->wext.default_key == -1, idx, ¶ms); } static int cfg80211_wext_siwencodeext(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct iw_encode_ext *ext = (struct iw_encode_ext *) extra; const u8 *addr; int idx; bool remove = false; struct key_params params; u32 cipher; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; /* no use -- only MFP (set_default_mgmt_key) is optional */ if (!rdev->ops->del_key || !rdev->ops->add_key || !rdev->ops->set_default_key) return -EOPNOTSUPP; if (wdev->valid_links) return -EOPNOTSUPP; switch (ext->alg) { case IW_ENCODE_ALG_NONE: remove = true; cipher = 0; break; case IW_ENCODE_ALG_WEP: if (ext->key_len == 5) cipher = WLAN_CIPHER_SUITE_WEP40; else if (ext->key_len == 13) cipher = WLAN_CIPHER_SUITE_WEP104; else return -EINVAL; break; case IW_ENCODE_ALG_TKIP: cipher = WLAN_CIPHER_SUITE_TKIP; break; case IW_ENCODE_ALG_CCMP: cipher = WLAN_CIPHER_SUITE_CCMP; break; case IW_ENCODE_ALG_AES_CMAC: cipher = WLAN_CIPHER_SUITE_AES_CMAC; break; default: return -EOPNOTSUPP; } if (erq->flags & IW_ENCODE_DISABLED) remove = true; idx = erq->flags & IW_ENCODE_INDEX; if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) { if (idx < 4 || idx > 5) { idx = wdev->wext.default_mgmt_key; if (idx < 0) return -EINVAL; } else idx--; } else { if (idx < 1 || idx > 4) { idx = wdev->wext.default_key; if (idx < 0) return -EINVAL; } else idx--; } addr = ext->addr.sa_data; if (is_broadcast_ether_addr(addr)) addr = NULL; memset(¶ms, 0, sizeof(params)); params.key = ext->key; params.key_len = ext->key_len; params.cipher = cipher; if (ext->ext_flags & IW_ENCODE_EXT_RX_SEQ_VALID) { params.seq = ext->rx_seq; params.seq_len = 6; } guard(wiphy)(wdev->wiphy); return cfg80211_set_encryption(rdev, dev, !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY), addr, remove, ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY, idx, ¶ms); } static int cfg80211_wext_giwencode(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *keybuf) { struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; int idx; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; idx = erq->flags & IW_ENCODE_INDEX; if (idx == 0) { idx = wdev->wext.default_key; if (idx < 0) idx = 0; } else if (idx < 1 || idx > 4) return -EINVAL; else idx--; erq->flags = idx + 1; if (!wdev->wext.keys || !wdev->wext.keys->params[idx].cipher) { erq->flags |= IW_ENCODE_DISABLED; erq->length = 0; return 0; } erq->length = min_t(size_t, erq->length, wdev->wext.keys->params[idx].key_len); memcpy(keybuf, wdev->wext.keys->params[idx].key, erq->length); erq->flags |= IW_ENCODE_ENABLED; return 0; } static int cfg80211_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_freq *wextfreq = &wrqu->freq; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_chan_def chandef = { .width = NL80211_CHAN_WIDTH_20_NOHT, }; int freq; guard(wiphy)(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: return cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra); case NL80211_IFTYPE_ADHOC: return cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); case NL80211_IFTYPE_MONITOR: freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; if (freq == 0) return -EINVAL; chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); if (!chandef.chan) return -EINVAL; return cfg80211_set_monitor_channel(rdev, dev, &chandef); case NL80211_IFTYPE_MESH_POINT: freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; if (freq == 0) return -EINVAL; chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); if (!chandef.chan) return -EINVAL; return cfg80211_set_mesh_channel(rdev, wdev, &chandef); default: return -EOPNOTSUPP; } } static int cfg80211_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_freq *freq = &wrqu->freq; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_chan_def chandef = {}; int ret; guard(wiphy)(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: return cfg80211_mgd_wext_giwfreq(dev, info, freq, extra); case NL80211_IFTYPE_ADHOC: return cfg80211_ibss_wext_giwfreq(dev, info, freq, extra); case NL80211_IFTYPE_MONITOR: if (!rdev->ops->get_channel) return -EINVAL; ret = rdev_get_channel(rdev, wdev, 0, &chandef); if (ret) return ret; freq->m = chandef.chan->center_freq; freq->e = 6; return ret; default: return -EINVAL; } } static int cfg80211_wext_siwtxpower(struct net_device *dev, struct iw_request_info *info, union iwreq_data *data, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); enum nl80211_tx_power_setting type; int dbm = 0; if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) return -EINVAL; if (data->txpower.flags & IW_TXPOW_RANGE) return -EINVAL; if (!rdev->ops->set_tx_power) return -EOPNOTSUPP; /* only change when not disabling */ if (!data->txpower.disabled) { rfkill_set_sw_state(rdev->wiphy.rfkill, false); if (data->txpower.fixed) { /* * wext doesn't support negative values, see * below where it's for automatic */ if (data->txpower.value < 0) return -EINVAL; dbm = data->txpower.value; type = NL80211_TX_POWER_FIXED; /* TODO: do regulatory check! */ } else { /* * Automatic power level setting, max being the value * passed in from userland. */ if (data->txpower.value < 0) { type = NL80211_TX_POWER_AUTOMATIC; } else { dbm = data->txpower.value; type = NL80211_TX_POWER_LIMITED; } } } else { if (rfkill_set_sw_state(rdev->wiphy.rfkill, true)) schedule_work(&rdev->rfkill_block); return 0; } guard(wiphy)(&rdev->wiphy); return rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); } static int cfg80211_wext_giwtxpower(struct net_device *dev, struct iw_request_info *info, union iwreq_data *data, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int err, val; if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) return -EINVAL; if (data->txpower.flags & IW_TXPOW_RANGE) return -EINVAL; if (!rdev->ops->get_tx_power) return -EOPNOTSUPP; scoped_guard(wiphy, &rdev->wiphy) { err = rdev_get_tx_power(rdev, wdev, 0, &val); } if (err) return err; /* well... oh well */ data->txpower.fixed = 1; data->txpower.disabled = rfkill_blocked(rdev->wiphy.rfkill); data->txpower.value = val; data->txpower.flags = IW_TXPOW_DBM; return 0; } static int cfg80211_set_auth_alg(struct wireless_dev *wdev, s32 auth_alg) { int nr_alg = 0; if (!auth_alg) return -EINVAL; if (auth_alg & ~(IW_AUTH_ALG_OPEN_SYSTEM | IW_AUTH_ALG_SHARED_KEY | IW_AUTH_ALG_LEAP)) return -EINVAL; if (auth_alg & IW_AUTH_ALG_OPEN_SYSTEM) { nr_alg++; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM; } if (auth_alg & IW_AUTH_ALG_SHARED_KEY) { nr_alg++; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_SHARED_KEY; } if (auth_alg & IW_AUTH_ALG_LEAP) { nr_alg++; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_NETWORK_EAP; } if (nr_alg > 1) wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; return 0; } static int cfg80211_set_wpa_version(struct wireless_dev *wdev, u32 wpa_versions) { if (wpa_versions & ~(IW_AUTH_WPA_VERSION_WPA | IW_AUTH_WPA_VERSION_WPA2| IW_AUTH_WPA_VERSION_DISABLED)) return -EINVAL; if ((wpa_versions & IW_AUTH_WPA_VERSION_DISABLED) && (wpa_versions & (IW_AUTH_WPA_VERSION_WPA| IW_AUTH_WPA_VERSION_WPA2))) return -EINVAL; if (wpa_versions & IW_AUTH_WPA_VERSION_DISABLED) wdev->wext.connect.crypto.wpa_versions &= ~(NL80211_WPA_VERSION_1|NL80211_WPA_VERSION_2); if (wpa_versions & IW_AUTH_WPA_VERSION_WPA) wdev->wext.connect.crypto.wpa_versions |= NL80211_WPA_VERSION_1; if (wpa_versions & IW_AUTH_WPA_VERSION_WPA2) wdev->wext.connect.crypto.wpa_versions |= NL80211_WPA_VERSION_2; return 0; } static int cfg80211_set_cipher_group(struct wireless_dev *wdev, u32 cipher) { if (cipher & IW_AUTH_CIPHER_WEP40) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_WEP40; else if (cipher & IW_AUTH_CIPHER_WEP104) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_WEP104; else if (cipher & IW_AUTH_CIPHER_TKIP) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_TKIP; else if (cipher & IW_AUTH_CIPHER_CCMP) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_CCMP; else if (cipher & IW_AUTH_CIPHER_AES_CMAC) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_AES_CMAC; else if (cipher & IW_AUTH_CIPHER_NONE) wdev->wext.connect.crypto.cipher_group = 0; else return -EINVAL; return 0; } static int cfg80211_set_cipher_pairwise(struct wireless_dev *wdev, u32 cipher) { int nr_ciphers = 0; u32 *ciphers_pairwise = wdev->wext.connect.crypto.ciphers_pairwise; if (cipher & IW_AUTH_CIPHER_WEP40) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_WEP40; nr_ciphers++; } if (cipher & IW_AUTH_CIPHER_WEP104) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_WEP104; nr_ciphers++; } if (cipher & IW_AUTH_CIPHER_TKIP) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_TKIP; nr_ciphers++; } if (cipher & IW_AUTH_CIPHER_CCMP) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_CCMP; nr_ciphers++; } if (cipher & IW_AUTH_CIPHER_AES_CMAC) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_AES_CMAC; nr_ciphers++; } BUILD_BUG_ON(NL80211_MAX_NR_CIPHER_SUITES < 5); wdev->wext.connect.crypto.n_ciphers_pairwise = nr_ciphers; return 0; } static int cfg80211_set_key_mgt(struct wireless_dev *wdev, u32 key_mgt) { int nr_akm_suites = 0; if (key_mgt & ~(IW_AUTH_KEY_MGMT_802_1X | IW_AUTH_KEY_MGMT_PSK)) return -EINVAL; if (key_mgt & IW_AUTH_KEY_MGMT_802_1X) { wdev->wext.connect.crypto.akm_suites[nr_akm_suites] = WLAN_AKM_SUITE_8021X; nr_akm_suites++; } if (key_mgt & IW_AUTH_KEY_MGMT_PSK) { wdev->wext.connect.crypto.akm_suites[nr_akm_suites] = WLAN_AKM_SUITE_PSK; nr_akm_suites++; } wdev->wext.connect.crypto.n_akm_suites = nr_akm_suites; return 0; } static int cfg80211_wext_siwauth(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *data = &wrqu->param; struct wireless_dev *wdev = dev->ieee80211_ptr; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; switch (data->flags & IW_AUTH_INDEX) { case IW_AUTH_PRIVACY_INVOKED: wdev->wext.connect.privacy = data->value; return 0; case IW_AUTH_WPA_VERSION: return cfg80211_set_wpa_version(wdev, data->value); case IW_AUTH_CIPHER_GROUP: return cfg80211_set_cipher_group(wdev, data->value); case IW_AUTH_KEY_MGMT: return cfg80211_set_key_mgt(wdev, data->value); case IW_AUTH_CIPHER_PAIRWISE: return cfg80211_set_cipher_pairwise(wdev, data->value); case IW_AUTH_80211_AUTH_ALG: return cfg80211_set_auth_alg(wdev, data->value); case IW_AUTH_WPA_ENABLED: case IW_AUTH_RX_UNENCRYPTED_EAPOL: case IW_AUTH_DROP_UNENCRYPTED: case IW_AUTH_MFP: return 0; default: return -EOPNOTSUPP; } } static int cfg80211_wext_giwauth(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { /* XXX: what do we need? */ return -EOPNOTSUPP; } static int cfg80211_wext_siwpower(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *wrq = &wrqu->power; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); bool ps; int timeout = wdev->ps_timeout; int err; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EINVAL; if (!rdev->ops->set_power_mgmt) return -EOPNOTSUPP; if (wrq->disabled) { ps = false; } else { switch (wrq->flags & IW_POWER_MODE) { case IW_POWER_ON: /* If not specified */ case IW_POWER_MODE: /* If set all mask */ case IW_POWER_ALL_R: /* If explicitly state all */ ps = true; break; default: /* Otherwise we ignore */ return -EINVAL; } if (wrq->flags & ~(IW_POWER_MODE | IW_POWER_TIMEOUT)) return -EINVAL; if (wrq->flags & IW_POWER_TIMEOUT) timeout = wrq->value / 1000; } guard(wiphy)(&rdev->wiphy); err = rdev_set_power_mgmt(rdev, dev, ps, timeout); if (err) return err; wdev->ps = ps; wdev->ps_timeout = timeout; return 0; } static int cfg80211_wext_giwpower(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *wrq = &wrqu->power; struct wireless_dev *wdev = dev->ieee80211_ptr; wrq->disabled = !wdev->ps; return 0; } static int cfg80211_wext_siwrate(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *rate = &wrqu->bitrate; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bitrate_mask mask; u32 fixed, maxrate; struct ieee80211_supported_band *sband; bool match = false; int band, ridx; if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; memset(&mask, 0, sizeof(mask)); fixed = 0; maxrate = (u32)-1; if (rate->value < 0) { /* nothing */ } else if (rate->fixed) { fixed = rate->value / 100000; } else { maxrate = rate->value / 100000; } for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wdev->wiphy->bands[band]; if (sband == NULL) continue; for (ridx = 0; ridx < sband->n_bitrates; ridx++) { struct ieee80211_rate *srate = &sband->bitrates[ridx]; if (fixed == srate->bitrate) { mask.control[band].legacy = 1 << ridx; match = true; break; } if (srate->bitrate <= maxrate) { mask.control[band].legacy |= 1 << ridx; match = true; } } } if (!match) return -EINVAL; guard(wiphy)(&rdev->wiphy); if (dev->ieee80211_ptr->valid_links) return -EOPNOTSUPP; return rdev_set_bitrate_mask(rdev, dev, 0, NULL, &mask); } static int cfg80211_wext_giwrate(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *rate = &wrqu->bitrate; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct station_info sinfo = {}; u8 addr[ETH_ALEN]; int err; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!rdev->ops->get_station) return -EOPNOTSUPP; err = 0; if (!wdev->valid_links && wdev->links[0].client.current_bss) memcpy(addr, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN); else err = -EOPNOTSUPP; if (err) return err; scoped_guard(wiphy, &rdev->wiphy) { err = rdev_get_station(rdev, dev, addr, &sinfo); } if (err) return err; if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) { err = -EOPNOTSUPP; goto free; } rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate); free: cfg80211_sinfo_release_content(&sinfo); return err; } /* Get wireless statistics. Called by /proc/net/wireless and by SIOCGIWSTATS */ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); /* we are under RTNL - globally locked - so can use static structs */ static struct iw_statistics wstats; static struct station_info sinfo = {}; u8 bssid[ETH_ALEN]; int ret; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) return NULL; if (!rdev->ops->get_station) return NULL; /* Grab BSSID of current BSS, if any */ wiphy_lock(&rdev->wiphy); if (wdev->valid_links || !wdev->links[0].client.current_bss) { wiphy_unlock(&rdev->wiphy); return NULL; } memcpy(bssid, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN); memset(&sinfo, 0, sizeof(sinfo)); ret = rdev_get_station(rdev, dev, bssid, &sinfo); wiphy_unlock(&rdev->wiphy); if (ret) return NULL; memset(&wstats, 0, sizeof(wstats)); switch (rdev->wiphy.signal_type) { case CFG80211_SIGNAL_TYPE_MBM: if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) { int sig = sinfo.signal; wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; wstats.qual.updated |= IW_QUAL_DBM; wstats.qual.level = sig; if (sig < -110) sig = -110; else if (sig > -40) sig = -40; wstats.qual.qual = sig + 110; break; } fallthrough; case CFG80211_SIGNAL_TYPE_UNSPEC: if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) { wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; wstats.qual.level = sinfo.signal; wstats.qual.qual = sinfo.signal; break; } fallthrough; default: wstats.qual.updated |= IW_QUAL_LEVEL_INVALID; wstats.qual.updated |= IW_QUAL_QUAL_INVALID; } wstats.qual.updated |= IW_QUAL_NOISE_INVALID; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC)) wstats.discard.misc = sinfo.rx_dropped_misc; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED)) wstats.discard.retries = sinfo.tx_failed; cfg80211_sinfo_release_content(&sinfo); return &wstats; } static int cfg80211_wext_siwap(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); guard(wiphy)(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: return cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); case NL80211_IFTYPE_STATION: return cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); default: return -EOPNOTSUPP; } } static int cfg80211_wext_giwap(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); guard(wiphy)(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: return cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); case NL80211_IFTYPE_STATION: return cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); default: return -EOPNOTSUPP; } } static int cfg80211_wext_siwessid(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *ssid) { struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_r |