Total coverage: 153623 (9%)of 1917583
53 53 41 47 47 47 47 10 28 26 15 7 7 12 12 37 38 8 27 11 34 67 67 67 1 51 50 93 90 1 1 1 7 5 2 64 48 78 78 9 73 7 72 1 62 20 14 1 1 3 66 1 36 15 15 53 38 38 41 4 4 1 3 3 1 3 6 1 1 4 1 3 1 1 20 20 10 10 14 5 2 7 1 1 83 11 43 43 2 6 42 6 38 3 8 6 3 8 7 3 9 3 6 11 27 8 24 106 1 106 1 6 20 85 125 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	ip6_flowlabel.c		IPv6 flowlabel manager.
 *
 *	Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/pid_namespace.h>
#include <linux/jump_label_ratelimit.h>

#include <net/net_namespace.h>
#include <net/sock.h>

#include <net/ipv6.h>
#include <net/rawv6.h>
#include <net/transp_v6.h>

#include <linux/uaccess.h>

#define FL_MIN_LINGER	6	/* Minimal linger. It is set to 6sec specified
				   in old IPv6 RFC. Well, it was reasonable value.
				 */
#define FL_MAX_LINGER	150	/* Maximal linger timeout */

/* FL hash table */

#define FL_MAX_PER_SOCK	32
#define FL_MAX_SIZE	4096
#define FL_HASH_MASK	255
#define FL_HASH(l)	(ntohl(l)&FL_HASH_MASK)

/* Number of entries currently linked into fl_ht[]. */
static atomic_t fl_size = ATOMIC_INIT(0);
/* Hash buckets, indexed by FL_HASH(label); chains are walked under RCU. */
static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1];

static void ip6_fl_gc(struct timer_list *unused);
static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc);

/*
 * FL hash table lock: taken by every writer of fl_ht[] in this file
 * (insert, GC, purge) and around updates of an entry's timing fields.
 * Readers walk the chains under RCU instead.
 */

static DEFINE_SPINLOCK(ip6_fl_lock);

/* Lock serializing modifications of the per-socket flow label lists. */

static DEFINE_SPINLOCK(ip6_sk_fl_lock);

DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ);
EXPORT_SYMBOL(ipv6_flowlabel_exclusive);

/* Iterate over every entry of hash bucket @hash (uses rcu_dereference()). */
#define for_each_fl_rcu(hash, fl)				\
	for (fl = rcu_dereference(fl_ht[(hash)]);		\
	     fl != NULL;					\
	     fl = rcu_dereference(fl->next))
/* Continue a bucket walk with the entry following @fl. */
#define for_each_fl_continue_rcu(fl)				\
	for (fl = rcu_dereference(fl->next);			\
	     fl != NULL;					\
	     fl = rcu_dereference(fl->next))

/* Iterate over the flow label list attached to socket @sk. */
#define for_each_sk_fl_rcu(sk, sfl)				\
	for (sfl = rcu_dereference(inet_sk(sk)->ipv6_fl_list);	\
	     sfl != NULL;					\
	     sfl = rcu_dereference(sfl->next))

/*
 * Find the entry with the given label in namespace @net.
 *
 * Walks the bucket with rcu_dereference(); no reference is taken on the
 * returned entry.  Returns NULL when nothing matches.
 */
static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label)
{
	struct ip6_flowlabel *fl;

	for_each_fl_rcu(FL_HASH(label), fl) {
		if (fl->label == label && net_eq(fl->fl_net, net))
			return fl;
	}
	return NULL;
}

/*
 * Look up a label and take a user reference on it.
 *
 * Returns NULL if the label is not in the table or if its user count has
 * already dropped to zero (atomic_inc_not_zero() failed).
 */
static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label)
{
	struct ip6_flowlabel *fl;

	rcu_read_lock();
	fl = __fl_lookup(net, label);
	if (fl && !atomic_inc_not_zero(&fl->users))
		fl = NULL;
	rcu_read_unlock();
	return fl;
}

/* True for the share modes EXCL, PROCESS and USER. */
static bool fl_shared_exclusive(struct ip6_flowlabel *fl)
{
	return fl->share == IPV6_FL_S_EXCL ||
	       fl->share == IPV6_FL_S_PROCESS ||
	       fl->share == IPV6_FL_S_USER;
}

/*
 * RCU callback that actually frees an entry: drops the pid reference held
 * by a PROCESS-shared label, then frees the options and the entry itself.
 */
static void fl_free_rcu(struct rcu_head *head)
{
	struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu);

	if (fl->share == IPV6_FL_S_PROCESS)
		put_pid(fl->owner.pid);
	kfree(fl->opt);
	kfree(fl);
}


/*
 * Dispose of an entry (NULL is accepted and ignored).
 *
 * Undoes the static key increment done in fl_create() for entries that are
 * exclusive-shared or carry options, then frees via call_rcu().
 */
static void fl_free(struct ip6_flowlabel *fl)
{
	if (!fl)
		return;

	if (fl_shared_exclusive(fl) || fl->opt)
		static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive);

	call_rcu(&fl->rcu, fl_free_rcu);
}

/*
 * Drop one user reference.
 *
 * The entry is not freed here.  When the last user goes away the expiry
 * time is pushed out to at least lastuse + linger, the options of an
 * EXCL-shared label are freed, and the GC timer is (re)armed if it is not
 * pending or would fire later than this entry's expiry.
 */
static void fl_release(struct ip6_flowlabel *fl)
{
	spin_lock_bh(&ip6_fl_lock);

	fl->lastuse = jiffies;
	if (atomic_dec_and_test(&fl->users)) {
		unsigned long ttd = fl->lastuse + fl->linger;
		if (time_after(ttd, fl->expires))
			fl->expires = ttd;
		ttd = fl->expires;
		if (fl->opt && fl->share == IPV6_FL_S_EXCL) {
			struct ipv6_txoptions *opt = fl->opt;
			fl->opt = NULL;
			kfree(opt);
		}
		if (!timer_pending(&ip6_fl_gc_timer) ||
		    time_after(ip6_fl_gc_timer.expires, ttd))
			mod_timer(&ip6_fl_gc_timer, ttd);
	}
	spin_unlock_bh(&ip6_fl_lock);
}

/*
 * GC timer handler.
 *
 * Scans every bucket; entries with no users whose expiry has passed are
 * unlinked and freed.  "sched" tracks the earliest expiry among the unused
 * entries that remain, and the timer is re-armed for it.
 */
static void ip6_fl_gc(struct timer_list *unused)
{
	int i;
	unsigned long now = jiffies;
	unsigned long sched = 0;

	spin_lock(&ip6_fl_lock);

	for (i = 0; i <= FL_HASH_MASK; i++) {
		struct ip6_flowlabel *fl;
		struct ip6_flowlabel __rcu **flp;

		flp = &fl_ht[i];
		while ((fl = rcu_dereference_protected(*flp,
						       lockdep_is_held(&ip6_fl_lock))) != NULL) {
			if (atomic_read(&fl->users) == 0) {
				unsigned long ttd = fl->lastuse + fl->linger;
				if (time_after(ttd, fl->expires))
					fl->expires = ttd;
				ttd = fl->expires;
				if (time_after_eq(now, ttd)) {
					/* expired: unlink and stay on the same slot */
					*flp = fl->next;
					fl_free(fl);
					atomic_dec(&fl_size);
					continue;
				}
				if (!sched || time_before(ttd, sched))
					sched = ttd;
			}
			flp = &fl->next;
		}
	}
	/*
	 * Nothing to wait for but the table is not empty (all entries in
	 * use): poll again later.
	 * NOTE(review): FL_MAX_LINGER is added to jiffies without HZ scaling
	 * here, unlike check_linger() - confirm this is intended.
	 */
	if (!sched && atomic_read(&fl_size))
		sched = now + FL_MAX_LINGER;
	if (sched) {
		mod_timer(&ip6_fl_gc_timer, sched);
	}
	spin_unlock(&ip6_fl_lock);
}

/*
 * Namespace exit: unlink and free every entry that belongs to @net and has
 * no users.  Entries of @net that still have users are left in the table.
 */
static void __net_exit ip6_fl_purge(struct net *net)
{
	int i;

	spin_lock_bh(&ip6_fl_lock);
	for (i = 0; i <= FL_HASH_MASK; i++) {
		struct ip6_flowlabel *fl;
		struct ip6_flowlabel __rcu **flp;

		flp = &fl_ht[i];
		while ((fl = rcu_dereference_protected(*flp,
						       lockdep_is_held(&ip6_fl_lock))) != NULL) {
			if (net_eq(fl->fl_net, net) &&
			    atomic_read(&fl->users) == 0) {
				*flp = fl->next;
				fl_free(fl);
				atomic_dec(&fl_size);
				continue;
			}
			flp = &fl->next;
		}
	}
	spin_unlock_bh(&ip6_fl_lock);
}

/*
 * Insert @fl into the hash table.
 *
 * With @label == 0 a random non-zero label that is unused in @net is picked.
 * Otherwise, if an entry with that label already exists, a user reference is
 * taken on it and it is returned instead of inserting @fl.
 *
 * Returns NULL when @fl was inserted, or the pre-existing entry.
 */
static struct ip6_flowlabel *fl_intern(struct net *net,
				       struct ip6_flowlabel *fl, __be32 label)
{
	struct ip6_flowlabel *lfl;

	fl->label = label & IPV6_FLOWLABEL_MASK;

	rcu_read_lock();
	spin_lock_bh(&ip6_fl_lock);
	if (label == 0) {
		for (;;) {
			fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK;
			if (fl->label) {
				lfl = __fl_lookup(net, fl->label);
				if (!lfl)
					break;
			}
		}
	} else {
		/*
		 * we dropped the ip6_fl_lock, so this entry could reappear
		 * and we need to recheck with it.
		 *
		 * OTOH no need to search the active socket first, like it is
		 * done in ipv6_flowlabel_opt - sock is locked, so new entry
		 * with the same label can only appear on another sock
		 */
		lfl = __fl_lookup(net, fl->label);
		if (lfl) {
			atomic_inc(&lfl->users);
			spin_unlock_bh(&ip6_fl_lock);
			rcu_read_unlock();
			return lfl;
		}
	}

	fl->lastuse = jiffies;
	fl->next = fl_ht[FL_HASH(fl->label)];
	rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl);
	atomic_inc(&fl_size);
	spin_unlock_bh(&ip6_fl_lock);
	rcu_read_unlock();
	return NULL;
}



/* Socket flowlabel lists */

/*
 * Find @label on the socket's own list and take a user reference on it.
 *
 * Only the IPV6_FLOWLABEL_MASK bits of @label are compared.  On success the
 * entry's lastuse stamp is refreshed.  Returns NULL if the socket has no such
 * label or its user count is already zero.
 */
struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label)
{
	struct ipv6_fl_socklist *sfl;

	label &= IPV6_FLOWLABEL_MASK;

	rcu_read_lock();
	for_each_sk_fl_rcu(sk, sfl) {
		struct ip6_flowlabel *fl = sfl->fl;

		if (fl->label == label && atomic_inc_not_zero(&fl->users)) {
			fl->lastuse = jiffies;
			rcu_read_unlock();
			return fl;
		}
	}
	rcu_read_unlock();
	return NULL;
}
EXPORT_SYMBOL_GPL(__fl6_sock_lookup);

/*
 * Detach and release every flow label held by @sk.
 *
 * ip6_sk_fl_lock is dropped around fl_release() for each element; the list
 * nodes themselves are freed with kfree_rcu().
 */
void fl6_free_socklist(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_fl_socklist *sfl;

	if (!rcu_access_pointer(inet->ipv6_fl_list))
		return;

	spin_lock_bh(&ip6_sk_fl_lock);
	while ((sfl = rcu_dereference_protected(inet->ipv6_fl_list,
						lockdep_is_held(&ip6_sk_fl_lock))) != NULL) {
		inet->ipv6_fl_list = sfl->next;
		spin_unlock_bh(&ip6_sk_fl_lock);

		fl_release(sfl->fl);
		kfree_rcu(sfl, rcu);

		spin_lock_bh(&ip6_sk_fl_lock);
	}
	spin_unlock_bh(&ip6_sk_fl_lock);
}

/* Service routines */


/*
   It is the only difficult place. flowlabel enforces equal headers
   before and including routing header, however user may supply options
   following rthdr.
 */

/*
 * Combine the label's options with per-call options @fopt.
 *
 * - If @fopt is NULL or has opt_flen == 0, the label's options are returned
 *   as they are (possibly NULL).
 * - If the label has no options and @fopt has opt_nflen == 0, @fopt is
 *   returned as it is.
 * - Otherwise @opt_space is filled with hopopt/dst0opt/srcrt/opt_nflen from
 *   the label (or cleared) and dst1opt/opt_flen/tot_len from @fopt, and
 *   @opt_space is returned.
 */
struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
					 struct ip6_flowlabel *fl,
					 struct ipv6_txoptions *fopt)
{
	struct ipv6_txoptions *fl_opt = fl->opt;

	if (!fopt || fopt->opt_flen == 0)
		return fl_opt;

	if (fl_opt) {
		opt_space->hopopt = fl_opt->hopopt;
		opt_space->dst0opt = fl_opt->dst0opt;
		opt_space->srcrt = fl_opt->srcrt;
		opt_space->opt_nflen = fl_opt->opt_nflen;
	} else {
		if (fopt->opt_nflen == 0)
			return fopt;
		opt_space->hopopt = NULL;
		opt_space->dst0opt = NULL;
		opt_space->srcrt = NULL;
		opt_space->opt_nflen = 0;
	}
	opt_space->dst1opt = fopt->dst1opt;
	opt_space->opt_flen = fopt->opt_flen;
	opt_space->tot_len = fopt->tot_len;
	return opt_space;
}
EXPORT_SYMBOL_GPL(fl6_merge_options);

/*
 * Convert a timeout given in seconds into jiffies.
 *
 * Values below FL_MIN_LINGER are raised to FL_MIN_LINGER.  Values above
 * FL_MAX_LINGER are only accepted with CAP_NET_ADMIN; otherwise 0 is
 * returned, which callers treat as "not permitted".
 */
static unsigned long check_linger(unsigned long ttl)
{
	if (ttl < FL_MIN_LINGER)
		return FL_MIN_LINGER*HZ;
	if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN))
		return 0;
	return ttl*HZ;
}

/*
 * Extend the lifetime of @fl.
 *
 * @linger and @expires are in seconds and validated by check_linger();
 * -EPERM is returned if either is rejected.  Under ip6_fl_lock the entry's
 * linger and expiry are only ever moved forward, never shortened.
 */
static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires)
{
	linger = check_linger(linger);
	if (!linger)
		return -EPERM;
	expires = check_linger(expires);
	if (!expires)
		return -EPERM;

	spin_lock_bh(&ip6_fl_lock);
	fl->lastuse = jiffies;
	if (time_before(fl->linger, linger))
		fl->linger = linger;
	if (time_before(expires, fl->linger))
		expires = fl->linger;
	if (time_before(fl->expires, fl->lastuse + expires))
		fl->expires = fl->lastuse + expires;
	spin_unlock_bh(&ip6_fl_lock);

	return 0;
}

/*
 * Allocate and initialise a new flow label from a user request.
 *
 * Any bytes of @optval beyond CMSG_ALIGN(sizeof(*freq)) are copied in and
 * parsed by ip6_datagram_send_ctl() as options; the request is rejected
 * with -EINVAL if that leaves opt_flen set, and the option block is dropped
 * again if opt_nflen ends up zero.
 *
 * The new entry starts with one user.  It is not inserted into the hash
 * table here.  On failure NULL is returned and *@err_p holds the error.
 */
static struct ip6_flowlabel *
fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
	  sockptr_t optval, int optlen, int *err_p)
{
	struct ip6_flowlabel *fl = NULL;
	int olen;
	int addr_type;
	int err;

	olen = optlen - CMSG_ALIGN(sizeof(*freq));
	err = -EINVAL;
	if (olen > 64 * 1024)
		goto done;

	err = -ENOMEM;
	fl = kzalloc_obj(*fl);
	if (!fl)
		goto done;

	if (olen > 0) {
		struct msghdr msg;
		struct flowi6 flowi6;
		struct ipcm6_cookie ipc6;

		err = -ENOMEM;
		fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL);
		if (!fl->opt)
			goto done;

		memset(fl->opt, 0, sizeof(*fl->opt));
		fl->opt->tot_len = sizeof(*fl->opt) + olen;
		err = -EFAULT;
		/* raw control data is stored right behind the txoptions header */
		if (copy_from_sockptr_offset(fl->opt + 1, optval,
				CMSG_ALIGN(sizeof(*freq)), olen))
			goto done;

		msg.msg_controllen = olen;
		msg.msg_control = (void *)(fl->opt+1);
		memset(&flowi6, 0, sizeof(flowi6));

		ipc6.opt = fl->opt;
		err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6);
		if (err)
			goto done;
		err = -EINVAL;
		if (fl->opt->opt_flen)
			goto done;
		if (fl->opt->opt_nflen == 0) {
			kfree(fl->opt);
			fl->opt = NULL;
		}
	}

	fl->fl_net = net;
	fl->expires = jiffies;
	err = fl6_renew(fl, freq->flr_linger, freq->flr_expires);
	if (err)
		goto done;
	fl->share = freq->flr_share;
	addr_type = ipv6_addr_type(&freq->flr_dst);
	if ((addr_type & IPV6_ADDR_MAPPED) ||
	    addr_type == IPV6_ADDR_ANY) {
		err = -EINVAL;
		goto done;
	}
	fl->dst = freq->flr_dst;
	atomic_set(&fl->users, 1);
	switch (fl->share) {
	case IPV6_FL_S_EXCL:
	case IPV6_FL_S_ANY:
		break;
	case IPV6_FL_S_PROCESS:
		fl->owner.pid = get_task_pid(current, PIDTYPE_PID);
		break;
	case IPV6_FL_S_USER:
		fl->owner.uid = current_euid();
		break;
	default:
		err = -EINVAL;
		goto done;
	}
	/* counterpart of the decrement in fl_free() */
	if (fl_shared_exclusive(fl) || fl->opt) {
		WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1);
		static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
	}
	return fl;

done:
	if (fl) {
		kfree(fl->opt);
		kfree(fl);
	}
	*err_p = err;
	return NULL;
}

/*
 * Decide whether @sk may add one more label.
 *
 * "room" is the number of free slots left out of FL_MAX_SIZE.  While more
 * than FL_MAX_SIZE - FL_MAX_PER_SOCK slots are free the request is always
 * allowed.  A full table always fails.  In between, callers without
 * CAP_NET_ADMIN get -ENOBUFS if the socket already holds FL_MAX_PER_SOCK
 * labels, if it holds any label while less than half the table is free, or
 * if less than a quarter of the table is free.
 */
static int mem_check(struct sock *sk)
{
	int room = FL_MAX_SIZE - atomic_read(&fl_size);
	struct ipv6_fl_socklist *sfl;
	int count = 0;

	if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK)
		return 0;

	rcu_read_lock();
	for_each_sk_fl_rcu(sk, sfl)
		count++;
	rcu_read_unlock();

	if (room <= 0 ||
	    ((count >= FL_MAX_PER_SOCK ||
	      (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) &&
	     !capable(CAP_NET_ADMIN)))
		return -ENOBUFS;

	return 0;
}

/* Push list node @sfl, pointing at @fl, onto the head of @sk's label list. */
static inline void fl_link(struct sock *sk, struct ipv6_fl_socklist *sfl,
		struct ip6_flowlabel *fl)
{
	struct inet_sock *inet = inet_sk(sk);

	spin_lock_bh(&ip6_sk_fl_lock);
	sfl->fl = fl;
	sfl->next = inet->ipv6_fl_list;
	rcu_assign_pointer(inet->ipv6_fl_list, sfl);
	spin_unlock_bh(&ip6_sk_fl_lock);
}

/*
 * Fill @freq with flow label information for @sk.
 *
 * - IPV6_FL_F_REMOTE in @flags: report the label bits of np->rcv_flowinfo.
 * - REPFLOW bit set on the socket: report np->flow_label.
 * - Otherwise report the entry on the socket's list whose label matches
 *   np->flow_label, or return -ENOENT if there is none.
 */
int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
			   int flags)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6_fl_socklist *sfl;

	if (flags & IPV6_FL_F_REMOTE) {
		freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK;
		return 0;
	}

	if (inet6_test_bit(REPFLOW, sk)) {
		freq->flr_label = np->flow_label;
		return 0;
	}

	rcu_read_lock();

	for_each_sk_fl_rcu(sk, sfl) {
		if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) {
			spin_lock_bh(&ip6_fl_lock);
			freq->flr_label = sfl->fl->label;
			freq->flr_dst = sfl->fl->dst;
			freq->flr_share = sfl->fl->share;
			/*
			 * NOTE(review): this difference is computed as
			 * unsigned long, whereas ip6fl_seq_show() casts the
			 * same expression to long - confirm behaviour when
			 * expires is already in the past.
			 */
			freq->flr_expires = (sfl->fl->expires - jiffies) / HZ;
			freq->flr_linger = sfl->fl->linger / HZ;

			spin_unlock_bh(&ip6_fl_lock);
			rcu_read_unlock();
			return 0;
		}
	}
	rcu_read_unlock();

	return -ENOENT;
}

/* Dereference a socket list pointer while holding ip6_sk_fl_lock. */
#define socklist_dereference(__sflp) \
	rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock))

/*
 * IPV6_FL_A_PUT: give up a label.
 *
 * With IPV6_FL_F_REFLECT this only clears the REPFLOW bit (TCP sockets
 * only; -ESRCH if it was not set).  Otherwise the label is removed from the
 * socket's list and released; -ESRCH if the socket does not hold it.
 */
static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6_fl_socklist __rcu **sflp;
	struct ipv6_fl_socklist *sfl;

	if (freq->flr_flags & IPV6_FL_F_REFLECT) {
		if (sk->sk_protocol != IPPROTO_TCP)
			return -ENOPROTOOPT;
		if (!inet6_test_bit(REPFLOW, sk))
			return -ESRCH;
		np->flow_label = 0;
		inet6_clear_bit(REPFLOW, sk);
		return 0;
	}

	spin_lock_bh(&ip6_sk_fl_lock);
	for (sflp = &inet_sk(sk)->ipv6_fl_list;
	     (sfl = socklist_dereference(*sflp)) != NULL;
	     sflp = &sfl->next) {
		if (sfl->fl->label == freq->flr_label)
			goto found;
	}
	spin_unlock_bh(&ip6_sk_fl_lock);
	return -ESRCH;
found:
	/* stop using the label as the socket's current flow label */
	if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK))
		np->flow_label &= ~IPV6_FLOWLABEL_MASK;
	*sflp = sfl->next;
	spin_unlock_bh(&ip6_sk_fl_lock);
	fl_release(sfl->fl);
	kfree_rcu(sfl, rcu);
	return 0;
}

/*
 * IPV6_FL_A_RENEW: extend the lifetime of a label.
 *
 * A label on the socket's own list is renewed directly.  Failing that, a
 * caller with CAP_NET_ADMIN in the namespace's user namespace that passes
 * flr_share == IPV6_FL_S_NONE may renew any label of the namespace.
 * Returns -ESRCH if no label was found.
 */
static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq)
{
	struct net *net = sock_net(sk);
	struct ipv6_fl_socklist *sfl;
	int err;

	rcu_read_lock();
	for_each_sk_fl_rcu(sk, sfl) {
		if (sfl->fl->label == freq->flr_label) {
			err = fl6_renew(sfl->fl, freq->flr_linger,
					freq->flr_expires);
			rcu_read_unlock();
			return err;
		}
	}
	rcu_read_unlock();

	if (freq->flr_share == IPV6_FL_S_NONE &&
	    ns_capable(net->user_ns, CAP_NET_ADMIN)) {
		struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label);

		if (fl) {
			err = fl6_renew(fl, freq->flr_linger,
					freq->flr_expires);
			fl_release(fl);
			return err;
		}
	}
	return -ESRCH;
}

/*
 * IPV6_FL_A_GET: attach a label to the socket, creating it if requested.
 *
 * With IPV6_FL_F_REFLECT this only sets the REPFLOW bit (TCP only, and not
 * while the flowlabel_consistency sysctl is set).
 *
 * Otherwise a candidate entry "fl" is built from the request first.  If the
 * requested label already exists (on this socket or in the table) the
 * request is checked against the existing entry "fl1" and, if compatible,
 * the socket is linked to fl1 and the candidate is discarded.  If it does
 * not exist and IPV6_FL_F_CREATE is set, the candidate is inserted; when the
 * request asked for label 0 the chosen label is copied back to user space.
 */
static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq,
		sockptr_t optval, int optlen)
{
	struct ipv6_fl_socklist *sfl, *sfl1 = NULL;
	struct ip6_flowlabel *fl, *fl1 = NULL;
	struct net *net = sock_net(sk);
	int err;

	if (freq->flr_flags & IPV6_FL_F_REFLECT) {
		if (net->ipv6.sysctl.flowlabel_consistency) {
			net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n");
			return -EPERM;
		}

		if (sk->sk_protocol != IPPROTO_TCP)
			return -ENOPROTOOPT;
		inet6_set_bit(REPFLOW, sk);
		return 0;
	}

	if (freq->flr_label & ~IPV6_FLOWLABEL_MASK)
		return -EINVAL;
	if (net->ipv6.sysctl.flowlabel_state_ranges &&
	    (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG))
		return -ERANGE;

	fl = fl_create(net, sk, freq, optval, optlen, &err);
	if (!fl)
		return err;

	/* allocation failure is only reported at the point sfl1 is needed */
	sfl1 = kmalloc_obj(*sfl1);

	if (freq->flr_label) {
		err = -EEXIST;
		rcu_read_lock();
		for_each_sk_fl_rcu(sk, sfl) {
			if (sfl->fl->label == freq->flr_label) {
				if (freq->flr_flags & IPV6_FL_F_EXCL) {
					rcu_read_unlock();
					goto done;
				}
				fl1 = sfl->fl;
				if (!atomic_inc_not_zero(&fl1->users))
					fl1 = NULL;
				break;
			}
		}
		rcu_read_unlock();

		if (!fl1)
			fl1 = fl_lookup(net, freq->flr_label);
		if (fl1) {
recheck:
			/* also entered from below when fl_intern() lost a race */
			err = -EEXIST;
			if (freq->flr_flags&IPV6_FL_F_EXCL)
				goto release;
			err = -EPERM;
			if (fl1->share == IPV6_FL_S_EXCL ||
			    fl1->share != fl->share ||
			    ((fl1->share == IPV6_FL_S_PROCESS) &&
			     (fl1->owner.pid != fl->owner.pid)) ||
			    ((fl1->share == IPV6_FL_S_USER) &&
			     !uid_eq(fl1->owner.uid, fl->owner.uid)))
				goto release;

			err = -ENOMEM;
			if (!sfl1)
				goto release;
			if (fl->linger > fl1->linger)
				fl1->linger = fl->linger;
			if ((long)(fl->expires - fl1->expires) > 0)
				fl1->expires = fl->expires;
			fl_link(sk, sfl1, fl1);
			fl_free(fl);
			return 0;

release:
			fl_release(fl1);
			goto done;
		}
	}
	err = -ENOENT;
	if (!(freq->flr_flags & IPV6_FL_F_CREATE))
		goto done;

	err = -ENOMEM;
	if (!sfl1)
		goto done;

	err = mem_check(sk);
	if (err != 0)
		goto done;

	fl1 = fl_intern(net, fl, freq->flr_label);
	if (fl1)
		goto recheck;

	if (!freq->flr_label) {
		size_t offset = offsetof(struct in6_flowlabel_req, flr_label);

		if (copy_to_sockptr_offset(optval, offset, &fl->label,
				sizeof(fl->label))) {
			/* Intentionally ignore fault. */
		}
	}

	fl_link(sk, sfl1, fl);
	return 0;
done:
	fl_free(fl);
	kfree(sfl1);
	return err;
}

/*
 * Entry point for the flow label socket option: copy in the request header
 * and dispatch on flr_action.  Returns -EINVAL for a short buffer or an
 * unknown action and -EFAULT if the request cannot be copied in.
 */
int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen)
{
	struct in6_flowlabel_req freq;

	if (optlen < sizeof(freq))
		return -EINVAL;
	if (copy_from_sockptr(&freq, optval, sizeof(freq)))
		return -EFAULT;

	switch (freq.flr_action) {
	case IPV6_FL_A_PUT:
		return ipv6_flowlabel_put(sk, &freq);
	case IPV6_FL_A_RENEW:
		return ipv6_flowlabel_renew(sk, &freq);
	case IPV6_FL_A_GET:
		return ipv6_flowlabel_get(sk, &freq, optval, optlen);
	default:
		return -EINVAL;
	}
}

#ifdef CONFIG_PROC_FS

/* Per-open iterator state for the "ip6_flowlabel" proc file. */
struct ip6fl_iter_state {
	struct seq_net_private p;
	struct pid_namespace *pid_ns;
	int bucket;
};

#define ip6fl_seq_private(seq)	((struct ip6fl_iter_state *)(seq)->private)

/* Return the first entry of the seq_file's namespace, scanning from bucket 0. */
static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq)
{
	struct ip6_flowlabel *fl = NULL;
	struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
	struct net *net = seq_file_net(seq);

	for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) {
		for_each_fl_rcu(state->bucket, fl) {
			if (net_eq(fl->fl_net, net))
				goto out;
		}
	}
	fl = NULL;
out:
	return fl;
}

/*
 * Return the entry of the namespace that follows @fl: first the rest of the
 * current chain, then the following buckets.  NULL at the end of the table.
 */
static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl)
{
	struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
	struct net *net = seq_file_net(seq);

	for_each_fl_continue_rcu(fl) {
		if (net_eq(fl->fl_net, net))
			goto out;
	}

try_again:
	if (++state->bucket <= FL_HASH_MASK) {
		for_each_fl_rcu(state->bucket, fl) {
			if (net_eq(fl->fl_net, net))
				goto out;
		}
		goto try_again;
	}
	fl = NULL;

out:
	return fl;
}

/* Return the entry at zero-based position @pos, or NULL if there are fewer. */
static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos)
{
	struct ip6_flowlabel *fl = ip6fl_get_first(seq);
	if (fl)
		while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL)
			--pos;
	return pos ? NULL : fl;
}

/*
 * seq_file start: takes the RCU read lock (released in ip6fl_seq_stop()).
 * Position 0 is the header token; position N > 0 maps to entry N - 1.
 */
static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct ip6fl_iter_state *state = ip6fl_seq_private(seq);

	state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb);

	rcu_read_lock();
	return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

/* seq_file next: advance from the header token or from entry @v. */
static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip6_flowlabel *fl;

	if (v == SEQ_START_TOKEN)
		fl = ip6fl_get_first(seq);
	else
		fl = ip6fl_get_next(seq, v);
	++*pos;
	return fl;
}

/* seq_file stop: drop the RCU read lock taken in ip6fl_seq_start(). */
static void ip6fl_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

/*
 * seq_file show: print the header line or one entry.  The owner column is
 * the pid (translated into the reader's pid namespace) for PROCESS-shared
 * labels, the uid (translated into the reader's user namespace) for
 * USER-shared labels, and 0 otherwise.
 */
static int ip6fl_seq_show(struct seq_file *seq, void *v)
{
	struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "Label S Owner Users Linger Expires Dst Opt\n");
	} else {
		struct ip6_flowlabel *fl = v;
		seq_printf(seq,
			   "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n",
			   (unsigned int)ntohl(fl->label),
			   fl->share,
			   ((fl->share == IPV6_FL_S_PROCESS) ?
			    pid_nr_ns(fl->owner.pid, state->pid_ns) :
			    ((fl->share == IPV6_FL_S_USER) ?
			     from_kuid_munged(seq_user_ns(seq), fl->owner.uid) :
			     0)),
			   atomic_read(&fl->users),
			   fl->linger/HZ,
			   (long)(fl->expires - jiffies)/HZ,
			   &fl->dst,
			   fl->opt ? fl->opt->opt_nflen : 0);
	}
	return 0;
}

static const struct seq_operations ip6fl_seq_ops = {
	.start	=	ip6fl_seq_start,
	.next	=	ip6fl_seq_next,
	.stop	=	ip6fl_seq_stop,
	.show	=	ip6fl_seq_show,
};

/* Create the read-only per-namespace "ip6_flowlabel" proc entry. */
static int __net_init ip6_flowlabel_proc_init(struct net *net)
{
	if (!proc_create_net("ip6_flowlabel", 0444, net->proc_net,
			&ip6fl_seq_ops, sizeof(struct ip6fl_iter_state)))
		return -ENOMEM;
	return 0;
}

/* Remove the per-namespace "ip6_flowlabel" proc entry. */
static void __net_exit ip6_flowlabel_proc_fini(struct net *net)
{
	remove_proc_entry("ip6_flowlabel", net->proc_net);
}
#else
/* Without CONFIG_PROC_FS there is no proc entry to set up or tear down. */
static inline int ip6_flowlabel_proc_init(struct net *net)
{
	return 0;
}
static inline void ip6_flowlabel_proc_fini(struct net *net)
{
}
#endif

/* Per-namespace exit: purge unused labels, then remove the proc entry. */
static void __net_exit ip6_flowlabel_net_exit(struct net *net)
{
	ip6_fl_purge(net);
	ip6_flowlabel_proc_fini(net);
}

static struct pernet_operations ip6_flowlabel_net_ops = {
	.init = ip6_flowlabel_proc_init,
	.exit = ip6_flowlabel_net_exit,
};

/* Register the per-namespace operations; returns the registration result. */
int ip6_flowlabel_init(void)
{
	return register_pernet_subsys(&ip6_flowlabel_net_ops);
}

/* Flush the deferred static key, stop the GC timer and unregister. */
void ip6_flowlabel_cleanup(void)
{
	static_key_deferred_flush(&ipv6_flowlabel_exclusive);
	timer_delete(&ip6_fl_gc_timer);
	unregister_pernet_subsys(&ip6_flowlabel_net_ops);
}
35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NetLabel NETLINK Interface * * This file defines the NETLINK interface for the NetLabel system. The * NetLabel system manages static and dynamic label mappings for network * protocols such as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 */ #ifndef _NETLABEL_USER_H #define _NETLABEL_USER_H #include <linux/types.h> #include <linux/skbuff.h> #include <linux/capability.h> #include <linux/audit.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/netlabel.h> /* NetLabel NETLINK helper functions */ /** * netlbl_netlink_auditinfo - Fetch the audit information from a NETLINK msg * @audit_info: NetLabel audit information */ static inline void netlbl_netlink_auditinfo(struct netlbl_audit *audit_info) { security_current_getlsmprop_subj(&audit_info->prop); audit_info->loginuid = audit_get_loginuid(current); audit_info->sessionid = audit_get_sessionid(current); } /* NetLabel NETLINK I/O functions */ int netlbl_netlink_init(void); /* NetLabel Audit Functions */ struct audit_buffer *netlbl_audit_start_common(int type, struct netlbl_audit *audit_info); #endif
9 14 10 5 7 10 8 6 13 4 3 3 4 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 // SPDX-License-Identifier: GPL-2.0-only /* * Cryptographic API * * Michael MIC (IEEE 802.11i/TKIP) keyed digest * * Copyright (c) 2004 Jouni Malinen <j@w1.fi> */ #include <crypto/internal/hash.h> #include <linux/unaligned.h> #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/types.h> struct michael_mic_ctx { u32 l, r; }; struct michael_mic_desc_ctx { __le32 pending; size_t pending_len; u32 l, r; }; static inline u32 xswap(u32 val) { return ((val & 0x00ff00ff) << 8) | ((val & 0xff00ff00) >> 8); } #define michael_block(l, r) \ do { \ r ^= rol32(l, 17); \ l += r; \ r ^= xswap(l); \ l += r; \ r ^= rol32(l, 3); \ l += r; \ r ^= ror32(l, 2); \ l += r; \ } while (0) static int michael_init(struct shash_desc *desc) { struct michael_mic_desc_ctx *mctx = shash_desc_ctx(desc); struct michael_mic_ctx *ctx = crypto_shash_ctx(desc->tfm); mctx->pending_len = 0; mctx->l = ctx->l; mctx->r = ctx->r; return 0; } static int michael_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct michael_mic_desc_ctx *mctx = shash_desc_ctx(desc); if (mctx->pending_len) { int flen = 4 - mctx->pending_len; if (flen > len) flen = len; memcpy((u8 *)&mctx->pending + mctx->pending_len, data, flen); mctx->pending_len += flen; data += flen; len -= flen; if (mctx->pending_len < 4) return 0; mctx->l ^= 
le32_to_cpu(mctx->pending); michael_block(mctx->l, mctx->r); mctx->pending_len = 0; } while (len >= 4) { mctx->l ^= get_unaligned_le32(data); michael_block(mctx->l, mctx->r); data += 4; len -= 4; } if (len > 0) { mctx->pending_len = len; memcpy(&mctx->pending, data, len); } return 0; } static int michael_final(struct shash_desc *desc, u8 *out) { struct michael_mic_desc_ctx *mctx = shash_desc_ctx(desc); u8 *data = (u8 *)&mctx->pending; /* Last block and padding (0x5a, 4..7 x 0) */ switch (mctx->pending_len) { case 0: mctx->l ^= 0x5a; break; case 1: mctx->l ^= data[0] | 0x5a00; break; case 2: mctx->l ^= data[0] | (data[1] << 8) | 0x5a0000; break; case 3: mctx->l ^= data[0] | (data[1] << 8) | (data[2] << 16) | 0x5a000000; break; } michael_block(mctx->l, mctx->r); /* l ^= 0; */ michael_block(mctx->l, mctx->r); put_unaligned_le32(mctx->l, out); put_unaligned_le32(mctx->r, out + 4); return 0; } static int michael_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct michael_mic_ctx *mctx = crypto_shash_ctx(tfm); if (keylen != 8) return -EINVAL; mctx->l = get_unaligned_le32(key); mctx->r = get_unaligned_le32(key + 4); return 0; } static struct shash_alg alg = { .digestsize = 8, .setkey = michael_setkey, .init = michael_init, .update = michael_update, .final = michael_final, .descsize = sizeof(struct michael_mic_desc_ctx), .base = { .cra_name = "michael_mic", .cra_driver_name = "michael_mic-generic", .cra_blocksize = 8, .cra_ctxsize = sizeof(struct michael_mic_ctx), .cra_module = THIS_MODULE, } }; static int __init michael_mic_init(void) { return crypto_register_shash(&alg); } static void __exit michael_mic_exit(void) { crypto_unregister_shash(&alg); } module_init(michael_mic_init); module_exit(michael_mic_exit); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("Michael MIC"); MODULE_AUTHOR("Jouni Malinen <j@w1.fi>"); MODULE_ALIAS_CRYPTO("michael_mic");
1 1 1 1 3 2 3 1 1 9 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2004, Instant802 Networks, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2022 Intel Corporation */ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/types.h> #include <net/ip.h> #include <net/pkt_sched.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "wme.h" /* Default mapping in classifier to work with default * queue setup. 
*/ const int ieee802_1d_to_ac[8] = { IEEE80211_AC_BE, IEEE80211_AC_BK, IEEE80211_AC_BK, IEEE80211_AC_BE, IEEE80211_AC_VI, IEEE80211_AC_VI, IEEE80211_AC_VO, IEEE80211_AC_VO }; static int wme_downgrade_ac(struct sk_buff *skb) { switch (skb->priority) { case 6: case 7: skb->priority = 5; /* VO -> VI */ return 0; case 4: case 5: skb->priority = 3; /* VI -> BE */ return 0; case 0: case 3: skb->priority = 2; /* BE -> BK */ return 0; default: return -1; } } /** * ieee80211_fix_reserved_tid - return the TID to use if this one is reserved * @tid: the assumed-reserved TID * * Returns: the alternative TID to use, or 0 on error */ static inline u8 ieee80211_fix_reserved_tid(u8 tid) { switch (tid) { case 0: return 3; case 1: return 2; case 2: return 1; case 3: return 0; case 4: return 5; case 5: return 4; case 6: return 7; case 7: return 6; } return 0; } static u16 ieee80211_downgrade_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; /* in case we are a client verify acm is not set for this ac */ while (sdata->wmm_acm & BIT(skb->priority)) { int ac = ieee802_1d_to_ac[skb->priority]; if (ifmgd->tx_tspec[ac].admitted_time && skb->priority == ifmgd->tx_tspec[ac].up) return ac; if (wme_downgrade_ac(skb)) { /* * This should not really happen. The AP has marked all * lower ACs to require admission control which is not * a reasonable configuration. Allow the frame to be * transmitted using AC_BK as a workaround. 
*/ break; } } /* Check to see if this is a reserved TID */ if (sta && sta->reserved_tid == skb->priority) skb->priority = ieee80211_fix_reserved_tid(skb->priority); /* look up which queue to use for frames with this 1d tag */ return ieee802_1d_to_ac[skb->priority]; } /* Indicate which queue to use for this fully formed 802.11 frame */ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_hdr *hdr) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 *p; /* Ensure hash is set prior to potential SW encryption */ skb_get_hash(skb); if ((info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER) || local->hw.queues < IEEE80211_NUM_ACS) return 0; if (!ieee80211_is_data(hdr->frame_control)) { skb->priority = 7; return ieee802_1d_to_ac[skb->priority]; } if (!ieee80211_is_data_qos(hdr->frame_control)) { skb->priority = 0; return ieee802_1d_to_ac[skb->priority]; } p = ieee80211_get_qos_ctl(hdr); skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK; return ieee80211_downgrade_queue(sdata, NULL, skb); } u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { const struct ethhdr *eth = (void *)skb->data; struct mac80211_qos_map *qos_map; bool qos; /* Ensure hash is set prior to potential SW encryption */ skb_get_hash(skb); /* all mesh/ocb stations are required to support WME */ if ((sdata->vif.type == NL80211_IFTYPE_MESH_POINT && !is_multicast_ether_addr(eth->h_dest)) || (sdata->vif.type == NL80211_IFTYPE_OCB && sta)) qos = true; else if (sta) qos = sta->sta.wme; else qos = false; if (!qos) { skb->priority = 0; /* required for correct WPA/11i MIC */ return IEEE80211_AC_BE; } if (skb->protocol == sdata->control_port_protocol) { skb->priority = 7; goto downgrade; } /* use the data classifier to determine what 802.1d tag the * data frame has */ qos_map = rcu_dereference(sdata->qos_map); skb->priority = cfg80211_classify8021d(skb, 
qos_map ? &qos_map->qos_map : NULL); downgrade: return ieee80211_downgrade_queue(sdata, sta, skb); } /** * ieee80211_set_qos_hdr - Fill in the QoS header if there is one. * * @sdata: local subif * @skb: packet to be updated */ void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; u8 flags; u8 *p; if (!ieee80211_is_data_qos(hdr->frame_control)) return; p = ieee80211_get_qos_ctl(hdr); /* don't overwrite the QoS field of injected frames */ if (info->flags & IEEE80211_TX_CTL_INJECTED) { /* do take into account Ack policy of injected frames */ if (*p & IEEE80211_QOS_CTL_ACK_POLICY_NOACK) info->flags |= IEEE80211_TX_CTL_NO_ACK; return; } /* set up the first byte */ /* * preserve everything but the TID and ACK policy * (which we both write here) */ flags = *p & ~(IEEE80211_QOS_CTL_TID_MASK | IEEE80211_QOS_CTL_ACK_POLICY_MASK); if (is_multicast_ether_addr(hdr->addr1) || sdata->noack_map & BIT(tid)) { flags |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK; info->flags |= IEEE80211_TX_CTL_NO_ACK; } *p = flags | tid; /* set up the second byte */ p++; if (ieee80211_vif_is_mesh(&sdata->vif)) { /* preserve RSPI and Mesh PS Level bit */ *p &= ((IEEE80211_QOS_CTL_RSPI | IEEE80211_QOS_CTL_MESH_PS_LEVEL) >> 8); /* Nulls don't have a mesh header (frame body) */ if (!ieee80211_is_qos_nullfunc(hdr->frame_control)) *p |= (IEEE80211_QOS_CTL_MESH_CONTROL_PRESENT >> 8); } else { *p = 0; } }
75 75 66 75 7 46 74 75 74 75 15 30 74 39 38 29 6 16 66 65 65 5 5 5 5 61 66 60 5 5 5 75 2 1 11 6 5 11 11 11 7 7 7 4 3 10 4 5 1 1 3 3 3 3 2 2 6 6 6 5 2 3 3 6 2 2 2 2 2 2 2 2 2 2 2 3 3 28 28 28 20 17 28 27 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 
476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 
976 /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> #include <net/ipv6.h> #include <net/inet6_hashtables.h> #include <net/addrconf.h> #include "rds.h" #include "loop.h" #define RDS_CONNECTION_HASH_BITS 12 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) #define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) /* converting this to RCU is a chore for another day.. 
*/ static DEFINE_SPINLOCK(rds_conn_lock); static unsigned long rds_conn_count; static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; static struct kmem_cache *rds_conn_slab; static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, const struct in6_addr *faddr) { static u32 rds6_hash_secret __read_mostly; static u32 rds_hash_secret __read_mostly; __be32 lhash, fhash; u32 hash; net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); lhash = laddr->s6_addr32[3]; #if IS_ENABLED(CONFIG_IPV6) fhash = (__force __be32)__ipv6_addr_jhash(faddr, rds6_hash_secret); #else fhash = faddr->s6_addr32[3]; #endif hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; } #define rds_conn_info_set(var, test, suffix) do { \ if (test) \ var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ } while (0) /* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct net *net, struct hlist_head *head, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, int dev_if) { struct rds_connection *conn, *ret = NULL; hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (ipv6_addr_equal(&conn->c_faddr, faddr) && ipv6_addr_equal(&conn->c_laddr, laddr) && conn->c_trans == trans && conn->c_tos == tos && net == rds_conn_net(conn) && conn->c_dev_if == dev_if) { ret = conn; break; } } rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret, laddr, faddr); return ret; } /* * This is called by transports as they're bringing down a connection. * It clears partial message state so that the transport can start sending * and receiving over this connection again in the future. It is up to * the transport to have serialized this call with its send and recv. 
*/ static void rds_conn_path_reset(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; rdsdebug("connection %pI6c to %pI6c reset\n", &conn->c_laddr, &conn->c_faddr); rds_stats_inc(s_conn_reset); rds_send_path_reset(cp); cp->cp_flags = 0; /* Do not clear next_rx_seq here, else we cannot distinguish * retransmitted packets from new packets, and will hand all * of them to the application. That is not consistent with the * reliability guarantees of RDS. */ } static void __rds_conn_path_init(struct rds_connection *conn, struct rds_conn_path *cp, bool is_outgoing) { spin_lock_init(&cp->cp_lock); cp->cp_next_tx_seq = 1; init_waitqueue_head(&cp->cp_waitq); INIT_LIST_HEAD(&cp->cp_send_queue); INIT_LIST_HEAD(&cp->cp_retrans); cp->cp_conn = conn; atomic_set(&cp->cp_state, RDS_CONN_DOWN); cp->cp_send_gen = 0; cp->cp_reconnect_jiffies = 0; cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker); INIT_WORK(&cp->cp_down_w, rds_shutdown_worker); mutex_init(&cp->cp_cm_lock); cp->cp_flags = 0; } /* * There is only every one 'conn' for a given pair of addresses in the * system at a time. They contain messages to be retransmitted and so * span the lifetime of the actual underlying transport connections. * * For now they are not garbage collected once they're created. They * are torn down as the module is removed, if ever. */ static struct rds_connection *__rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, gfp_t gfp, u8 tos, int is_outgoing, int dev_if) { struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); struct rds_transport *loop_trans; struct rds_conn_path *free_cp = NULL; unsigned long flags; int ret, i; int npaths = (trans->t_mp_capable ? 
RDS_MPATH_WORKERS : 1); rcu_read_lock(); conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && ipv6_addr_equal(laddr, faddr) && !is_outgoing) { /* This is a looped back IB connection, and we're * called by the code handling the incoming connect. * We need a second connection object into which we * can stick the other QP. */ parent = conn; conn = parent->c_passive; } rcu_read_unlock(); if (conn) goto out; conn = kmem_cache_zalloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); goto out; } conn->c_path = kzalloc_objs(struct rds_conn_path, npaths, gfp); if (!conn->c_path) { kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(-ENOMEM); goto out; } INIT_HLIST_NODE(&conn->c_hash_node); conn->c_laddr = *laddr; conn->c_isv6 = !ipv6_addr_v4mapped(laddr); conn->c_faddr = *faddr; conn->c_dev_if = dev_if; conn->c_tos = tos; #if IS_ENABLED(CONFIG_IPV6) /* If the local address is link local, set c_bound_if to be the * index used for this connection. Otherwise, set it to 0 as * the socket is not bound to an interface. c_bound_if is used * to look up a socket when a packet is received */ if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL) conn->c_bound_if = dev_if; else #endif conn->c_bound_if = 0; rds_conn_net_set(conn, net); ret = rds_cong_get_maps(conn); if (ret) { kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); goto out; } /* * This is where a connection becomes loopback. If *any* RDS sockets * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if); if (loop_trans) { rds_trans_put(loop_trans); conn->c_loopback = 1; if (trans->t_prefer_loopback) { if (likely(is_outgoing)) { /* "outgoing" connection to local address. * Protocol says it wants the connection * handled by the loopback transport. * This is what TCP does. 
*/ trans = &rds_loop_transport; } else { /* No transport currently in use * should end up here, but if it * does, reset/destroy the connection. */ kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(-EOPNOTSUPP); goto out; } } } conn->c_trans = trans; init_waitqueue_head(&conn->c_hs_waitq); for (i = 0; i < npaths; i++) { __rds_conn_path_init(conn, &conn->c_path[i], is_outgoing); conn->c_path[i].cp_index = i; conn->c_path[i].cp_wq = alloc_ordered_workqueue("krds_cp_wq#%lu/%d", 0, rds_conn_count, i); if (!conn->c_path[i].cp_wq) conn->c_path[i].cp_wq = rds_wq; } rcu_read_lock(); if (rds_destroy_pending(conn)) ret = -ENETDOWN; else ret = trans->conn_alloc(conn, GFP_ATOMIC); if (ret) { rcu_read_unlock(); free_cp = conn->c_path; kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); goto out; } rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n", conn, laddr, faddr, strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : ""); /* * Since we ran without holding the conn lock, someone could * have created the same conn (either normal or passive) in the * interim. We check while holding the lock. If we won, we complete * init and return our conn. If we lost, we rollback and return the * other one. */ spin_lock_irqsave(&rds_conn_lock, flags); if (parent) { /* Creating passive conn */ if (parent->c_passive) { trans->conn_free(conn->c_path[0].cp_transport_data); free_cp = conn->c_path; kmem_cache_free(rds_conn_slab, conn); conn = parent->c_passive; } else { parent->c_passive = conn; rds_cong_add_conn(conn); rds_conn_count++; } } else { /* Creating normal conn */ struct rds_connection *found; found = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); if (found) { struct rds_conn_path *cp; int i; for (i = 0; i < npaths; i++) { cp = &conn->c_path[i]; /* The ->conn_alloc invocation may have * allocated resource for all paths, so all * of them may have to be freed here. 
*/ if (cp->cp_transport_data) trans->conn_free(cp->cp_transport_data); } free_cp = conn->c_path; kmem_cache_free(rds_conn_slab, conn); conn = found; } else { conn->c_my_gen_num = rds_gen_num; conn->c_peer_gen_num = 0; hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; } } spin_unlock_irqrestore(&rds_conn_lock, flags); rcu_read_unlock(); out: if (free_cp) { for (i = 0; i < npaths; i++) if (free_cp[i].cp_wq != rds_wq) destroy_workqueue(free_cp[i].cp_wq); kfree(free_cp); } return conn; } struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if) { return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create); struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if) { return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); void rds_conn_shutdown(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; /* shut it down unless it's down already */ if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) { /* * Quiesce the connection mgmt handlers before we start tearing * things down. We don't hold the mutex for the entire * duration of the shutdown operation, else we may be * deadlocking with the CM handler. 
Instead, the CM event * handler is supposed to check for state DISCONNECTING */ mutex_lock(&cp->cp_cm_lock); if (!rds_conn_path_transition(cp, RDS_CONN_UP, RDS_CONN_DISCONNECTING) && !rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING) && !rds_conn_path_transition(cp, RDS_CONN_RESETTING, RDS_CONN_DISCONNECTING)) { rds_conn_path_error(cp, "shutdown called in state %d\n", atomic_read(&cp->cp_state)); mutex_unlock(&cp->cp_cm_lock); return; } mutex_unlock(&cp->cp_cm_lock); wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags)); wait_event(cp->cp_waitq, !test_bit(RDS_RECV_REFILL, &cp->cp_flags)); conn->c_trans->conn_path_shutdown(cp); rds_conn_path_reset(cp); if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN) && !rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_DOWN)) { /* This can happen - eg when we're in the middle of tearing * down the connection, and someone unloads the rds module. * Quite reproducible with loopback connections. * Mostly harmless. * * Note that this also happens with rds-tcp because * we could have triggered rds_conn_path_drop in irq * mode from rds_tcp_state change on the receipt of * a FIN, thus we need to recheck for RDS_CONN_ERROR * here. */ rds_conn_path_error(cp, "%s: failed to transition " "to state DOWN, current state " "is %d\n", __func__, atomic_read(&cp->cp_state)); return; } } /* Then reconnect if it's still live. * The passive side of an IB loopback connection is never added * to the conn hash, so we never trigger a reconnect on this * conn - the reconnect is always triggered by the active peer. 
*/ cancel_delayed_work_sync(&cp->cp_conn_w); clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); if (conn->c_trans->t_mp_capable && cp->cp_index == 0) rds_send_ping(conn, 0); rds_queue_reconnect(cp); } else { rcu_read_unlock(); } /* we do not hold the socket lock here but it is safe because * fan-out is disabled when calling conn_slots_available() */ if (conn->c_trans->conn_slots_available) conn->c_trans->conn_slots_available(conn, false); } /* destroy a single rds_conn_path. rds_conn_destroy() iterates over * all paths using rds_conn_path_destroy() */ static void rds_conn_path_destroy(struct rds_conn_path *cp) { struct rds_message *rm, *rtmp; if (!cp->cp_transport_data) return; /* make sure lingering queued work won't try to ref the conn */ cancel_delayed_work_sync(&cp->cp_send_w); cancel_delayed_work_sync(&cp->cp_recv_w); rds_conn_path_drop(cp, true); flush_work(&cp->cp_down_w); /* tear down queued messages */ list_for_each_entry_safe(rm, rtmp, &cp->cp_send_queue, m_conn_item) { list_del_init(&rm->m_conn_item); BUG_ON(!list_empty(&rm->m_sock_item)); rds_message_put(rm); } if (cp->cp_xmit_rm) rds_message_put(cp->cp_xmit_rm); WARN_ON(delayed_work_pending(&cp->cp_send_w)); WARN_ON(delayed_work_pending(&cp->cp_recv_w)); WARN_ON(delayed_work_pending(&cp->cp_conn_w)); WARN_ON(work_pending(&cp->cp_down_w)); if (cp->cp_wq != rds_wq) { destroy_workqueue(cp->cp_wq); cp->cp_wq = NULL; } cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); } /* * Stop and free a connection. * * This can only be used in very limited circumstances. It assumes that once * the conn has been shutdown that no one else is referencing the connection. * We can only ensure this in the rmmod path in the current code. */ void rds_conn_destroy(struct rds_connection *conn) { unsigned long flags; int i; struct rds_conn_path *cp; int npaths = (conn->c_trans->t_mp_capable ? 
RDS_MPATH_WORKERS : 1); rdsdebug("freeing conn %p for %pI4 -> " "%pI4\n", conn, &conn->c_laddr, &conn->c_faddr); /* Ensure conn will not be scheduled for reconnect */ spin_lock_irq(&rds_conn_lock); hlist_del_init_rcu(&conn->c_hash_node); spin_unlock_irq(&rds_conn_lock); synchronize_rcu(); /* shut the connection down */ for (i = 0; i < npaths; i++) { cp = &conn->c_path[i]; rds_conn_path_destroy(cp); BUG_ON(!list_empty(&cp->cp_retrans)); } /* * The congestion maps aren't freed up here. They're * freed by rds_cong_exit() after all the connections * have been freed. */ rds_cong_remove_conn(conn); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); spin_lock_irqsave(&rds_conn_lock, flags); rds_conn_count--; spin_unlock_irqrestore(&rds_conn_lock, flags); } EXPORT_SYMBOL_GPL(rds_conn_destroy); static void __rds_inc_msg_cp(struct rds_incoming *inc, struct rds_info_iterator *iter, void *saddr, void *daddr, int flip, bool isv6) { #if IS_ENABLED(CONFIG_IPV6) if (isv6) rds6_inc_info_copy(inc, iter, saddr, daddr, flip); else #endif rds_inc_info_copy(inc, iter, *(__be32 *)saddr, *(__be32 *)daddr, flip); } static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send, bool isv6) { struct hlist_head *head; struct list_head *list; struct rds_connection *conn; struct rds_message *rm; unsigned int total = 0; unsigned long flags; size_t i; int j; if (isv6) len /= sizeof(struct rds6_info_message); else len /= sizeof(struct rds_info_message); rcu_read_lock(); for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; int npaths; if (!isv6 && conn->c_isv6) continue; npaths = (conn->c_trans->t_mp_capable ? 
RDS_MPATH_WORKERS : 1); for (j = 0; j < npaths; j++) { cp = &conn->c_path[j]; if (want_send) list = &cp->cp_send_queue; else list = &cp->cp_retrans; spin_lock_irqsave(&cp->cp_lock, flags); /* XXX too lazy to maintain counts.. */ list_for_each_entry(rm, list, m_conn_item) { total++; if (total <= len) __rds_inc_msg_cp(&rm->m_inc, iter, &conn->c_laddr, &conn->c_faddr, 0, isv6); } spin_unlock_irqrestore(&cp->cp_lock, flags); } } } rcu_read_unlock(); lens->nr = total; if (isv6) lens->each = sizeof(struct rds6_info_message); else lens->each = sizeof(struct rds_info_message); } static void rds_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send) { rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send) { rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true); } #endif static void rds_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds_conn_message_info(sock, len, iter, lens, 1); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds6_conn_message_info(sock, len, iter, lens, 1); } #endif static void rds_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds_conn_message_info(sock, len, iter, lens, 0); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds6_conn_message_info(sock, len, iter, lens, 0); } #endif void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct 
rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), u64 *buffer, size_t item_len) { struct hlist_head *head; struct rds_connection *conn; size_t i; rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { /* XXX no c_lock usage.. */ if (!visitor(conn, buffer)) continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller * can resize the buffer. */ if (len >= item_len) { rds_info_copy(iter, buffer, item_len); len -= item_len; } lens->nr++; } } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_conn_path *, void *), u64 *buffer, size_t item_len) { struct hlist_head *head; struct rds_connection *conn; size_t i; rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; /* XXX We only copy the information from the first * path for now. The problem is that if there are * more than one underlying paths, we cannot report * information of all of them using the existing * API. For example, there is only one next_tx_seq, * which path's next_tx_seq should we report? It is * a bug in the design of MPRDS. */ cp = conn->c_path; /* XXX no cp_lock usage.. */ if (!visitor(cp, buffer)) continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller * can resize the buffer. 
*/ if (len >= item_len) { rds_info_copy(iter, buffer, item_len); len -= item_len; } lens->nr++; } } rcu_read_unlock(); } static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds_info_connection *cinfo = buffer; struct rds_connection *conn = cp->cp_conn; if (conn->c_isv6) return 0; cinfo->next_tx_seq = cp->cp_next_tx_seq; cinfo->next_rx_seq = cp->cp_next_rx_seq; cinfo->laddr = conn->c_laddr.s6_addr32[3]; cinfo->faddr = conn->c_faddr.s6_addr32[3]; cinfo->tos = conn->c_tos; strscpy_pad(cinfo->transport, conn->c_trans->t_name); cinfo->flags = 0; rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo->flags, atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, CONNECTING); rds_conn_info_set(cinfo->flags, atomic_read(&cp->cp_state) == RDS_CONN_UP, CONNECTED); return 1; } #if IS_ENABLED(CONFIG_IPV6) static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds6_info_connection *cinfo6 = buffer; struct rds_connection *conn = cp->cp_conn; cinfo6->next_tx_seq = cp->cp_next_tx_seq; cinfo6->next_rx_seq = cp->cp_next_rx_seq; cinfo6->laddr = conn->c_laddr; cinfo6->faddr = conn->c_faddr; strscpy_pad(cinfo6->transport, conn->c_trans->t_name); cinfo6->flags = 0; rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo6->flags, atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, CONNECTING); rds_conn_info_set(cinfo6->flags, atomic_read(&cp->cp_state) == RDS_CONN_UP, CONNECTED); /* Just return 1 as there is no error case. This is a helper function * for rds_walk_conn_path_info() and it wants a return value. 
*/ return 1; } #endif static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8]; rds_walk_conn_path_info(sock, len, iter, lens, rds_conn_info_visitor, buffer, sizeof(struct rds_info_connection)); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8]; rds_walk_conn_path_info(sock, len, iter, lens, rds6_conn_info_visitor, buffer, sizeof(struct rds6_info_connection)); } #endif int rds_conn_init(void) { int ret; ret = rds_loop_net_init(); /* register pernet callback */ if (ret) return ret; rds_conn_slab = KMEM_CACHE(rds_connection, 0); if (!rds_conn_slab) { rds_loop_net_exit(); return -ENOMEM; } rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_register_func(RDS_INFO_SEND_MESSAGES, rds_conn_message_info_send); rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); #if IS_ENABLED(CONFIG_IPV6) rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); rds_info_register_func(RDS6_INFO_SEND_MESSAGES, rds6_conn_message_info_send); rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES, rds6_conn_message_info_retrans); #endif return 0; } void rds_conn_exit(void) { rds_loop_net_exit(); /* unregister pernet callback */ rds_loop_exit(); WARN_ON(!hlist_empty(rds_conn_hash)); kmem_cache_destroy(rds_conn_slab); rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_deregister_func(RDS_INFO_SEND_MESSAGES, rds_conn_message_info_send); rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); #if IS_ENABLED(CONFIG_IPV6) rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES, rds6_conn_message_info_send); 
rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES, rds6_conn_message_info_retrans); #endif } /* * Force a disconnect */ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) { atomic_set(&cp->cp_state, RDS_CONN_ERROR); rcu_read_lock(); if (!destroy && rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } queue_work(cp->cp_wq, &cp->cp_down_w); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_drop); void rds_conn_drop(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); rds_conn_path_drop(&conn->c_path[0], false); } EXPORT_SYMBOL_GPL(rds_conn_drop); /* * If the connection is down, trigger a connect. We may have scheduled a * delayed reconnect however - in this case we should not interfere. */ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) { rcu_read_lock(); if (rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } if (rds_conn_path_state(cp) == RDS_CONN_DOWN && !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, 0); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); /* Check connectivity of all paths */ void rds_check_all_paths(struct rds_connection *conn) { int i = 0; do { rds_conn_path_connect_if_down(&conn->c_path[i]); } while (++i < conn->c_npaths); } void rds_conn_connect_if_down(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); rds_conn_path_connect_if_down(&conn->c_path[0]); } EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); void __rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vprintk(fmt, ap); va_end(ap); rds_conn_path_drop(cp, false); }
26 142 1 812 802 421 390 53 53 391 53 31 76 106 106 106 103 13 77 78 24 24 9 9 9 609 609 1 1 1 1 1 7 7 7 240 34 34 33 10 10 1171 1171 8 14 4 18 8 10 24 31 1 1 2 1 55 43 42 52 3 55 9 2 43 43 1088 70 68 2 54 4 22 4 56 8 68 69 24 58 69 1182 1104 611 553 657 70 1176 1180 1184 36 1110 175 1127 1181 1188 1177 1180 19 1186 7 1182 1182 4 1339 711 1176 1182 1178 1179 6 22 2 20 4 3 13 15 3 4 4 5 2 7 474 145 147 61 61 21 42 61 151 301 197 601 1 637 635 641 635 608 43 637 805 798 157 156 20 157 22 59 59 59 98 98 98 98 7 82 12 46 46 1 1 98 98 42 55 48 82 78 32 32 31 7 26 1 27 553 551 551 551 550 551 6 1093 4 4 556 551 551 3 3 3 118 118 5 5 5 1160 1163 1165 116 86 6 6 6 6 435 1185 6 6 6 1 6 5 5 5 5 396 395 396 4 4 4 3 4 1 390 1 1 388 1164 1164 34 29 24 3 3 3 3 2 2 59 59 2 2 2 2 1 1 1 1 1 30 29 186 189 30 32 32 32 30 32 48 189 48 152 1337 1348 1133 145 717 1318 1317 1316 689 1121 1106 1087 26 1093 556 1294 9 1304 660 660 346 99 250 2 157 89 90 243 31 353 345 8 648 656 376 644 651 656 777 450 784 271 677 31 594 779 783 785 782 783 26 26 240 228 240 11 240 243 243 243 243 241 244 68 68 68 3 66 20 48 48 68 6 6 6 5 6 6 6 6 6 11 11 11 5 6 6 6 6 6 5 1 6 12 12 6 11 1 1 1 1 1 29 29 29 7 4 5 5 12 12 12 12 11 12 5 6 12 12 8 12 1 11 11 99 100 100 100 100 532 535 118 120 119 18 14 2 11 11 2 13 9 1 3 13 2 5 63 1 29 8 26 11 15 25 26 2 2 40 3 35 558 669 4 5 4 643 3 13 14 66 32 34 2 2 34 34 601 63 562 51 562 12 563 593 630 639 663 110 6 6 6 6 6 6 1 5 5 628 1 2 2 8 3 612 1 647 584 53 17 602 3 617 30 613 40 557 89 645 5 4 5 4 4 104 576 577 2 2 106 5 5 603 13 2 98 528 379 1 378 380 374 3 7 3 5 7 7 5 5 4 26 3 23 6 22 15 2 5 5 15 14 11 12 10 3 7 19 12 12 12 1 2 1 8 3 1 1 1 2 5 5 5 5 1 5 5 5 5 5 39 39 39 39 36 23 29 39 18 18 1 18 49 21 19 9 112 112 112 112 26 67 18 94 112 77 26 26 13 472 473 473 8 8 8 8 1 1 1 1 1 7 7 7 7 7 7 7 7 7 7 594 593 7 7 7 593 593 579 591 479 457 18 18 955 956 597 415 18 224 66 224 955 812 811 812 159 159 59 58 59 159 159 28 159 159 35 1 1 20 11 8 1 32 2 32 142 2 3 2 2 91 42 
129 4 119 13 116 4 4 8 9 97 28 2 111 2 10 115 2 3 116 3 91 28 112 7 109 10 112 6 82 35 111 2 87 25 81 32 101 4 6 13 4 13 6 6 5 1 6 6 6 6 6 6 5 1 6 4 19 13 4 16 12 4 4 7 1 1 13 13 2 6 6 6 6 5 19 13 13 6 5 5 5 4 1 35 17 15 3 3 2 3 6 11 107 14 5 88 23 70 6 8 827 6 818 819 2 2 2 2 819 11 11 861 7 861 2 861 11 117 794 11 851 3 849 6 854 858 3 9 9 4 4 4 11 11 8 844 834 38 850 814 321 850 6 2 857 20 1 5 6 15 15 15 30 29 30 21 15 14 7 11 15 1 25 11 15 4 15 15 15 1 25 33 32 7 10 2 5 16 11 2 13 1 13 13 12 5 59 59 30 25 1 3 18 2 9 15 12 25 2 23 4 4 23 23 4 23 4 23 4 26 10 2 9 9 15 24 12 13 2 11 3 10 826 826 825 826 1 1 1 23 23 23 23 2400 2290 258 125 125 125 125 125 125 125 125 125 125 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 
357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 
857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 
1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 
1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 
2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 
2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 
2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 
3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 
3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 
4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 
4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 
4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 
5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 
5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 
6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 
6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 
6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * FIB front-end. * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ /* Changes: * * YOSHIFUJI Hideaki @USAGI * reworked default router selection. * - respect outgoing interface * - select from (probably) reachable routers (i.e. * routers in REACHABLE, STALE, DELAY or PROBE states). * - always select the same router if it is (probably) * reachable. otherwise, round-robin the list. * Ville Nuorvala * Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/capability.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/types.h> #include <linux/times.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/init.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/jhash.h> #include <linux/siphash.h> #include <net/net_namespace.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/tcp.h> #include <linux/rtnetlink.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/xfrm.h> #include <net/netevent.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/ip_tunnels.h> #include <net/l3mdev.h> #include 
<net/ip.h>
#include <linux/uaccess.h>
#include <linux/btf_ids.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

/* Forward declaration: maps a route type (RTN_*) to the dst error code. */
static int ip6_rt_type_to_error(u8 fib6_type);

/* Emit the fib6 tracepoint definitions from this file. */
#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

/*
 * Outcome of the neighbour check performed while scoring a nexthop
 * (produced by rt6_check_neigh(), propagated by rt6_score_route() and
 * interpreted by find_match()).  All failures are negative so that callers
 * can distinguish them from a score, which is never negative.
 */
enum rt6_nud_state {
	/* find_match() rejects the nexthop */
	RT6_NUD_FAIL_HARD = -3,
	/* neighbour is NUD_FAILED (only with CONFIG_IPV6_ROUTER_PREF) */
	RT6_NUD_FAIL_PROBE = -2,
	/* no neighbour entry: find_match() scores it 0 and asks for round-robin */
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

/*
 * Forward declarations of the dst_ops callbacks and of helpers that are
 * used before their definition in this file.
 */
INDIRECT_CALLABLE_SCOPE
struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int ip6_mtu(const struct dst_entry *dst);
static void ip6_negative_advice(struct sock *sk,
				struct dst_entry *dst);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev);
static void ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu,
			       bool confirm_neigh);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info
*rt6_add_route_info(struct net *net,
		    const struct in6_addr *prefix, int prefixlen,
		    const struct in6_addr *gwaddr,
		    struct net_device *dev,
		    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

/*
 * One list of rt6_info entries, chained through dst.rt_uncached and
 * protected by @lock.  An instance exists per CPU (rt6_uncached_list).
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

/*
 * Append @rt to the uncached list of the CPU we are running on and record
 * that list in rt->dst.rt_uncached_list so rt6_uncached_list_del() can
 * later find the right lock.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->dst.rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->dst.rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

/*
 * Unlink @rt from the uncached list it was recorded on.  Does nothing when
 * no list was recorded in rt->dst.rt_uncached_list.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	struct uncached_list *ul = rt->dst.rt_uncached_list;

	if (ul) {
		spin_lock_bh(&ul->lock);
		list_del_init(&rt->dst.rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

/*
 * Detach every uncached route from @dev.  For each possible CPU's list:
 * an rt6i_idev belonging to @dev is swapped for blackhole_netdev's idev,
 * and a dst.dev equal to @dev is swapped for blackhole_netdev (the tracked
 * device reference is moved with netdev_ref_replace()).  Entries that were
 * changed are removed from the list.
 */
static void rt6_uncached_list_flush_dev(struct net_device *dev)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt, *safe;

		/* unlocked peek: skip the lock when the list looks empty */
		if (list_empty(&ul->head))
			continue;

		spin_lock_bh(&ul->lock);
		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;
			bool handled = false;

			if (rt_idev && rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(blackhole_netdev);
				in6_dev_put(rt_idev);
				handled = true;
			}

			if (rt_dev == dev) {
				rt->dst.dev = blackhole_netdev;
				netdev_ref_replace(rt_dev, blackhole_netdev,
						   &rt->dst.dev_tracker,
						   GFP_ATOMIC);
				handled = true;
			}
			if (handled)
				list_del_init(&rt->dst.rt_uncached);
		}
		spin_unlock_bh(&ul->lock);
	}
}

/*
 * Choose the address used as the neighbour lookup key:
 * @p when it is not the unspecified address, otherwise the destination
 * address of @skb's IPv6 header when an skb is supplied, otherwise @daddr.
 */
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

/*
 * Find the neighbour entry on @dev for the address selected by
 * choose_neigh_daddr(@gw, @skb, @daddr), creating it in nd_tbl when it does
 * not exist yet.  Returns NULL when creation fails.
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct
net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	/* no existing entry: create one; an ERR_PTR result is reported as NULL */
	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

/*
 * dst_ops->neigh_lookup callback: neighbour lookup keyed on the route's
 * nexthop (rt6_nexthop() with the unspecified address as fallback), on the
 * dst's device.
 */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = dst_rt6_info(dst);

	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
				dst_dev(dst), skb, daddr);
}

/*
 * dst_ops->confirm_neigh callback: confirm the neighbour for the route's
 * nexthop (or @daddr when the nexthop is unspecified).  Nothing is done
 * when no address can be chosen, when the device has IFF_NOARP or
 * IFF_LOOPBACK set, or when the address is multicast.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rt6_info *rt = dst_rt6_info(dst);
	struct net_device *dev = dst_dev(dst);

	daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

/* dst operations used for regular IPv6 routes (see ip6_dst_alloc()). */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};

/*
 * dst operations for blackhole dsts: PMTU updates, redirects and the MTU
 * query are routed to the generic dst_blackhole_* handlers.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.default_advmss		= ip6_default_advmss,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.check			= ip6_dst_check,
	.destroy		= ip6_dst_destroy,
	.cow_metrics		= dst_cow_metrics_generic,
	.update_pmtu		= dst_blackhole_update_pmtu,
	.redirect		= dst_blackhole_redirect,
	.mtu			= dst_blackhole_mtu,
};

/* Metrics template with every entry zero (hop limit spelled out). */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

/*
 * Template for the null fib6 entry: a kernel-owned reject route without a
 * nexthop.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	/* largest possible metric value */
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

/*
 * Template for the null route dst: packets are discarded via
 * ip6_pkt_discard{,_out}() and dst.error is -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__rcuref	= RCUREF_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Template for the "prohibit" dst: handled by ip6_pkt_prohibit{,_out}(), error -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__rcuref	= RCUREF_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/* Template for the blackhole dst: packets go to dst_discard{,_out}(), error -EINVAL. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__rcuref	= RCUREF_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

/* Zero every rt6_info field that follows the embedded dst member. */
static void rt6_info_init(struct rt6_info *rt)
{
	memset_after(rt, 0, dst);
}

/*
 * Allocate a dst from the namespace's ip6_dst_ops, bound to @dev, with
 * obsolete set to DST_OBSOLETE_FORCE_CHK.  On success the rt6_info specific
 * fields are cleared and the fib_rt_alloc statistic is incremented.
 * Returns NULL when dst_alloc() fails.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

/*
 * dst_ops->destroy callback: drop the metrics, unlink from the uncached
 * list, and release the inet6_dev and the originating fib6_info ("from").
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = dst_rt6_info(dst);
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* atomically take ownership of rt->from before releasing it */
	from = unrcu_pointer(xchg(&rt->from, NULL));
	fib6_info_release(from);
}

/*
 * dst_ops->ifdown callback: re-point the route's inet6_dev at the blackhole
 * device's idev (when one can be obtained) and drop the link to the
 * originating fib6_info.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
	struct rt6_info *rt = dst_rt6_info(dst);
	struct inet6_dev *idev = rt->rt6i_idev;
	struct fib6_info *from;

	if (idev && idev->dev != blackhole_netdev) {
		struct inet6_dev
*blackhole_idev = in6_dev_get(blackhole_netdev);

		/* keep the old idev if the blackhole idev is unavailable */
		if (blackhole_idev) {
			rt->rt6i_idev = blackhole_idev;
			in6_dev_put(idev);
		}
	}
	from = unrcu_pointer(xchg(&rt->from, NULL));
	fib6_info_release(from);
}

/* True when @rt carries RTF_EXPIRES and its dst.expires time has passed. */
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, READ_ONCE(rt->dst.expires));
	return false;
}

/*
 * Like __rt6_check_expired(), but a route without RTF_EXPIRES that still
 * has an originating fib6_info is also considered expired when its dst is
 * no longer marked DST_OBSOLETE_FORCE_CHK or when that fib6_info expired.
 * Uses rcu_dereference() on rt->from.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, READ_ONCE(rt->dst.expires)))
			return true;
	} else if (from) {
		return READ_ONCE(rt->dst.obsolete) != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

/*
 * Walk the leaf chain of @rt's fib6 node and return the first entry that has
 * the same metric as @rt and qualifies for ECMP.  Returns NULL when @rt has
 * no node, the node has no leaf, or no entry qualifies.
 */
static struct fib6_info *
rt6_multipath_first_sibling_rcu(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference(rt->fib6_node);
	if (!fn)
		goto out;
	iter = rcu_dereference(fn->leaf);
	if (!iter)
		goto out;

	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference(iter->fib6_next);
	}

out:
	return NULL;
}

/*
 * Pick the multipath leg for the lookup result in @res.
 *
 * Nothing is selected (res keeps its current f6i) when the route has no
 * nexthop object and either has no siblings or the output interface already
 * matched.  For nexthop objects the choice is delegated to
 * nexthop_path_fib6_result().  Otherwise the flow hash is compared against
 * each sibling's fib_nh_upper_bound, starting with the first ECMP sibling;
 * the first sibling whose bound covers the hash is used if
 * rt6_score_route() accepts it, else the original match is kept.
 *
 * On the paths that reach "out", res->f6i and res->nh are (re)written.
 */
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *first, *match = res->f6i;
	struct fib6_info *sibling;
	int hash;

	if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
		goto out;

	if (match->nh && have_oif_match && res->nh)
		return;

	if (skb)
		IP6CB(skb)->flags |= IP6SKB_MULTIPATH;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash &&
	    (!match->nh || nexthop_is_multipath(match->nh)))
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (unlikely(match->nh)) {
		nexthop_path_fib6_result(res, fl6->mp_hash);
		return;
	}

	first = rt6_multipath_first_sibling_rcu(match);
	if (!first)
		goto out;

	hash = fl6->mp_hash;
	if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) {
		if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif,
				    strict) >= 0)
			match = first;
		goto out;
	}

	list_for_each_entry_rcu(sibling, &first->fib6_siblings,
				fib6_siblings) {
		const struct fib6_nh *nh = sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (hash > nh_upper_bound)
			continue;
		/* first sibling covering the hash decides, usable or not */
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = match->fib6_nh;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

/*
 * Does nexthop @nh match the lookup?  Dead nexthops never match.  With an
 * output interface the nexthop device's ifindex must equal @oif; without
 * one, @saddr must pass ipv6_chk_addr() for the nexthop device.
 */
static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}

/*
 * Arguments for __rt6_nh_dev_match(); @nh is an output and holds the last
 * fib6_nh the callback was invoked on.
 */
struct fib6_nh_dm_arg {
	struct net		*net;
	const struct in6_addr	*saddr;
	int			oif;
	int			flags;
	struct fib6_nh		*nh;
};

/*
 * nexthop_for_each_fib6_nh() callback: record @nh in the argument block and
 * return the result of __rt6_device_match() for it.
 */
static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_dm_arg *arg = _arg;

	arg->nh = nh;
	return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
				  arg->flags);
}

/*
 * Returns the fib6_nh from nexthop @nh that matched the lookup, or NULL.
 * Blackhole nexthops never match.  @res is not used by this function.
 */
static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
					struct fib6_result *res,
					const struct in6_addr *saddr,
					int oif, int flags)
{
	struct fib6_nh_dm_arg arg = {
		.net   = net,
		.saddr = saddr,
		.oif   = oif,
		.flags = flags,
	};

	if (nexthop_is_blackhole(nh))
		return NULL;

	if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
		return arg.nh;

	return NULL;
}

/*
 * Narrow the lookup result in @res to an entry whose nexthop matches the
 * requested output interface / source address.
 *
 * - With neither @oif nor a source address, the first entry is used as long
 *   as its nexthop is not dead.
 * - Otherwise the chain starting at res->f6i is scanned for a matching
 *   nexthop.
 * - When nothing matches: a strict interface lookup (oif with
 *   RT6_LOOKUP_F_IFACE) yields the null entry; otherwise the first entry is
 *   used, replaced by the null entry if its nexthop is dead.
 *
 * A blackhole nexthop object turns the result into an RTN_BLACKHOLE reject.
 * On return res->nh, res->fib6_type and res->fib6_flags are set.
 */
static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	if (!oif && ipv6_addr_any(saddr)) {
		if (unlikely(f6i->nh)) {
			nh = nexthop_fib6_nh(f6i->nh);
			if (nexthop_is_blackhole(f6i->nh))
				goto out_blackhole;
		} else {
			nh = f6i->fib6_nh;
		}
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		bool matched = false;

		if (unlikely(spf6i->nh)) {
			nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
					      oif, flags);
			if (nh)
				matched = true;
		} else {
			nh = spf6i->fib6_nh;
			if (__rt6_device_match(net, nh, saddr, oif, flags))
				matched = true;
		}
		if (matched) {
			res->f6i = spf6i;
			goto out;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = res->f6i->fib6_nh;
		goto out;
	}

	if (unlikely(f6i->nh)) {
		nh = nexthop_fib6_nh(f6i->nh);
		if (nexthop_is_blackhole(f6i->nh))
			goto out_blackhole;
	} else {
		nh = f6i->fib6_nh;
	}

	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
	return;

out_blackhole:
	res->fib6_flags |= RTF_REJECT;
	res->fib6_type = RTN_BLACKHOLE;
	res->nh = nh;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred router probe: the work item, the address to solicit and the
 * device (held through @dev_tracker until the work has run).
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
	netdevice_tracker dev_tracker;
};

/*
 * Work handler: send a Neighbour Solicitation for work->target to its
 * solicited-node multicast address, then drop the device reference and free
 * the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	netdev_put(work->dev, &work->dev_tracker);
	kfree(work);
}

/*
 * Schedule a reachability probe of @fib6_nh's gateway if the gateway's
 * neighbour entry is missing or not NUD_VALID and the device's
 * rtr_probe_interval has elapsed.  Nexthops without a gateway are ignored.
 */
static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	unsigned long last_probe;
	struct neighbour
*neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock();
	/* snapshot used below to detect a concurrent prober via cmpxchg() */
	last_probe = READ_ONCE(fib6_nh->last_probe);
	idev = __in6_dev_get(dev);
	if (!idev)
		goto out;
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (READ_ONCE(neigh->nud_state) & NUD_VALID)
			goto out;

		write_lock_bh(&neigh->lock);
		/* re-check the state now that the neighbour lock is held */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       READ_ONCE(idev->cnf.rtr_probe_interval))) {
			work = kmalloc_obj(*work, GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock_bh(&neigh->lock);
	} else if (time_after(jiffies, last_probe +
				       READ_ONCE(idev->cnf.rtr_probe_interval))) {
		work = kmalloc_obj(*work, GFP_ATOMIC);
	}

	/*
	 * Only the caller that manages to advance last_probe schedules the
	 * probe; everybody else frees its (possibly NULL) work item.
	 */
	if (!work || cmpxchg(&fib6_nh->last_probe,
			     last_probe, jiffies) != last_probe) {
		kfree(work);
	} else {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock();
}
#else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6; RFC 2461 has since been
 * obsoleted by RFC 4861)
 */

/*
 * Classify the neighbour entry of @fib6_nh's gateway:
 *  - entry exists and is NUD_VALID: RT6_NUD_SUCCEED;
 *  - entry exists, not valid: with CONFIG_IPV6_ROUTER_PREF the result is
 *    RT6_NUD_FAIL_PROBE if it is NUD_FAILED and RT6_NUD_SUCCEED otherwise;
 *    without that option it stays RT6_NUD_FAIL_HARD;
 *  - no entry: RT6_NUD_SUCCEED with CONFIG_IPV6_ROUTER_PREF, otherwise
 *    RT6_NUD_FAIL_DO_RR.
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		u8 nud_state = READ_ONCE(neigh->nud_state);

		if (nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock();

	return ret;
}

/*
 * Score nexthop @nh for a lookup.
 *
 * The base score is 2 when no output interface was requested or the nexthop
 * device matches @oif, else 0; a 0 base score is a hard failure when
 * RT6_LOOKUP_F_IFACE is set in @strict.  With CONFIG_IPV6_ROUTER_PREF the
 * decoded router preference from @fib6_flags is OR-ed in, shifted left by 2.
 * When RT6_LOOKUP_F_REACHABLE is requested for a gateway route, a negative
 * rt6_check_neigh() result is returned instead of the score.
 *
 * Returns the (non-negative) score or a negative enum rt6_nud_state value.
 */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}

/*
 * Decide whether @nh beats the best candidate found so far.
 *
 * Dead nexthops are skipped, as are link-down nexthops when the device is
 * configured to ignore them and RT6_LOOKUP_F_IGNORE_LINKSTATE is not set.
 * If the score is higher than *@mpri, *@mpri is raised to it, *@do_rr is
 * set to whether this candidate asked for round-robin, and true is
 * returned.  With RT6_LOOKUP_F_REACHABLE the nexthop is also handed to
 * rt6_probe().
 */
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}

/*
 * Arguments for rt6_nh_find_match(); @nh is an output and holds the last
 * fib6_nh the callback was invoked on.
 */
struct fib6_nh_frl_arg {
	u32		flags;
	int		oif;
	int		strict;
	int		*mpri;
	bool		*do_rr;
	struct fib6_nh	*nh;
};

/*
 * nexthop_for_each_fib6_nh() callback: record @nh in the argument block and
 * return the result of find_match() for it.
 */
static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_frl_arg *arg = _arg;

	arg->nh = nh;
	return find_match(nh, arg->flags, arg->oif, arg->strict,
			  arg->mpri, arg->do_rr);
}

/*
 * Scan the route chain from @f6i_start up to (not including) @nomatch and
 * store the best scoring entry in @res.
 *
 * When @cont is non-NULL the scan stops at the first entry whose metric
 * differs from @metric and that entry is stored in *@cont.  Expired entries
 * are skipped.  A blackhole nexthop object ends the scan immediately with
 * an RTN_BLACKHOLE reject result.
 */
static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		bool matched = false;
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
continue; if (unlikely(f6i->nh)) { struct fib6_nh_frl_arg arg = { .flags = f6i->fib6_flags, .oif = oif, .strict = strict, .mpri = mpri, .do_rr = do_rr }; if (nexthop_is_blackhole(f6i->nh)) { res->fib6_flags = RTF_REJECT; res->fib6_type = RTN_BLACKHOLE; res->f6i = f6i; res->nh = nexthop_fib6_nh(f6i->nh); return; } if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, &arg)) { matched = true; nh = arg.nh; } } else { nh = f6i->fib6_nh; if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) matched = true; } if (matched) { res->f6i = f6i; res->nh = nh; res->fib6_flags = f6i->fib6_flags; res->fib6_type = f6i->fib6_type; } } } static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf, struct fib6_info *rr_head, int oif, int strict, bool *do_rr, struct fib6_result *res) { u32 metric = rr_head->fib6_metric; struct fib6_info *cont = NULL; int mpri = -1; __find_rr_leaf(rr_head, NULL, metric, res, &cont, oif, strict, do_rr, &mpri); __find_rr_leaf(leaf, rr_head, metric, res, &cont, oif, strict, do_rr, &mpri); if (res->f6i || !cont) return; __find_rr_leaf(cont, NULL, metric, res, NULL, oif, strict, do_rr, &mpri); } static void rt6_select(struct net *net, struct fib6_node *fn, int oif, struct fib6_result *res, int strict) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct fib6_info *rt0; bool do_rr = false; int key_plen; /* make sure this function or its helpers sets f6i */ res->f6i = NULL; if (!leaf || leaf == net->ipv6.fib6_null_entry) goto out; rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) rt0 = leaf; /* Double check to make sure fn is not an intermediate node * and fn->leaf does not points to its child's leaf * (This might happen if all routes under fn are deleted from * the tree and fib6_repair_tree() is called on the node.) 
*/ key_plen = rt0->fib6_dst.plen; #ifdef CONFIG_IPV6_SUBTREES if (rt0->fib6_src.plen) key_plen = rt0->fib6_src.plen; #endif if (fn->fn_bit != key_plen) goto out; find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res); if (do_rr) { struct fib6_info *next = rcu_dereference(rt0->fib6_next); /* no entries matched; do round-robin */ if (!next || next->fib6_metric != rt0->fib6_metric) next = leaf; if (next != rt0) { spin_lock_bh(&leaf->fib6_table->tb6_lock); /* make sure next is not being deleted from the tree */ if (next->fib6_node) rcu_assign_pointer(fn->rr_ptr, next); spin_unlock_bh(&leaf->fib6_table->tb6_lock); } } out: if (!res->f6i) { res->f6i = net->ipv6.fib6_null_entry; res->nh = res->f6i->fib6_nh; res->fib6_flags = res->f6i->fib6_flags; res->fib6_type = res->f6i->fib6_type; } } static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res) { return (res->f6i->fib6_flags & RTF_NONEXTHOP) || res->nh->fib_nh_gw_family; } #ifdef CONFIG_IPV6_ROUTE_INFO int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr) { struct net *net = dev_net(dev); struct route_info *rinfo = (struct route_info *) opt; struct in6_addr prefix_buf, *prefix; struct fib6_table *table; unsigned int pref; unsigned long lifetime; struct fib6_info *rt; if (len < sizeof(struct route_info)) { return -EINVAL; } /* Sanity check for prefix_len and length */ if (rinfo->length > 3) { return -EINVAL; } else if (rinfo->prefix_len > 128) { return -EINVAL; } else if (rinfo->prefix_len > 64) { if (rinfo->length < 2) { return -EINVAL; } } else if (rinfo->prefix_len > 0) { if (rinfo->length < 1) { return -EINVAL; } } pref = rinfo->route_pref; if (pref == ICMPV6_ROUTER_PREF_INVALID) return -EINVAL; lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); if (rinfo->length == 3) prefix = (struct in6_addr *)rinfo->prefix; else { /* this function is safe */ ipv6_addr_prefix(&prefix_buf, (struct in6_addr *)rinfo->prefix, rinfo->prefix_len); prefix = &prefix_buf; } if 
(rinfo->prefix_len == 0) rt = rt6_get_dflt_router(net, gwaddr, dev); else rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev); if (rt && !lifetime) { ip6_del_rt(net, rt, false); rt = NULL; } if (!rt && lifetime) rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev, pref); else if (rt) rt->fib6_flags = RTF_ROUTEINFO | (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); if (!addrconf_finite_timeout(lifetime)) { fib6_clean_expires(rt); fib6_remove_gc_list(rt); } else { fib6_set_expires(rt, jiffies + HZ * lifetime); fib6_add_gc_list(rt); } spin_unlock_bh(&table->tb6_lock); fib6_info_release(rt); } return 0; } #endif /* * Misc support functions */ /* called with rcu_lock held */ static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) { struct net_device *dev = res->nh->fib_nh_dev; if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default */ if (netif_is_l3_slave(dev) && !rt6_need_strict(&res->f6i->fib6_dst.addr)) dev = l3mdev_master_dev_rcu(dev); else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; /* last case is netif_is_l3_master(dev) is true in which * case we want dev returned to be dev */ } return dev; } static const int fib6_prop[RTN_MAX + 1] = { [RTN_UNSPEC] = 0, [RTN_UNICAST] = 0, [RTN_LOCAL] = 0, [RTN_BROADCAST] = 0, [RTN_ANYCAST] = 0, [RTN_MULTICAST] = 0, [RTN_BLACKHOLE] = -EINVAL, [RTN_UNREACHABLE] = -EHOSTUNREACH, [RTN_PROHIBIT] = -EACCES, [RTN_THROW] = -EAGAIN, [RTN_NAT] = -EINVAL, [RTN_XRESOLVE] = -EINVAL, }; static int ip6_rt_type_to_error(u8 fib6_type) { return fib6_prop[fib6_type]; } static unsigned short fib6_info_dst_flags(struct fib6_info *rt) { unsigned short flags = 0; if (rt->dst_nocount) flags |= DST_NOCOUNT; if (rt->dst_nopolicy) flags |= DST_NOPOLICY; 
	return flags;
}

/*
 * Set up a reject dst: dst.error comes from ip6_rt_type_to_error() and the
 * input/output handlers are chosen by route type (blackhole: silent
 * discard; prohibit: ip6_pkt_prohibit*; everything else: ip6_pkt_discard*).
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

/*
 * Initialise the dst part of @rt from lookup result @res.
 *
 * Reject routes are delegated to ip6_rt_init_dst_reject().  Otherwise the
 * input handler depends on the route: local/anycast routes deliver locally,
 * multicast destinations use the multicast input/output handlers, all
 * others forward.  A lightweight tunnel state on the nexthop is attached to
 * the dst.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
		rt->dst.output = ip6_mr_output;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/*
 * Link @rt to the fib6_info it was created from: clears RTF_EXPIRES,
 * publishes rt->from and initialises the dst metrics from @from.
 * Caller must already hold reference to @from.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/*
 * Fill @rt from lookup result @res: dst handlers, destination prefix,
 * inet6_dev of the nexthop device, flags, gateway (when the nexthop has
 * one) and the link back to the fib6_info.
 * Caller must already hold reference to f6i in result.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ?
in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}

/*
 * Walk up from @fn towards the root until a node carrying route
 * information (RTN_RTINFO) is found.  When a parent has a subtree that is
 * not the one we came from, the search descends into it using @saddr.
 * Returns NULL once the top-level root is reached.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

/*
 * Try to take a reference on *@prt.  On success returns true and leaves
 * *@prt untouched.  On failure returns false and replaces *@prt with the
 * namespace's null entry (with a reference held) when @net is given, or
 * with NULL otherwise.
 */
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/*
 * Create a dst for lookup result @res.  If the fib6_info can no longer be
 * referenced or the allocation fails, the null entry (with a reference
 * held) is returned instead, so the result is never NULL.
 * Called with rcu_lock held.
 */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

/*
 * Policy-routing lookup in @table for flow @fl6.
 *
 * Finds the best matching node, narrows the result by device/source
 * address, backtracks towards the root while only the null entry matches,
 * then applies multipath selection.  A cached exception route is preferred;
 * otherwise a new dst is created.  Reject routes skip multipath selection
 * and the exception lookup.  The returned rt6_info always carries a
 * reference.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt =
net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	} else if (res.fib6_flags & RTF_REJECT) {
		goto do_create;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
do_create:
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}

/* Rule-based route lookup for @fl6 using ip6_pol_route_lookup() per table. */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

/*
 * Convenience lookup by destination (and optional source) address.
 *
 * A non-zero @strict requests RT6_LOOKUP_F_IFACE; a non-NULL @saddr is
 * copied into the flow and RT6_LOOKUP_F_HAS_SADDR is set.  Returns the
 * route with a reference held, or NULL when the lookup yields a dst with
 * an error (that dst is released here).
 */
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return dst_rt6_info(dst);

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with table->tb6_lock NOT held.
 * It takes a new route entry; if the addition fails for any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

/*
 * Insert @rt into its table (rt->fib6_table) under the table lock and
 * return fib6_add()'s result.
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

/* Insert @rt on behalf of namespace @net, without netlink extack. */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

/*
 * Allocate a host (/128) RTF_CACHE clone of the route in @res for @daddr.
 * For routes that have neither a gateway nor RTF_NONEXTHOP, the clone is
 * marked RTF_ANYCAST when @daddr equals the address of a non-/128 prefix,
 * and (with subtrees) a source-specific clone is narrowed to @saddr.
 * Returns NULL if the fib6_info cannot be referenced or allocation fails.
 */
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

/*
 * Allocate a per-CPU (RTF_PCPU) dst for the route in @res.  The dst is
 * allocated with DST_NOCOUNT.  For routes using a nexthop object the
 * current IPv6 route generation id is stored in sernum so that
 * rt6_is_valid() can detect stale copies.
 * Returns NULL if the fib6_info cannot be referenced or allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;

	if (f6i->nh)
		pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));

	return pcpu_rt;
}

/* True when @rt6's stored sernum equals the current IPv6 route generation id. */
static bool rt6_is_valid(const struct rt6_info *rt6)
{
	return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
}

/*
 * Return this CPU's cached dst for the nexthop in the result, or NULL.  A
 * cached dst with a non-zero sernum that no longer matches the current
 * generation is removed from the per-CPU slot, released, and NULL is
 * returned.
 * It should be called with rcu_read_lock() acquired.
 */
static
struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt;

	/* Returns this CPU's cached route for res->nh, or NULL when there is
	 * none or the cached one turned out to be stale.
	 */
	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);

	/* An entry that carries a sernum which no longer passes
	 * rt6_is_valid() is stale: detach it from this CPU's slot and
	 * release it.
	 */
	if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
		struct rt6_info *prev, **p;

		p = this_cpu_ptr(res->nh->rt6i_pcpu);
		/* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */
		prev = xchg(p, NULL);
		if (prev) {
			dst_dev_put(&prev->dst);
			dst_release(&prev->dst);
		}

		pcpu_rt = NULL;
	}

	return pcpu_rt;
}

/* Allocate a per-cpu route for @res and install it in this CPU's slot of
 * res->nh->rt6i_pcpu.
 *
 * Returns the installed route, the already-installed one if the slot was
 * not empty, or NULL if the allocation failed.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt)
		return NULL;

	p = this_cpu_ptr(res->nh->rt6i_pcpu);
	/* only install if the slot is still empty */
	prev = cmpxchg(p, NULL, pcpu_rt);
	if (unlikely(prev)) {
		/*
		 * Another task on this CPU already installed a pcpu_rt.
		 * This can happen on PREEMPT_RT where preemption is possible.
		 * Free our allocation and return the existing one.
		 */
		WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RT));

		dst_dev_put(&pcpu_rt->dst);
		dst_release(&pcpu_rt->dst);
		return prev;
	}

	/* The originating fib6_info is being destroyed: detach the new
	 * route from it and drop the fib6_info reference held via ->from.
	 */
	if (res->f6i->fib6_destroying) {
		struct fib6_info *from;

		from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
		fib6_info_release(from);
	}

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 *
 * A NULL @bucket or @rt6_ex makes this a no-op.
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	/* the entry itself is freed through kfree_rcu() */
	kfree_rcu(rt6_ex, rcu);
	/* depth must still account for the entry that was just unlinked */
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void
rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	/* find the entry with the earliest stamp */
	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	/* oldest stays NULL for an empty chain; rt6_remove_exception()
	 * returns early on a NULL entry.
	 */
	rt6_remove_exception(bucket, oldest);
}

/* Map a (dst, src) address pair to an exception bucket index.
 *
 * Both addresses are placed in one aligned struct and hashed with siphash
 * using a key that is filled in once by net_get_random_once(); hash_64()
 * then reduces the value to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits.
 * @src only contributes when CONFIG_IPV6_SUBTREES is enabled and @src is
 * non-NULL; otherwise the src half keeps its zero initialisation.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static siphash_aligned_key_t rt6_exception_key;
	struct {
		struct in6_addr dst;
		struct in6_addr src;
	} __aligned(SIPHASH_ALIGNMENT) combined = {
		.dst = *dst,
	};
	u64 val;

	net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		combined.src = *src;
#endif
	val = siphash(&combined, sizeof(combined), &rt6_exception_key);

	return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 *
 * Returns the matching entry or NULL.  *bucket is left untouched when it
 * is NULL on entry or when @daddr is NULL.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance from the start of the bucket array to the hashed slot */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* with a source key the source address has to match as well */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;
WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance from the start of the bucket array to the hashed slot */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* with a source key the source address has to match as well */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* MTU to use for a lookup result.
 *
 * Takes f6i->fib6_pmtu when it is non-zero, otherwise the cnf.mtu6 of the
 * nexthop device's inet6_dev (read under rcu_read_lock()).  The value is
 * capped at IP6_MAX_MTU and then reduced by the lwtunnel headroom of the
 * nexthop.
 */
static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		/* NOTE(review): idev is dereferenced without a NULL check;
		 * presumably the nexthop device always has an inet6_dev
		 * here - confirm.
		 */
		idev = __in6_dev_get(dev);
		mtu = READ_ONCE(idev->cnf.mtu6);
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}

/* Flag kept in the low bit of the nh->rt6i_exception_bucket pointer value */
#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL

/* used when the flushed bit is not relevant, only access to the bucket
 * (ie., all bucket users except rt6_insert_exception);
 *
 * called under rcu lock; sometimes called with rt6_exception_lock held
 *
 * @lock: when non-NULL the pointer is read with
 *        rcu_dereference_protected() using @lock as the lockdep condition,
 *        otherwise with plain rcu_dereference().
 *
 * Returns the bucket array pointer with the flushed bit masked off, or
 * NULL when no bucket array is attached.
 */
static
struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
						       spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;

	if (lock)
		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
						   lockdep_is_held(lock));
	else
		bucket = rcu_dereference(nh->rt6i_exception_bucket);

	/* remove bucket flushed bit if set */
	if (bucket) {
		unsigned long p = (unsigned long)bucket;

		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
		bucket = (struct rt6_exception_bucket *)p;
	}

	return bucket;
}

/* True when the (raw, unmasked) bucket pointer carries the flushed bit */
static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
{
	unsigned long p = (unsigned long)bucket;

	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
}

/* called with rt6_exception_lock held */
static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
					      spinlock_t *lock)
{
	struct rt6_exception_bucket *bucket;
	unsigned long p;

	bucket =
rcu_dereference_protected(nh->rt6i_exception_bucket,
					   lockdep_is_held(lock));

	/* set the flushed bit in the pointer value and publish it again */
	p = (unsigned long)bucket;
	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
	bucket = (struct rt6_exception_bucket *)p;
	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
}

/* Insert @nrt as an exception (cached) route on the nexthop res->nh.
 *
 * Done under rt6_exception_lock.  The bucket array is allocated on first
 * use.  An existing exception with the same key is removed before the new
 * one is linked in.
 *
 * Returns 0 on success, -ENOMEM when an allocation fails, or -EINVAL when
 * the bucket is marked flushed or @nrt's raw RTAX_MTU metric is not below
 * fib6_mtu(res).
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct fib6_info *f6i = res->f6i;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_nh *nh = res->nh;
	int max_depth;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* raw pointer read: the flushed bit must stay visible here */
	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
					  lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kzalloc_objs(*bucket, FIB6_EXCEPTION_BUCKET_SIZE,
				      GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
		err = -EINVAL;
		goto out;
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	/* this also moves bucket to the hash slot for the key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc_obj(*rt6_ex, GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* Randomize max depth to avoid some side channels attacks.
*/
	max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH);
	/* evict oldest entries until the chain fits the chosen depth */
	while (bucket->depth > max_depth)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		fib6_add_gc_list(f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

/* Remove exceptions from @nh while holding rt6_exception_lock.
 *
 * @from == NULL: every exception is removed and the bucket pointer is
 *                marked flushed.
 * @from != NULL: only exceptions whose rt6i->from equals @from are
 *                removed.
 */
static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		goto out;

	/* Prevent rt6_insert_exception() from recreating the bucket list */
	if (!from)
		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		/* _safe variant: entries are unlinked while iterating */
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
			if (!from ||
			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
				rt6_remove_exception(bucket, rt6_ex);
		}
		/* a full flush must leave the chain empty */
		WARN_ON_ONCE(!from && bucket->depth);
		bucket++;
	}
out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* nexthop_for_each_fib6_nh() callback: flush exceptions of @nh that come
 * from the fib6_info passed in @arg.  Always returns 0.
 */
static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
	struct fib6_info *f6i = arg;

	fib6_nh_flush_exceptions(nh, f6i);

	return 0;
}

/* Flush the exceptions that originate from @f6i: on every fib6_nh of its
 * nexthop object when f6i->nh is set (walked under rcu_read_lock()),
 * otherwise on f6i->fib6_nh.
 */
void rt6_flush_exceptions(struct fib6_info *f6i)
{
	if (f6i->nh) {
		rcu_read_lock();
		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i);
		rcu_read_unlock();
	} else {
		fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
	}
}

/* Find cached rt in the hash table of the nexthop in the passed in result
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6i_src.plen != 0 indicates f6i is in subtree
	 * and
exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	/* bucket is re-read on every pass because the lookup helper
	 * advances it to the hashed slot.
	 */
	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* an expired exception is treated as not found */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}

/* Remove the passed in cached rt from the hash table that contains it
 *
 * @plen: source prefix length of the originating route; non-zero means
 *        the source address is part of the hash key (only with
 *        CONFIG_IPV6_SUBTREES).
 *
 * Returns 0 when an entry was found and removed, -ENOENT when @nh has no
 * bucket array or no matching entry.
 */
static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
				    const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int err;

	/* unlocked test: skip taking the lock when no bucket is attached */
	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
*/
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Argument bundle for rt6_nh_remove_exception_rt() */
struct fib6_nh_excptn_arg {
	struct rt6_info	*rt;	/* cached route to remove */
	int		plen;	/* source prefix length of its origin route */
};

/* nexthop_for_each_fib6_nh() callback: try to remove arg->rt from @nh.
 * Returns 1 when it was removed, 0 otherwise.
 */
static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_excptn_arg *arg = _arg;
	int err;

	err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
	if (err == 0)
		return 1;

	return 0;
}

/* Remove the cached route @rt from the exception table of the route it
 * was created from (rt->from).
 *
 * rt->from is read with rcu_dereference(), so the caller presumably holds
 * rcu_read_lock() - confirm against callers.
 *
 * Returns 0 on success, -EINVAL when @rt has no origin or is not
 * RTF_CACHE, -ENOENT when no matching exception was found.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (from->nh) {
		struct fib6_nh_excptn_arg arg = {
			.rt = rt,
			.plen = from->fib6_src.plen
		};
		int rc;

		/* rc = 1 means an entry was found */
		rc = nexthop_for_each_fib6_nh(from->nh,
					      rt6_nh_remove_exception_rt,
					      &arg);
		return rc ? 0 : -ENOENT;
	}

	return fib6_nh_remove_exception(from->fib6_nh,
					from->fib6_src.plen, rt);
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 *
 * The bucket is read with plain rcu_dereference() (no lock is passed to
 * fib6_nh_get_excptn_bucket()), and the lookup helper used below expects
 * rcu_read_lock() to be held.
 */
static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
				     const struct rt6_info *rt)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
*/
	if (plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;
}

/* Search key and result slot for fib6_nh_find_match() */
struct fib6_nh_match_arg {
	const struct net_device *dev;	/* device the nexthop must use */
	const struct in6_addr	*gw;	/* gateway to match, may be NULL */
	struct fib6_nh		*match;	/* set to the matching nexthop */
};

/* determine if fib6_nh has given device and gateway
 *
 * A nexthop matches when its device equals arg->dev and either both sides
 * have no gateway, or both have one and the addresses are equal.
 * Returns 1 (and records the nexthop in arg->match) on a match, else 0.
 */
static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_match_arg *arg = _arg;

	if (arg->dev != nh->fib_nh_dev ||
	    (arg->gw && !nh->fib_nh_gw_family) ||
	    (!arg->gw && nh->fib_nh_gw_family) ||
	    (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
		return 0;

	arg->match = nh;

	/* found a match, break the loop */
	return 1;
}

/* Refresh the stamp of the exception entry that holds @rt.
 *
 * Runs under its own rcu_read_lock().  Nothing happens when @rt has no
 * origin route, is not RTF_CACHE, or - for origins using a nexthop
 * object - when none of its fib6_nh matches rt's device and gateway.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct fib6_info *from;
	struct fib6_nh *fib6_nh;

	rcu_read_lock();

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	if (from->nh) {
		/* .match is not named, so it starts out as NULL */
		struct fib6_nh_match_arg arg = {
			.dev = rt->dst.dev,
			.gw = &rt->rt6i_gateway,
		};

		nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);

		if (!arg.match)
			goto unlock;
		fib6_nh = arg.match;
	} else {
		fib6_nh = from->fib6_nh;
	}
	fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
unlock:
	rcu_read_unlock();
}

/* Decide whether the PMTU of cached route @rt may be set to @mtu */
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	u32 dmtu = dst6_mtu(&rt->dst);

	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
*/

	if (dmtu >= mtu)
		return true;

	if (dmtu == idev->cnf.mtu6)
		return true;

	return false;
}

/* Apply a new @mtu to the exceptions of @nh.
 *
 * Every entry that already has a raw RTAX_MTU metric and for which
 * rt6_mtu_change_route_allowed() agrees gets RTAX_MTU set to @mtu.
 * The bucket is fetched with rt6_exception_lock as the lockdep condition,
 * so the caller is expected to hold that lock.
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       const struct fib6_nh *nh, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

/* Both flags have to be set for the check in
 * fib6_nh_exceptions_clean_tohost() to match.
 */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove every exception of @nh that has both RTF_GATEWAY and RTF_CACHE
 * set and whose rt6i_gateway equals @gateway.  Takes rt6_exception_lock
 * itself; returns early without locking when no bucket array is attached.
 */
static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
					    const struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: entries are unlinked while iterating */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

/* Examine one exception during garbage collection: remove it when it has
 * aged out, expired, or points at a gateway that is not a router;
 * otherwise count it in gc_args->more.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g.
pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		/* no expiry set: age out on lastuse + gc timeout */
		if (time_after_eq(now, READ_ONCE(rt->dst.lastuse) +
				       gc_args->timeout)) {
			pr_debug("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, READ_ONCE(rt->dst.expires))) {
		pr_debug("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);

		/* no neighbour entry, or one without NTF_ROUTER: drop it */
		if (!(neigh && (neigh->flags & NTF_ROUTER))) {
			pr_debug("purging route %p via non-router but gateway\n",
				 rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	/* the entry survives this pass */
	gc_args->more++;
}

/* Run rt6_age_examine_exception() over every exception of @nh.
 *
 * The walk happens inside rcu_read_lock_bh() with rt6_exception_lock
 * held; nothing is locked when no bucket array is attached.
 */
static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
				   struct fib6_gc_args *gc_args,
				   unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			/* _safe variant: the callee may unlink the entry */
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* Argument bundle for rt6_nh_age_exceptions() */
struct fib6_nh_age_excptn_arg {
	struct fib6_gc_args	*gc_args;
	unsigned long		now;
};

/* nexthop_for_each_fib6_nh() callback: age the exceptions of @nh.
 * Always returns 0.
 */
static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
{
	struct fib6_nh_age_excptn_arg *arg = _arg;

	fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
	return 0;
}

/* Age the exceptions of @f6i: on every fib6_nh of its nexthop object when
 * f6i->nh is set, otherwise on f6i->fib6_nh.
 */
void rt6_age_exceptions(struct fib6_info *f6i,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	if (f6i->nh) {
		struct fib6_nh_age_excptn_arg arg = {
			.gc_args = gc_args,
			.now = now
		};

		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
					 &arg);
	} else {
		fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
	}
}

/* must be called
with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	/* The selected route is stored in @res; the return value is always 0. */
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		/* nothing usable at this node: backtrack and retry */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}

/* Policy route lookup in @table for flow @fl6 on interface @oif.
 *
 * Order of preference: a matching exception (cached) route, then - for
 * FLOWI_FLAG_KNOWN_NH flows on a nexthop without gateway - a freshly
 * allocated uncached RTF_CACHE clone, otherwise the per-cpu copy of the
 * selected route.  When nothing is found net->ipv6.ip6_null_entry is
 * returned.
 *
 * Unless RT6_LOOKUP_F_DST_NOREF is set in @flags a reference is taken on
 * the returned route; with that flag the caller must already be inside an
 * RCU read-side section (checked by the WARN_ON_ONCE below).
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt = NULL;
	int strict = 0;

	WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
		     !rcu_read_lock_held());

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* forwarding disabled: ask for reachable routes first */
	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry)
		goto out;

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		goto out;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		if (rt) {
			/* 1 refcnt is taken during ip6_rt_cache_alloc().
			 * As rt6_uncached_list_add() does not consume refcnt,
			 * this refcnt is always returned to the caller even
			 * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
*/
			rt6_uncached_list_add(rt);
			rcu_read_unlock();

			return rt;
		}
	} else {
		/* Get a percpu copy */
		local_bh_disable();
		rt = rt6_get_pcpu_route(&res);

		if (!rt)
			rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
	}
out:
	/* fall back to the null entry when no route could be produced */
	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
		ip6_hold_safe(net, &rt);
	rcu_read_unlock();

	return rt;
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

/* Input-side wrapper: ip6_pol_route() keyed on the flow's input
 * interface (fl6->flowi6_iif).
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

/* Rule-based input route lookup.  RT6_LOOKUP_F_IFACE is added to @flags
 * when rt6_need_strict() holds for the destination address and @dev is
 * not of type ARPHRD_PIMREG.
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

/* Fill the L3 multipath hash keys (addresses, flow label, protocol) in
 * @keys for @skb.
 *
 * For an ICMPv6 error whose embedded IPv6 header can be read, the keys
 * come from that inner header.  In every other case they come from the
 * pre-dissected @flkeys when supplied, else from the outer IPv6 header.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (!icmpv6_is_err(icmph->icmp6_type))
		goto out;

	/* the offending packet's header follows the ICMPv6 header */
	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* hash on the inner header; pre-dissected keys no longer apply */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* Custom-policy hash over the outer headers of @skb.
 *
 * Only the fields selected in ip6_multipath_hash_fields(net) are copied
 * from the dissected keys (dissection stops at encapsulation) into the
 * zeroed hash keys.  *p_has_inner is set to whether the dissector saw an
 * encapsulation.
 *
 * Returns 0 without dissecting, and without writing *p_has_inner, when no
 * outer field is selected.
 */
static u32 rt6_multipath_custom_hash_outer(const struct net *net,
					   const struct sk_buff *skb,
					   bool *p_has_inner)
{
	u32 hash_fields = ip6_multipath_hash_fields(net);
	struct flow_keys keys, hash_keys;

	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
		return 0;

	memset(&hash_keys, 0, sizeof(hash_keys));
	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);

	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
		hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
		hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
		hash_keys.basic.ip_proto = keys.basic.ip_proto;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
		hash_keys.tags.flow_label = keys.tags.flow_label;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
		hash_keys.ports.src = keys.ports.src;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
		hash_keys.ports.dst = keys.ports.dst;

	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
	return fib_multipath_hash_from_keys(net, &hash_keys);
}

/* Custom-policy hash over the inner (encapsulated) headers of @skb */
static u32 rt6_multipath_custom_hash_inner(const struct net *net,
					   const struct sk_buff *skb,
					   bool has_inner)
{
	u32 hash_fields = ip6_multipath_hash_fields(net);
	struct flow_keys keys, hash_keys;

	/* We assume the packet carries an encapsulation, but if none was
	 * encountered during dissection of the outer flow, then there is no
	 * point in calling the flow dissector again.
*/
	if (!has_inner)
		return 0;

	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
		return 0;

	memset(&hash_keys, 0, sizeof(hash_keys));
	/* flags == 0: dissect without stopping at the encapsulation */
	skb_flow_dissect_flow_keys(skb, &keys, 0);

	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
		return 0;

	/* the inner addresses may be IPv4 or IPv6 */
	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
			hash_keys.tags.flow_label = keys.tags.flow_label;
	}

	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
		hash_keys.basic.ip_proto = keys.basic.ip_proto;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
		hash_keys.ports.src = keys.ports.src;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
		hash_keys.ports.dst = keys.ports.dst;

	return fib_multipath_hash_from_keys(net, &hash_keys);
}

/* Custom-policy hash for a packet: the outer and inner hashes combined
 * with jhash_2words().
 */
static u32 rt6_multipath_custom_hash_skb(const struct net *net,
					 const struct sk_buff *skb)
{
	u32 mhash, mhash_inner;
	/* stays true when the outer pass returns before writing it */
	bool has_inner = true;

	mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner);
	mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner);

	return jhash_2words(mhash, mhash_inner, 0);
}

/* Custom-policy hash for a flow description (no packet available).
 * Returns 0 when no outer field is selected in the hash-fields mask.
 */
static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
					 const struct flowi6 *fl6)
{
	u32 hash_fields = ip6_multipath_hash_fields(net);
	struct flow_keys hash_keys;

	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
		return 0;

	memset(&hash_keys, 0,
sizeof(hash_keys));
	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
		hash_keys.addrs.v6addrs.src = fl6->saddr;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
		hash_keys.addrs.v6addrs.dst = fl6->daddr;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
		hash_keys.basic.ip_proto = fl6->flowi6_proto;
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
		hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
		/* FLOWI_FLAG_ANY_SPORT: use a random source port value */
		if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
			hash_keys.ports.src = (__force __be16)get_random_u16();
		else
			hash_keys.ports.src = fl6->fl6_sport;
	}
	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
		hash_keys.ports.dst = fl6->fl6_dport;

	return fib_multipath_hash_from_keys(net, &hash_keys);
}

/* if skb is set it will be used and fl6 can be NULL
 *
 * Computes the multipath hash according to ip6_multipath_hash_policy():
 * each case builds its own set of hash keys from the packet or the flow.
 * The result is the hash shifted right by one bit.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash = 0;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		/* policy 0: addresses, flow label and protocol */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		mhash = fib_multipath_hash_from_keys(net, &hash_keys);
		break;
	case 1:
		/* policy 1: addresses, ports and protocol */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			/* dissect here only if the caller did not do it */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			/* no packet: take the same fields from the flow */
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			/* FLOWI_FLAG_ANY_SPORT: use a random source port value */
			if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
				hash_keys.ports.src = (__force __be16)get_random_u16();
			else
				hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		mhash = fib_multipath_hash_from_keys(net, &hash_keys);
		break;
	case 2:
		/* policy 2: keys from a full dissection of the packet (no
		 * stop-at-encap flag), falling back to the policy 0 keys
		 */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			struct flow_keys keys;

			/* dissect here only if the caller did not do it */
			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, 0);
				flkeys = &keys;
			}

			/* Inner can be v4 or v6 */
			if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
				hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			} else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
				hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
				hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
				hash_keys.tags.flow_label = flkeys->tags.flow_label;
				hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
			} else {
				/* Same as case 0 */
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
				ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
			}
		} else {
			/* Same as case 0 */
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		mhash = fib_multipath_hash_from_keys(net,
&hash_keys);
		break;
	case 3:
		/* policy 3: custom field selection */
		if (skb)
			mhash = rt6_multipath_custom_hash_skb(net, skb);
		else
			mhash = rt6_multipath_custom_hash_fl6(net, fl6);
		break;
	}

	/* an unhandled policy value leaves mhash at its initial 0 */
	return mhash >> 1;
}

/* Called with rcu held
 *
 * Route an incoming packet: build a flow from the IPv6 header, look it
 * up, and attach the result to @skb without taking a reference (the
 * lookup runs with RT6_LOOKUP_F_DST_NOREF).
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* copy the tunnel id into the flow unless the tunnel info is
	 * flagged IP_TUNNEL_INFO_TX
	 */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* the multipath hash is precomputed here for ICMPv6 packets only */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
						      &fl6, skb, flags));
}

/* Output-side wrapper: ip6_pol_route() keyed on the flow's output
 * interface (fl6->flowi6_oif).
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

/* Output route lookup that does not take a reference on the result.
 *
 * Multicast and link-local destinations are first offered to
 * l3mdev_link_scope_lookup().  Otherwise the lookup flags are derived
 * from the socket binding, the destination and the source address, and
 * the rule lookup runs with RT6_LOOKUP_F_DST_NOREF set.
 */
static struct dst_entry *ip6_route_output_flags_noref(struct net *net,
						      const struct sock *sk,
						      struct flowi6 *fl6,
						      int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		/* This function does not take refcnt on the dst */
		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	flags |= RT6_LOOKUP_F_DST_NOREF;
	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |=
rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs)); return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { struct dst_entry *dst; struct rt6_info *rt6; rcu_read_lock(); dst = ip6_route_output_flags_noref(net, sk, fl6, flags); rt6 = dst_rt6_info(dst); /* For dst cached in uncached_list, refcnt is already taken. */ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) { dst = &net->ipv6.ip6_null_entry->dst; dst_hold(dst); } rcu_read_unlock(); return dst; } EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rt6_info *rt, *ort = dst_rt6_info(dst_orig); struct net_device *loopback_dev = net->loopback_dev; struct dst_entry *new = NULL; rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard_out; dst_copy_metrics(new, &ort->dst); rt->rt6i_idev = in6_dev_get(loopback_dev); rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); #ifdef CONFIG_IPV6_SUBTREES memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); #endif } dst_release(dst_orig); return new ? 
/*
 * rt6_check - validate a cached rt6_info against its originating fib6_info
 * @rt:     route being validated
 * @from:   fib entry the route was created from (may be NULL)
 * @cookie: cookie recorded by the holder of the dst
 *
 * Returns &rt->dst when @from exists, its cookie can be read and equals
 * @cookie, and @rt has not expired; NULL otherwise.
 */
static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 current_cookie = 0;

	if (!from)
		return NULL;

	if (!fib6_get_cookie_safe(from, &current_cookie))
		return NULL;

	if (current_cookie != cookie)
		return NULL;

	return rt6_check_expired(rt) ? NULL : &rt->dst;
}
/*
 * __ip6_rt_update_pmtu - record a smaller path MTU for a route
 * @dst:           dst entry the MTU report applies to
 * @sk:            socket used to derive addresses when @iph is NULL (may be NULL)
 * @iph:           IPv6 header used to derive addresses (may be NULL)
 * @mtu:           reported MTU, host byte order
 * @confirm_neigh: when true, confirm the neighbour for the destination first
 *
 * Addresses come from @iph if given, else from @sk, else both are NULL.
 * Reports below IPV6_MIN_MTU or not smaller than the current dst6_mtu() are
 * ignored. When rt6_cache_allowed_for_pmtu() is false the MTU is written to
 * @dst itself; otherwise, if a destination address is known, a new cache
 * entry is allocated, given the MTU and inserted as an exception of the
 * originating fib entry.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu,
				 bool confirm_neigh)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = dst_rt6_info(dst);

	/* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
	 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
	 * [see also comment in rt6_mtu_change_route()]
	 */

	/* Pick the address pair: packet header first, then socket. */
	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}

	if (confirm_neigh)
		dst_confirm_neigh(dst, daddr);

	/* Only a valid MTU that actually lowers the current one is used. */
	if (mtu < IPV6_MIN_MTU)
		return;
	if (mtu >= dst6_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		/* Update this dst in place. */
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i)
			goto out_unlock;

		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;

		if (res.f6i->nh) {
			/* Nexthop object: find the fib6_nh matching this
			 * dst's device and gateway.
			 */
			struct fib6_nh_match_arg arg = {
				.dev = dst_dev_rcu(dst),
				.gw = &rt6->rt6i_gateway,
			};

			nexthop_for_each_fib6_nh(res.f6i->nh,
						 fib6_nh_find_match, &arg);

			/* fib6_info uses a nexthop that does not have fib6_nh
			 * using the dst->dev + gw. Should be impossible.
			 */
			if (!arg.match)
				goto out_unlock;

			res.nh = arg.match;
		} else {
			res.nh = res.f6i->fib6_nh;
		}

		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* A non-zero return means the entry was not
			 * inserted; release it right away.
			 */
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
out_unlock:
		rcu_read_unlock();
	}
}
/*
 * ip6_sk_update_pmtu - apply a path MTU report on behalf of a socket
 * @skb: packet carrying the offending IPv6 header at skb->data
 * @sk:  socket the report belongs to
 * @mtu: reported MTU, network byte order
 *
 * Uses the socket's bound device as output interface, falling back to the
 * l3mdev master of skb->dev when the socket is unbound. After the route
 * update, a cached socket dst that no longer passes its ->check() is
 * refreshed, provided the socket is not owned by user context and the
 * peer address is not IPv4-mapped.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int ifindex = sk->sk_bound_dev_if;
	struct dst_entry *cached;

	if (!ifindex && skb->dev)
		ifindex = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, ifindex,
			READ_ONCE(sk->sk_mark), sk_uid(sk));

	/* Nothing more to do unless the socket caches a dst that needs
	 * validation and fails it.
	 */
	cached = __sk_dst_get(sk);
	if (!cached)
		return;
	if (!READ_ONCE(cached->obsolete))
		return;
	if (cached->ops->check(cached, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!(sock_owned_by_user(sk) || ipv6_addr_v4mapped(&sk->sk_v6_daddr)))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/*
 * __ip6_route_redirect - table lookup used to validate an ICMPv6 redirect
 * @net:   network namespace
 * @table: FIB table to search
 * @fl6:   flow key; treated as the leading member of a struct ip6rd_flowi,
 *         whose ->gateway is the address the redirect came from
 * @skb:   unused in this function
 * @flags: unused in this function
 *
 * Walks the routes for the flow's destination looking for a nexthop that
 * matches the redirecting gateway (ip6_redirect_nh_match()). A matching
 * cached route is returned with a reference; otherwise a new rt6_info is
 * created from the selected fib entry. A reject route yields
 * ip6_null_entry.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL;
	struct fib6_result res = {};
	struct fib6_nh_rd_arg arg = {
		.res = &res,
		.fl6 = fl6,
		.gw  = &rdfl->gateway,
		.ret = &ret
	};
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		res.f6i = rt;
		if (fib6_check_expired(rt))
			continue;
		/* A reject route ends the walk; handled after the loop. */
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (unlikely(rt->nh)) {
			if (nexthop_is_blackhole(rt->nh))
				continue;
			/* on match, res->nh is filled in and potentially ret */
			if (nexthop_for_each_fib6_nh(rt->nh,
						     fib6_nh_redirect_match,
						     &arg))
				goto out;
		} else {
			res.nh = rt->fib6_nh;
			if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
						  &ret))
				goto out;
		}
	}

	/* No nexthop matched: rt is NULL when the walk ran off the end. */
	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* Retry from the node returned by fib6_backtrack(), if any. */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	res.f6i = rt;
	res.nh = rt->fib6_nh;
out:
	if (ret) {
		/* Existing route (cached exception or null entry). */
		ip6_hold_safe(net, &ret);
	} else {
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;
		ret = ip6_create_rt_rcu(&res);
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, &res, table, fl6);
	return ret;
};
struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_oif = oif, .daddr = msg->dest, .saddr = iph->daddr, .flowi6_uid = sock_net_uid(net, NULL), }; dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), sk_uid(sk)); } EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { unsigned int mtu = dst6_mtu(dst); struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); rcu_read_lock(); net = dst_dev_net_rcu(dst); mtu = max_t(unsigned int, mtu, READ_ONCE(net->ipv6.sysctl.ip6_rt_min_advmss)); rcu_read_unlock(); /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. * IPV6_MAXPLEN is also valid and means: "any MSS, * rely only on pmtu discovery" */ if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) mtu = IPV6_MAXPLEN; return mtu; } INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) { return ip6_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ip6_mtu); /* MTU selection: * 1. mtu on route is locked - use it * 2. mtu from nexthop exception * 3. 
mtu from egress device * * based on ip6_dst_mtu_forward and exception logic of * rt6_find_cached_rt; called with rcu_read_lock */ u32 ip6_mtu_from_fib6(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { const struct fib6_nh *nh = res->nh; struct fib6_info *f6i = res->f6i; struct inet6_dev *idev; struct rt6_info *rt; u32 mtu = 0; if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { mtu = f6i->fib6_pmtu; if (mtu) goto out; } rt = rt6_find_cached_rt(res, daddr, saddr); if (unlikely(rt)) { mtu = dst_metric_raw(&rt->dst, RTAX_MTU); } else { struct net_device *dev = nh->fib_nh_dev; mtu = IPV6_MIN_MTU; idev = __in6_dev_get(dev); if (idev) mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6)); } mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); out: return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6) { struct dst_entry *dst; struct rt6_info *rt; struct inet6_dev *idev = in6_dev_get(dev); struct net *net = dev_net(dev); if (unlikely(!idev)) return ERR_PTR(-ENODEV); rt = ip6_dst_alloc(net, dev, 0); if (unlikely(!rt)) { in6_dev_put(idev); dst = ERR_PTR(-ENOMEM); goto out; } rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); /* Add this dst into uncached_list so that rt6_disable_ip() can * do proper release of the net_device */ rt6_uncached_list_add(rt); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); out: return dst; } static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_min_interval); int rt_elasticity = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_elasticity); int rt_gc_timeout = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_timeout); unsigned long rt_last_gc = 
READ_ONCE(net->ipv6.ip6_rt_last_gc); unsigned int val; int entries; if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: val = atomic_read(&net->ipv6.ip6_rt_gc_expire); atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); } static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, const struct in6_addr *gw_addr, u32 tbid, int flags, struct fib6_result *res) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, .daddr = *gw_addr, .saddr = cfg->fc_prefsrc, }; struct fib6_table *table; int err; table = fib6_get_table(net, tbid); if (!table) return -EINVAL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); if (!err && res->f6i != net->ipv6.fib6_null_entry) fib6_select_path(net, res, &fl6, cfg->fc_ifindex, cfg->fc_ifindex != 0, NULL, flags); return err; } static int ip6_route_check_nh_onlink(struct net *net, struct fib6_config *cfg, const struct net_device *dev, struct netlink_ext_ack *extack) { u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; struct fib6_result res = {}; int err; err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); if (!err && !(res.fib6_flags & RTF_REJECT) && res.fib6_type != RTN_UNICAST) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); err = -EINVAL; } return err; } static int ip6_route_check_nh(struct net *net, struct fib6_config *cfg, struct net_device **_dev, netdevice_tracker *dev_tracker, struct inet6_dev **idev) { const struct in6_addr *gw_addr = &cfg->fc_gateway; struct net_device *dev = _dev ? 
*_dev : NULL; int flags = RT6_LOOKUP_F_IFACE; struct fib6_result res = {}; int err = -EHOSTUNREACH; if (cfg->fc_table) { err = ip6_nh_lookup_table(net, cfg, gw_addr, cfg->fc_table, flags, &res); /* gw_addr can not require a gateway or resolve to a reject * route. If a device is given, it must match the result. */ if (err || res.fib6_flags & RTF_REJECT || res.nh->fib_nh_gw_family || (dev && dev != res.nh->fib_nh_dev)) err = -EHOSTUNREACH; } if (err < 0) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, .daddr = *gw_addr, }; err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); if (err || res.fib6_flags & RTF_REJECT || res.nh->fib_nh_gw_family) err = -EHOSTUNREACH; if (err) return err; fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, cfg->fc_ifindex != 0, NULL, flags); } err = 0; if (dev) { if (dev != res.nh->fib_nh_dev) err = -EHOSTUNREACH; } else { *_dev = dev = res.nh->fib_nh_dev; netdev_hold(dev, dev_tracker, GFP_ATOMIC); *idev = in6_dev_get(dev); } return err; } static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, struct net_device **_dev, netdevice_tracker *dev_tracker, struct inet6_dev **idev, struct netlink_ext_ack *extack) { const struct in6_addr *gw_addr = &cfg->fc_gateway; int gwa_type = ipv6_addr_type(gw_addr); bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; const struct net_device *dev = *_dev; bool need_addr_check = !dev; int err = -EINVAL; /* if gw_addr is local we will fail to detect this in case * address is still TENTATIVE (DAD in progress). rt6_lookup() * will return already-added prefix route via interface that * prefix route was assigned to, which might be non-loopback. */ if (dev && ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); goto out; } if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { /* IPv6 strictly inhibits using not link-local * addresses as nexthop address. 
* Otherwise, router will not able to send redirects. * It is very good, but in some (rare!) circumstances * (SIT, PtP, NBMA NOARP links) it is handy to allow * some exceptions. --ANK * We allow IPv4-mapped nexthops to support RFC4798-type * addressing */ if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { NL_SET_ERR_MSG(extack, "Invalid gateway address"); goto out; } rcu_read_lock(); if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else err = ip6_route_check_nh(net, cfg, _dev, dev_tracker, idev); rcu_read_unlock(); if (err) goto out; } /* reload in case device was changed */ dev = *_dev; err = -EINVAL; if (!dev) { NL_SET_ERR_MSG(extack, "Egress device not specified"); goto out; } else if (dev->flags & IFF_LOOPBACK) { NL_SET_ERR_MSG(extack, "Egress device can not be loopback device for this route"); goto out; } /* if we did not check gw_addr above, do so now that the * egress device has been resolved. */ if (need_addr_check && ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); goto out; } err = 0; out: return err; } static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) { if ((flags & RTF_REJECT) || (dev && (dev->flags & IFF_LOOPBACK) && !(addr_type & IPV6_ADDR_LOOPBACK) && !(flags & (RTF_ANYCAST | RTF_LOCAL)))) return true; return false; } int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; int addr_type; int err; fib6_nh->fib_nh_family = AF_INET6; #ifdef CONFIG_IPV6_ROUTER_PREF fib6_nh->last_probe = jiffies; #endif if (cfg->fc_is_fdb) { fib6_nh->fib_nh_gw6 = cfg->fc_gateway; fib6_nh->fib_nh_gw_family = AF_INET6; return 0; } err = -ENODEV; if (cfg->fc_ifindex) { dev = netdev_get_by_index(net, cfg->fc_ifindex, 
dev_tracker, gfp_flags); if (!dev) goto out; idev = in6_dev_get(dev); if (!idev) goto out; } if (cfg->fc_flags & RTNH_F_ONLINK) { if (!dev) { NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); goto out; } if (!(dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; } fib6_nh->fib_nh_weight = 1; /* We cannot add true routes via loopback here, * they would result in kernel looping; promote them to reject routes */ addr_type = ipv6_addr_type(&cfg->fc_dst); if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { /* hold loopback dev/idev if we haven't done so. */ if (dev != net->loopback_dev) { if (dev) { netdev_put(dev, dev_tracker); in6_dev_put(idev); } dev = net->loopback_dev; netdev_hold(dev, dev_tracker, gfp_flags); idev = in6_dev_get(dev); if (!idev) { err = -ENODEV; goto out; } } goto pcpu_alloc; } if (cfg->fc_flags & RTF_GATEWAY) { err = ip6_validate_gw(net, cfg, &dev, dev_tracker, &idev, extack); if (err) goto out; fib6_nh->fib_nh_gw6 = cfg->fc_gateway; fib6_nh->fib_nh_gw_family = AF_INET6; } err = -ENODEV; if (!dev) goto out; if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); err = -EACCES; goto out; } if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && !netif_carrier_ok(dev)) fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) goto out; pcpu_alloc: fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); if (!fib6_nh->rt6i_pcpu) { err = -ENOMEM; goto out; } fib6_nh->fib_nh_dev = dev; fib6_nh->fib_nh_oif = dev->ifindex; err = 0; out: if (idev) in6_dev_put(idev); if (err) { fib_nh_common_release(&fib6_nh->nh_common); fib6_nh->nh_common.nhc_pcpu_rth_output = 
NULL; fib6_nh->fib_nh_lws = NULL; netdev_put(dev, dev_tracker); } return err; } void fib6_nh_release(struct fib6_nh *fib6_nh) { struct rt6_exception_bucket *bucket; rcu_read_lock(); fib6_nh_flush_exceptions(fib6_nh, NULL); bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); if (bucket) { rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); kfree(bucket); } rcu_read_unlock(); fib6_nh_release_dsts(fib6_nh); free_percpu(fib6_nh->rt6i_pcpu); fib_nh_common_release(&fib6_nh->nh_common); } void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) { int cpu; if (!fib6_nh->rt6i_pcpu) return; for_each_possible_cpu(cpu) { struct rt6_info *pcpu_rt, **ppcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = xchg(ppcpu_rt, NULL); if (pcpu_rt) { dst_dev_put(&pcpu_rt->dst); dst_release(&pcpu_rt->dst); } } } static int fib6_config_validate(struct fib6_config *cfg, struct netlink_ext_ack *extack) { /* RTF_PCPU is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_PCPU) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); goto errout; } /* RTF_CACHE is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_CACHE) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); goto errout; } if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); goto errout; } if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto errout; } #ifdef CONFIG_IPV6_SUBTREES if (cfg->fc_src_len > 128) { NL_SET_ERR_MSG(extack, "Invalid source address length"); goto errout; } if (cfg->fc_nh_id && cfg->fc_src_len) { NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); goto errout; } #else if (cfg->fc_src_len) { NL_SET_ERR_MSG(extack, "Specifying source address requires IPV6_SUBTREES to be enabled"); goto errout; } #endif return 0; errout: return -EINVAL; } static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net 
= cfg->fc_nlinfo.nl_net; struct fib6_table *table; struct fib6_info *rt; int err; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { table = fib6_get_table(net, cfg->fc_table); if (!table) { pr_warn("NLM_F_CREATE should be specified when creating new route\n"); table = fib6_new_table(net, cfg->fc_table); } } else { table = fib6_new_table(net, cfg->fc_table); } if (!table) { err = -ENOBUFS; goto err; } rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id); if (!rt) { err = -ENOMEM; goto err; } rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); goto free; } if (cfg->fc_flags & RTF_ADDRCONF) rt->dst_nocount = true; if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; rt->fib6_protocol = cfg->fc_protocol; rt->fib6_table = table; rt->fib6_metric = cfg->fc_metric; rt->fib6_type = cfg->fc_type ? 
/*
 * ip6_route_info_create_nh - attach nexthop information to a new fib6_info
 * @rt:        route being built
 * @cfg:       route configuration
 * @gfp_flags: allocation flags passed to fib6_nh_init()
 * @extack:    netlink extended ack for error messages
 *
 * With cfg->fc_nh_id set, looks up that nexthop object under RCU, checks
 * it and takes a reference; otherwise initialises the route's own fib6_nh.
 * A non-any cfg->fc_prefsrc is then validated and stored.
 *
 * Returns 0 on success. On error @rt has already been disposed of by this
 * function (released via fib6_info_release(), or freed directly on the
 * nexthop-object path), so the caller must not touch it again.
 */
static int ip6_route_info_create_nh(struct fib6_info *rt,
				    struct fib6_config *cfg,
				    gfp_t gfp_flags,
				    struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_nh *fib6_nh;
	int err;

	if (cfg->fc_nh_id) {
		struct nexthop *nh;

		rcu_read_lock();

		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
		if (!nh) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
			goto out_free;
		}

		err = fib6_check_nexthop(nh, cfg, extack);
		if (err)
			goto out_free;

		/* The lookup took no reference; get one before leaving RCU. */
		if (!nexthop_get(nh)) {
			NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
			err = -ENOENT;
			goto out_free;
		}

		rt->nh = nh;
		fib6_nh = nexthop_fib6_nh(rt->nh);

		rcu_read_unlock();
	} else {
		int addr_type;

		err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
		if (err)
			goto out_release;

		fib6_nh = rt->fib6_nh;

		/* We cannot add true routes via loopback here, they would
		 * result in kernel looping; promote them to reject routes
		 */
		addr_type = ipv6_addr_type(&cfg->fc_dst);
		if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
				   addr_type))
			rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_nh->fib_nh_dev;

		/* The preferred source is checked against the nexthop device. */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out_release;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	}

	return 0;
out_release:
	/* Drop the reference through the regular release path. */
	fib6_info_release(rt);
	return err;
out_free:
	/* Nexthop-object failure: leave RCU, then put the metrics and free
	 * rt directly.
	 */
	rcu_read_unlock();
	ip_fib_metrics_put(rt->fib6_metrics);
	kfree(rt);
	return err;
}
err; err = fib6_config_validate(cfg, extack); if (err) return err; rt = ip6_route_info_create(cfg, gfp_flags, extack); if (IS_ERR(rt)) return PTR_ERR(rt); err = ip6_route_info_create_nh(rt, cfg, gfp_flags, extack); if (err) return err; err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); fib6_info_release(rt); return err; } static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_table *table; int err; if (rt == net->ipv6.fib6_null_entry) { err = -ENOENT; goto out; } table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); spin_unlock_bh(&table->tb6_lock); out: fib6_info_release(rt); return err; } int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) { struct nl_info info = { .nl_net = net, .skip_notify = skip_notify }; return __ip6_del_rt(rt, &info); } static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) { struct nl_info *info = &cfg->fc_nlinfo; struct net *net = info->nl_net; struct sk_buff *skb = NULL; struct fib6_table *table; int err = -ENOENT; if (rt == net->ipv6.fib6_null_entry) goto out_put; table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { struct fib6_info *sibling, *next_sibling; struct fib6_node *fn; /* prefer to send a single notification with all hops */ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (skb) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; if (rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) { kfree_skb(skb); skb = NULL; } else info->skip_notify = 1; } /* 'rt' points to the first sibling route. If it is not the * leaf, then we do not need to send a notification. Otherwise, * we need to check if the last sibling has a next route or not * and emit a replace or delete notification, respectively. 
*/ info->skip_notify_kernel = 1; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (rcu_access_pointer(fn->leaf) == rt) { struct fib6_info *last_sibling, *replace_rt; last_sibling = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); replace_rt = rcu_dereference_protected( last_sibling->fib6_next, lockdep_is_held(&table->tb6_lock)); if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); else call_fib6_multipath_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, rt->fib6_nsiblings, NULL); } list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { err = fib6_del(sibling, info); if (err) goto out_unlock; } } err = fib6_del(rt, info); out_unlock: spin_unlock_bh(&table->tb6_lock); out_put: fib6_info_release(rt); if (skb) { rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); } return err; } static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) { int rc = -ESRCH; if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) goto out; if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) goto out; rc = rt6_remove_exception_rt(rt); out: return rc; } static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, struct fib6_nh *nh) { struct fib6_result res = { .f6i = rt, .nh = nh, }; struct rt6_info *rt_cache; rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); if (rt_cache) return __ip6_del_cached_rt(rt_cache, cfg); return 0; } struct fib6_nh_del_cached_rt_arg { struct fib6_config *cfg; struct fib6_info *f6i; }; static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) { struct fib6_nh_del_cached_rt_arg *arg = _arg; int rc; rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); return rc != -ESRCH ? 
/*
 * ip6_route_del - delete the route described by a configuration
 * @cfg:    delete request (table, prefix, and optional filters: nexthop id,
 *          metric, protocol, interface, gateway)
 * @extack: netlink extended ack for error messages
 *
 * Locates the fib6 node for the prefix under RCU and walks its routes.
 * With RTF_CACHE set in cfg->fc_flags only cached exception routes are
 * considered. Otherwise the first route passing all filters is deleted:
 * a single route when it uses a nexthop object or when a gateway was
 * given, else the route together with its siblings.
 *
 * Returns -ESRCH when the table or a matching route is not found,
 * otherwise the result of the delete helper that was called.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* The last argument is false when the request targets RTF_CACHE. */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			struct fib6_nh *nh;

			/* Both sides name a nexthop object: ids must agree. */
			if (rt->nh && cfg->fc_nh_id &&
			    rt->nh->id != cfg->fc_nh_id)
				continue;

			if (cfg->fc_flags & RTF_CACHE) {
				int rc = 0;

				if (rt->nh) {
					rc = ip6_del_cached_rt_nh(cfg, rt);
				} else if (cfg->fc_nh_id) {
					continue;
				} else {
					nh = rt->fib6_nh;
					rc = ip6_del_cached_rt(cfg, rt, nh);
				}
				/* Anything but -ESRCH ends the search. */
				if (rc != -ESRCH) {
					rcu_read_unlock();
					return rc;
				}
				continue;
			}

			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol &&
			    cfg->fc_protocol != rt->fib6_protocol)
				continue;

			if (rt->nh) {
				/* Skip the entry if no reference can be taken. */
				if (!fib6_info_hold_safe(rt))
					continue;

				err = __ip6_del_rt(rt, &cfg->fc_nlinfo);
				break;
			}
			/* Request names a nexthop id but this route has none. */
			if (cfg->fc_nh_id)
				continue;

			nh = rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			if (!fib6_info_hold_safe(rt))
				continue;

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				err = __ip6_del_rt(rt, &cfg->fc_nlinfo);
			else
				err = __ip6_del_rt_siblings(rt, cfg);
			break;
		}
	}
	rcu_read_unlock();

	return err;
}
fib6_result res = {};
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* Number of bytes that follow the fixed struct rd_msg header. */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* Target equal to destination means the destination is on-link;
	 * otherwise the target must be a link-local unicast address.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Ignore redirects when forwarding is enabled or accept_redirects
	 * is disabled on the receiving interface.
	 */
	if (READ_ONCE(in6_dev->cnf.forwarding) ||
	    !READ_ONCE(in6_dev->cnf.accept_redirects))
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	/* The target link-layer address option is optional; when present it
	 * must decode to a usable address.
	 */
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = dst_rt6_info(dst);
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	/* The reference obtained here is dropped via neigh_release() at the
	 * "out" label.
	 */
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	/* The router flags are only requested when the target is not
	 * on-link.
	 */
	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	res.f6i = rcu_dereference(rt->from);
	if (!res.f6i)
		goto out;

	if (res.f6i->nh) {
		struct fib6_nh_match_arg arg = {
			.dev = dst_dev_rcu(dst),
			.gw = &rt->rt6i_gateway,
		};

		nexthop_for_each_fib6_nh(res.f6i->nh,
					 fib6_nh_find_match, &arg);

		/* fib6_info uses a nexthop that does not have fib6_nh
		 * using the dst->dev. Should be impossible
		 */
		if (!arg.match)
			goto out;
		res.nh = arg.match;
	} else {
		res.nh = res.f6i->fib6_nh;
	}

	res.fib6_flags = res.f6i->fib6_flags;
	res.fib6_type = res.f6i->fib6_type;
	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	/* The neighbour's key (the redirect target) becomes the gateway. */
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, &res)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}

#ifdef CONFIG_IPV6_ROUTE_INFO
/* Find the RTF_ROUTEINFO route for @prefix/@prefixlen whose gateway is
 * @gwaddr and whose nexthop device is @dev.
 *
 * On a match the route is returned after fib6_info_hold_safe() succeeded
 * on it. NULL is returned when the table or the node does not exist.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		/* these routes do not use nexthops */
		if (rt->nh)
			continue;
		if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh->fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
			continue;
		/* Only return an entry a reference could be taken on. */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
	/* NOTE(review): when the walk ends without a break, rt is presumably
	 * NULL - confirm against for_each_fib6_node_rt_rcu().
	 */
out:
	rcu_read_unlock();
	return rt;
}

/* Install a route-information route (RTPROT_RA, RTF_ROUTEINFO) for
 * @prefix/@prefixlen via @gwaddr on @dev with router preference @pref.
 *
 * The return value of ip6_route_add() is not checked; the function returns
 * whatever rt6_get_route_info() finds afterwards.
 */
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	/* GNU "?:": use the l3mdev table id when it is non-zero, otherwise
	 * fall back to RT6_TABLE_INFO.
	 */
	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;

	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg, GFP_ATOMIC, NULL);

	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif

/* Find the default-router route (RTF_ADDRCONF | RTF_DEFAULT) on @dev whose
 * gateway is @addr.
 *
 * Returns the route after fib6_info_hold_safe() succeeded on it, or NULL
 * when the table is missing, nothing matched, or the hold failed.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* Default-router routes are searched directly on the table root. */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh;

		/* RA routes do not use nexthops */
		if (rt->nh)
			continue;

		nh = rt->fib6_nh;
		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	/* A match that can no longer be referenced is reported as "none". */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}

/* Add a default route learned via router advertisement through @gwaddr on
 * @dev.
 *
 * @pref is encoded into the route flags, @defrtr_usr_metric becomes the
 * metric and @lifetime (multiplied by HZ before conversion) the expiry.
 * When ip6_route_add() succeeds the table is flagged with
 * RT6_TABLE_HAS_DFLT_ROUTER. The function returns whatever
 * rt6_get_dflt_router() finds afterwards, regardless of the add result.
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref,
				     u32 defrtr_usr_metric,
				     int lifetime)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= defrtr_usr_metric,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
		.fc_expires = jiffies_to_clock_t(lifetime * HZ),
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		/* Remember that this table holds a default router so that
		 * rt6_purge_dflt_routers() visits it.
		 */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}

/* Delete the RA-learned default/addrconf routes of @table, except those on
 * devices whose accept_ra setting is 2, then clear the table's
 * RT6_TABLE_HAS_DFLT_ROUTER flag.
 *
 * The RCU read lock is dropped around each deletion and the walk is
 * restarted from the beginning afterwards.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ?
__in6_dev_get(dev) : NULL;

		/* The hold is the last operand so a reference is only taken
		 * on entries that are actually going to be deleted.
		 */
		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			ip6_del_rt(net, rt, false);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

/* Purge default routers from every FIB table of @net that is flagged with
 * RT6_TABLE_HAS_DFLT_ROUTER.
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(net, table);
		}
	}

	rcu_read_unlock();
}

/* Translate an ioctl-style struct in6_rtmsg into a struct fib6_config.
 *
 * *cfg is overwritten as a whole; members not listed in the compound
 * literal are zero. The table is the l3mdev table of rtmsg_ifindex when
 * that lookup yields a non-zero id, otherwise RT6_TABLE_MAIN.
 */
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	*cfg = (struct fib6_config){
		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN,
		.fc_ifindex = rtmsg->rtmsg_ifindex,
		.fc_metric = rtmsg->rtmsg_metric,
		.fc_expires = rtmsg->rtmsg_info,
		.fc_dst_len = rtmsg->rtmsg_dst_len,
		.fc_src_len = rtmsg->rtmsg_src_len,
		.fc_flags = rtmsg->rtmsg_flags,
		.fc_type = rtmsg->rtmsg_type,

		.fc_nlinfo.nl_net = net,

		.fc_dst = rtmsg->rtmsg_dst,
		.fc_src = rtmsg->rtmsg_src,
		.fc_gateway = rtmsg->rtmsg_gateway,
	};
}

/* Handle the SIOCADDRT / SIOCDELRT route ioctls.
 *
 * Returns -EINVAL for any other @cmd, -EPERM when the caller lacks
 * CAP_NET_ADMIN in the namespace's user namespace, otherwise the result of
 * ip6_route_add() or ip6_route_del().
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
{
	struct fib6_config cfg;
	int err;

	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
		return -EINVAL;
	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	rtmsg_to_fib6_config(net, rtmsg, &cfg);

	/* Only the two commands admitted above can reach this switch, so
	 * err is assigned on every path.
	 */
	switch (cmd) {
	case SIOCADDRT:
		/* Only do the default setting of fc_metric in route adding */
		if (cfg.fc_metric == 0)
			cfg.fc_metric = IP6_RT_PRIO_USER;
		err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
		break;
	case SIOCDELRT:
		err = ip6_route_del(&cfg, NULL);
		break;
	}

	return err;
}

/*
 *	Drop the packet on the floor
 */

/* Common drop path: bump the statistic selected by @ipstats_mib_noroutes,
 * send an ICMPv6 destination-unreachable with @code and free @skb with the
 * chosen drop reason. Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev(dst);
	struct net *net = dev_net(dev);
struct inet6_dev *idev;
	SKB_DR(reason);
	int type;

	/* For an l3mdev master or the loopback device, account against the
	 * interface recorded in the packet's control block instead of the
	 * dst's device.
	 */
	if (netif_is_l3_master(skb->dev) ||
	    dev == net->loopback_dev)
		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	else
		idev = ip6_dst_idev(dst);

	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		/* An unspecified destination counts as an address error. */
		if (type == IPV6_ADDR_ANY) {
			SKB_DR_SET(reason, IP_INADDRERRORS);
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
			break;
		}
		SKB_DR_SET(reason, IP_INNOROUTES);
		fallthrough;
	case IPSTATS_MIB_OUTNOROUTES:
		/* NOTE(review): SKB_DR_OR presumably only sets the reason if
		 * none was chosen above - confirm against its definition.
		 */
		SKB_DR_OR(reason, IP_OUTNOROUTES);
		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
		break;
	}

	/* Start over by dropping the dst for l3mdev case */
	if (netif_is_l3_master(skb->dev))
		skb_dst_drop(skb);

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb_reason(skb, reason);
	return 0;
}

/* Input-path drop: ICMPV6_NOROUTE, counted as IPSTATS_MIB_INNOROUTES. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

/* Output-path drop: ICMPV6_NOROUTE, counted as IPSTATS_MIB_OUTNOROUTES.
 * skb->dev is first pointed at the dst's device; @net and @sk are not used
 * by the body.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst_dev(skb);
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

/* Input-path drop: ICMPV6_ADM_PROHIBITED, counted as
 * IPSTATS_MIB_INNOROUTES.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

/* Output-path drop: ICMPV6_ADM_PROHIBITED, counted as
 * IPSTATS_MIB_OUTNOROUTES. skb->dev is first pointed at the dst's device;
 * @net and @sk are not used by the body.
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst_dev(skb);
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

/* Build the /128 fib6_info for the local or anycast address @addr on
 * @idev's device.
 *
 * Returns the new entry, or an ERR_PTR() when either creation step fails.
 */
struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags,
				     struct netlink_ext_ack *extack)
{
	struct fib6_config cfg = {
		.fc_table = l3mdev_fib_table(idev->dev) ?
: RT6_TABLE_LOCAL, .fc_ifindex = idev->dev->ifindex, .fc_flags = RTF_UP | RTF_NONEXTHOP, .fc_dst = *addr, .fc_dst_len = 128, .fc_protocol = RTPROT_KERNEL, .fc_nlinfo.nl_net = net, .fc_ignore_dev_down = true, }; struct fib6_info *f6i; int err; if (anycast) { cfg.fc_type = RTN_ANYCAST; cfg.fc_flags |= RTF_ANYCAST; } else { cfg.fc_type = RTN_LOCAL; cfg.fc_flags |= RTF_LOCAL; } f6i = ip6_route_info_create(&cfg, gfp_flags, extack); if (IS_ERR(f6i)) return f6i; err = ip6_route_info_create_nh(f6i, &cfg, gfp_flags, extack); if (err) return ERR_PTR(err); f6i->dst_nocount = true; if (!anycast && (READ_ONCE(net->ipv6.devconf_all->disable_policy) || READ_ONCE(idev->cnf.disable_policy))) f6i->dst_nopolicy = true; return f6i; } /* remove deleted ip from prefsrc entries */ struct arg_dev_net_ip { struct net *net; struct in6_addr *addr; }; static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) { struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; if (!rt->nh && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) && !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->fib6_prefsrc.plen = 0; spin_unlock_bh(&rt6_exception_lock); } return 0; } void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) { struct net *net = dev_net(ifp->idev->dev); struct arg_dev_net_ip adni = { .net = net, .addr = &ifp->addr, }; fib6_clean_all(net, fib6_remove_prefsrc, &adni); } #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) /* Remove routers and update dst entries when gateway turn into host. 
*/ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; struct fib6_nh *nh; /* RA routes do not use nexthops */ if (rt->nh) return 0; nh = rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) return -1; /* Further clean up cached routes in exception table. * This is needed because cached route may have a different * gateway than its 'parent' in the case of an ip redirect. */ fib6_nh_exceptions_clean_tohost(nh, gateway); return 0; } void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) { fib6_clean_all(net, fib6_clean_tohost, gateway); } struct arg_netdev_event { const struct net_device *dev; union { unsigned char nh_flags; unsigned long event; }; }; static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) { struct fib6_info *iter; struct fib6_node *fn; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); iter = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); } return NULL; } /* only called for fib entries with builtin fib6_nh */ static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) return true; return false; } static int rt6_multipath_total_weight(const struct fib6_info *rt) { struct fib6_info *iter; int total = 0; if (!rt6_is_dead(rt)) total += rt->fib6_nh->fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) total += iter->fib6_nh->fib_nh_weight; } return total; } static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int 
total) { int upper_bound = -1; if (!rt6_is_dead(rt)) { *weight += rt->fib6_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) { struct fib6_info *iter; int weight = 0; rt6_upper_bound_set(rt, &weight, total); list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) rt6_upper_bound_set(iter, &weight, total); } void rt6_multipath_rebalance(struct fib6_info *rt) { struct fib6_info *first; int total; /* In case the entire multipath route was marked for flushing, * then there is no need to rebalance upon the removal of every * sibling route. */ if (!rt->fib6_nsiblings || rt->should_flush) return; /* During lookup routes are evaluated in order, so we need to * make sure upper bounds are assigned from the first sibling * onwards. */ first = rt6_multipath_first_sibling(rt); if (WARN_ON_ONCE(!first)) return; total = rt6_multipath_total_weight(first); rt6_multipath_upper_bound_set(first, total); } static int fib6_ifup(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); if (rt != net->ipv6.fib6_null_entry && !rt->nh && rt->fib6_nh->fib_nh_dev == arg->dev) { rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } return 0; } void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) { struct arg_netdev_event arg = { .dev = dev, { .nh_flags = nh_flags, }, }; if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) arg.nh_flags |= RTNH_F_LINKDOWN; fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } /* only called for fib entries with inline fib6_nh */ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { struct fib6_info *iter; if (rt->fib6_nh->fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if 
(iter->fib6_nh->fib_nh_dev == dev) return true; return false; } static void rt6_multipath_flush(struct fib6_info *rt) { struct fib6_info *iter; rt->should_flush = 1; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) iter->should_flush = 1; } static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, const struct net_device *down_dev) { struct fib6_info *iter; unsigned int dead = 0; if (rt->fib6_nh->fib_nh_dev == down_dev || rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == down_dev || iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; return dead; } static void rt6_multipath_nh_flags_set(struct fib6_info *rt, const struct net_device *dev, unsigned char nh_flags) { struct fib6_info *iter; if (rt->fib6_nh->fib_nh_dev == dev) rt->fib6_nh->fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == dev) iter->fib6_nh->fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); if (rt == net->ipv6.fib6_null_entry || rt->nh) return 0; switch (arg->event) { case NETDEV_UNREGISTER: return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) return rt->fib6_nh->fib_nh_dev == dev ? 
-1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			/* Every nexthop (rt plus its siblings) is dead or on
			 * the downed device: flush the whole route.
			 */
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			/* Otherwise only mark the affected nexthops and
			 * redistribute the weights over the remaining ones.
			 */
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		/* NOTE(review): the meaning of -2 is defined by the caller of
		 * this callback, which is not visible here - confirm.
		 */
		return -2;
	case NETDEV_CHANGE:
		/* Local and anycast routes are not marked link-down. */
		if (rt->fib6_nh->fib_nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}

/* Run fib6_ifdown() for @event on every route of @dev's namespace, using
 * the skip-notify variant of the walker when the
 * skip_notify_on_dev_down sysctl is set.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};
	struct net *net = dev_net(dev);

	if (READ_ONCE(net->ipv6.sysctl.skip_notify_on_dev_down))
		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
	else
		fib6_clean_all(net, fib6_ifdown, &arg);
}

/* Tear down IPv6 routing state for @dev: sync routes for @event, flush the
 * uncached dst list for the device and take down its neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev);
	neigh_ifdown(&nd_tbl, dev);
}

/* Cookie for the MTU-change walk: the device, its new MTU and the route
 * currently being visited (set by rt6_mtu_change_route()).
 */
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
	struct fib6_info *f6i;
};

/* Apply a device MTU change to one nexthop of arg->f6i. Always returns 0. */
static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
	struct fib6_info *f6i = arg->f6i;

	/* For administrative MTU increase, there is no way to discover
	 * IPv6 PMTU increase, so PMTU increase should be updated here.
	 * Since RFC 1981 doesn't include administrative MTU increase
	 * update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (nh->fib_nh_dev == arg->dev) {
		struct inet6_dev *idev = __in6_dev_get(arg->dev);
		u32 mtu = f6i->fib6_pmtu;

		/* Update the route metric when the new MTU is not larger
		 * than the current PMTU, or when the current PMTU equals the
		 * device's previous mtu6 value.
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(f6i, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}

	return 0;
}

/* fib6_clean_all() callback: propagate the MTU change in @p_arg to @f6i,
 * unless the device has no inet6_dev or the route's MTU metric is locked.
 */
static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	if (fib6_metric_locked(f6i, RTAX_MTU))
		return 0;

	/* Let the per-nexthop helper know which route it is working on. */
	arg->f6i = f6i;
	if (f6i->nh) {
		/* fib6_nh_mtu_change only returns 0, so this is safe */
		return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
						arg);
	}

	return fib6_nh_mtu_change(f6i->fib6_nh, arg);
}

/* Propagate a new device MTU to all routes in @dev's namespace. */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}

/* Netlink attribute policy used when parsing IPv6 route messages in
 * rtm_to_fib6_config().
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_UNSPEC]		= { .strict_start_type = RTA_DPORT + 1 },
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
	[RTA_NH_ID]		= { .type = NLA_U32 },
[RTA_FLOWLABEL]		= { .type = NLA_BE32 },
};

/* Validate the RTA_MULTIPATH payload stored in cfg->fc_mp.
 *
 * Fails with -EINVAL when there is no valid rtnexthop at all, when an
 * embedded RTA_GATEWAY is shorter than an IPv6 address, or - for
 * @newroute - when a nexthop id is set or a hop has no gateway. Otherwise
 * returns the result of lwtunnel_valid_encap_type_attr().
 */
static int rtm_to_fib6_multipath_config(struct fib6_config *cfg,
					struct netlink_ext_ack *extack,
					bool newroute)
{
	struct rtnexthop *rtnh;
	int remaining;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	if (!rtnh_ok(rtnh, remaining)) {
		NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops");
		return -EINVAL;
	}

	do {
		/* A route-level gateway counts for every hop. */
		bool has_gateway = cfg->fc_flags & RTF_GATEWAY;
		int attrlen = rtnh_attrlen(rtnh);

		if (attrlen > 0) {
			struct nlattr *nla, *attrs;

			attrs = rtnh_attrs(rtnh);
			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				if (nla_len(nla) < sizeof(cfg->fc_gateway)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid IPv6 address in RTA_GATEWAY");
					return -EINVAL;
				}

				has_gateway = true;
			}
		}

		if (newroute && (cfg->fc_nh_id || !has_gateway)) {
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			return -EINVAL;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} while (rtnh_ok(rtnh, remaining));

	return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len,
					      extack);
}

/* Parse an RTM_NEWROUTE / RTM_DELROUTE netlink message into *cfg.
 *
 * *cfg is fully overwritten once the early header checks pass. Returns 0
 * on success, the parser's error when attribute parsing fails, the error
 * of a failing sub-validation, or -EINVAL for the rejections coded here.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	bool newroute = nlh->nlmsg_type == RTM_NEWROUTE;
	struct nlattr *tb[RTA_MAX+1];
	struct rtmsg *rtm;
	unsigned int pref;
	int err;

	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
				     rtm_ipv6_policy, extack);
	if (err < 0)
		goto errout;

	/* Default error for every "goto errout" below that does not set
	 * err itself.
	 */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	if (rtm->rtm_tos) {
		NL_SET_ERR_MSG(extack,
			       "Invalid dsfield (tos): option not available for IPv6");
		goto errout;
	}

	if (tb[RTA_FLOWLABEL]) {
		NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL],
				    "Flow label cannot be specified for this operation");
		goto errout;
	}

	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	/* Derive route flags from the route type and message flags. */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_NH_ID]) {
		if (tb[RTA_GATEWAY]   || tb[RTA_OIF] ||
		    tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop specification and nexthop id are mutually exclusive");
			goto errout;
		}
		cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
	}

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		/* Number of bytes needed to hold rtm_dst_len prefix bits.
		 * NOTE(review): plen is not bounded by sizeof(cfg->fc_dst)
		 * here; presumably the prefix length is range-checked
		 * elsewhere before the config is used - confirm.
		 */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		/* Same byte-length derivation as for RTA_DST above. */
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the table id from the rtmsg header. */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = rtm_to_fib6_multipath_config(cfg, extack, newroute);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* Anything other than LOW or HIGH is treated as MEDIUM. */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		/* Only a finite timeout turns the route into an expiring
		 * one.
		 */
		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}

/* One pending nexthop of a multipath add: the route created for it, the
 * per-nexthop config it was built from, and the list linkage.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head list;
};

/* Append @rt (with a copy of @r_cfg) to @rt6_nh_list.
 *
 * Returns -EEXIST when the list already holds a route with a duplicate
 * nexthop, -ENOMEM when the list entry cannot be allocated, 0 otherwise.
 * No reference on @rt is taken or dropped here.
 */
static int ip6_route_info_append(struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, list) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return -EEXIST;
	}

	nh = kzalloc_obj(*nh);
	if (!nh)
		return -ENOMEM;

	nh->fib6_info = rt;
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->list, rt6_nh_list);

	return 0;
}

/* Send the RTM_NEWROUTE notification for a multipath add.
 *
 * @rt is the first route inserted and @rt_last the last one; for an append
 * the notification is sent for the first sibling of @rt_last instead.
 */
static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	rcu_read_lock();

	if ((nlflags & NLM_F_APPEND) && rt_last &&
	    READ_ONCE(rt_last->fib6_nsiblings)) {
		rt = list_first_or_null_rcu(&rt_last->fib6_siblings,
					    struct fib6_info,
					    fib6_siblings);
	}

	/* rt may be NULL here (none inserted, or empty sibling list). */
	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);

	rcu_read_unlock();
}

/* Decide whether in-kernel listeners should be told about @rt: true when
 * @rt is the leaf of its node, or when both @rt and the leaf qualify for
 * ECMP and share the same metric. False when @rt has no node or the node
 * has no leaf.
 */
static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
{
	bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
	bool should_notify = false;
	struct fib6_info *leaf;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = rcu_dereference(rt->fib6_node);
	if (!fn)
		goto out;

	leaf = rcu_dereference(fn->leaf);
	if (!leaf)
		goto out;

	if (rt == leaf ||
	    (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric &&
	     rt6_qualify_for_ecmp(leaf)))
		should_notify = true;
out:
	rcu_read_unlock();

	return should_notify;
}

/* Add a multipath route: one fib6_info is created per rtnexthop in
 * cfg->fc_mp, all are inserted, and a single notification is sent for the
 * whole route. On an insertion failure the routes already added are
 * deleted again. Returns 0 or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct rt6_nh *nh, *nh_safe;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	LIST_HEAD(rt6_nh_list);
	struct rt6_nh *err_nh;
	struct fib6_info *rt;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int replace;
	int nhn = 0;
	int err;

	err = fib6_config_validate(cfg, extack);
	if (err)
		return err;

	replace = (cfg->fc_nlinfo.nlh &&
		   (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));

	nlflags = replace ?
NLM_F_REPLACE : NLM_F_CREATE;

	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* Start every hop from the route-level config. */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}

			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		/* NOTE(review): rt is not released here on failure, so
		 * ip6_route_info_create_nh() presumably frees it itself -
		 * confirm.
		 */
		err = ip6_route_info_create_nh(rt, &r_cfg, GFP_KERNEL, extack);
		if (err) {
			rt = NULL;
			goto cleanup;
		}

		rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			/* Not on the list, so cleanup will not release it. */
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	/* For add and replace, send one notification with all nexthops. For
	 * append, send one notification with all appended nexthops.
	 */
	info->skip_notify_kernel = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, list) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);

		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			/* Remember where insertion stopped so the unwind
			 * only deletes what was actually added.
			 */
			err_nh = nh;
			goto add_errout;
		}
		/* save reference to last route successfully inserted */
		rt_last = nh->fib6_info;

		/* save reference to first route for notification */
		if (!rt_notif)
			rt_notif = nh->fib6_info;

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		if (cfg->fc_nlinfo.nlh) {
			cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
							     NLM_F_REPLACE);
			cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
		}
		nhn++;
	}

	/* An in-kernel notification should only be sent in case the new
	 * multipath route is added as the first route in the node, or if
	 * it was appended to it. We pass 'rt_notif' since it is the first
	 * sibling and might allow us to skip some checks in the replace case.
	 */
	if (ip6_route_mpath_should_notify(rt_notif)) {
		enum fib_event_type fib_event;

		/* More siblings than were just inserted means the new
		 * nexthops joined an existing route.
		 */
		if (rt_notif->fib6_nsiblings != nhn - 1)
			fib_event = FIB_EVENT_ENTRY_APPEND;
		else
			fib_event = FIB_EVENT_ENTRY_REPLACE;

		err = call_fib6_multipath_entry_notifiers(info->nl_net,
							  fib_event, rt_notif,
							  nhn - 1, extack);
		if (err) {
			/* Delete all the siblings that were just added */
			err_nh = NULL;
			goto add_errout;
		}
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, list) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* Drop the list's reference on every route and free the entries. */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) {
		fib6_info_release(nh->fib6_info);
		list_del(&nh->list);
		kfree(nh);
	}

	return err;
}

/* Delete each nexthop listed in cfg->fc_mp as an individual route.
 *
 * All hops are attempted even after a failure; the return value is the
 * error of the last failing ip6_route_del() call, or 0 when none failed.
 */
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int last_err = 0;
	int remaining;
	int attrlen;
	int err;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}

		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

/* RTM_DELROUTE handler.
 *
 * Rejects a nexthop id that does not exist with -EINVAL, then dispatches
 * to the multipath or single-route delete path. For the single-route path
 * fc_delete_all_nh is set first.
 */
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_nh_id) {
		rcu_read_lock();
		/* err is used as a boolean "not found" flag here. */
		err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id);
		rcu_read_unlock();

		if (err) {
			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
			return -EINVAL;
		}
	}

	if (cfg.fc_mp) {
		return ip6_route_multipath_del(&cfg, extack);
	} else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

/* RTM_NEWROUTE handler: parse the request, default a zero metric to
 * IP6_RT_PRIO_USER and add the route through the multipath or the
 * single-route path.
 */
static int inet6_rtm_newroute(struct sk_buff *skb, struct
nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	/* a metric of 0 in the request selects the user default priority */
	if (cfg.fc_metric == 0)
		cfg.fc_metric = IP6_RT_PRIO_USER;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}

/* add the overhead of this fib6_nh to nexthop_len
 *
 * @arg points to the running int total.  Always returns 0, which lets it be
 * passed as the callback to nexthop_for_each_fib6_nh() as well as being
 * called directly.
 */
static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
{
	int *nexthop_len = arg;

	*nexthop_len += nla_total_size(0)	 /* RTA_MULTIPATH */
		     + NLA_ALIGN(sizeof(struct rtnexthop))
		     + nla_total_size(16); /* RTA_GATEWAY */

	if (nh->fib_nh_lws) {
		/* NOTE(review): the two labels below look swapped relative
		 * to the sizes they annotate (a 2-byte payload reads like
		 * the encap type) - confirm.
		 */
		/* RTA_ENCAP_TYPE */
		*nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
		/* RTA_ENCAP */
		*nexthop_len += nla_total_size(2);
	}

	return 0;
}

/*
 * Compute the buffer size to allocate for a route message describing @f6i.
 *
 * The result is a fixed sum of per-attribute sizes plus a nexthop-dependent
 * part: for a route using a nexthop object, RTA_NH_ID plus the overhead of
 * every fib6_nh in it; otherwise the overhead of the route's own nexthop and
 * of each sibling (only when fib6_nsiblings is non-zero) plus the encap size
 * of the route's own nexthop.
 */
static size_t rt6_nlmsg_size(struct fib6_info *f6i)
{
	struct fib6_info *sibling;
	struct fib6_nh *nh;
	int nexthop_len;

	if (f6i->nh) {
		nexthop_len = nla_total_size(4); /* RTA_NH_ID */
		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
					 &nexthop_len);
		goto common;
	}

	rcu_read_lock();
retry:
	nh = f6i->fib6_nh;
	nexthop_len = 0;
	if (READ_ONCE(f6i->fib6_nsiblings)) {
		rt6_nh_nlmsg_size(nh, &nexthop_len);

		list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
					fib6_siblings) {
			rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
			/* the sibling count dropped to zero while walking:
			 * discard the partial total and start over
			 */
			if (!READ_ONCE(f6i->fib6_nsiblings))
				goto retry;
		}
	}
	rcu_read_unlock();
	nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
common:
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + nexthop_len;
}

/*
 * Append the nexthop description of nexthop object @nh to @skb.
 *
 * A multipath nexthop is written as an RTA_MULTIPATH nest filled by
 * nexthop_mpath_fill_node(); any other nexthop is written through
 * fib_nexthop_info() on its fib6_nh, which receives @flags.
 *
 * Returns 0 on success, -EMSGSIZE on any failure.
 */
static int rt6_fill_node_nexthop(struct
sk_buff *skb, struct nexthop *nh,
				 unsigned char *flags)
{
	if (nexthop_is_multipath(nh)) {
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
			goto nla_put_failure;

		nla_nest_end(skb, mp);
	} else {
		struct fib6_nh *fib6_nh;

		fib6_nh = nexthop_fib6_nh(nh);
		if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
				     flags, false) < 0)
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/*
 * Build one route netlink message of type @type in @skb.
 *
 * @rt is the FIB entry being described.  When dst_rt6_info(@dst) yields a
 * non-NULL rt6_info, the destination/source keys and the route flags are
 * taken from it instead of from @rt.  @dest and @src, when non-NULL, are
 * emitted as full 128-bit RTA_DST / RTA_SRC addresses.  @iif, @portid, @seq
 * and @flags are passed through to the message as shown below.
 *
 * Returns 0 on success and -EMSGSIZE when the message could not be built.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = dst_rt6_info(dst);
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	unsigned char nh_flags = 0;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* pick the key/flag source: the rt6_info when there is one,
	 * otherwise the fib6_info itself
	 */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* table ids of 256 and above are reported as RT_TABLE_COMPAT in the
	 * header field
	 */
	rtm->rtm_table = table < 256 ?
table : RT_TABLE_COMPAT;
	/* the full table id is always carried in RTA_TABLE */
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit @dest overrides the route's own destination prefix */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			/* a zero result returns immediately, before
			 * nlmsg_end() below is reached
			 */
			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;

		/* RTA_PREFSRC is only added when a source address could be
		 * selected for @dest
		 */
		if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;

		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* metrics come from the dst when one was supplied */
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		struct net_device *dev;

		/* an rt6_info carries its own gateway, device and encap */
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		dev = dst_dev(dst);
		if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
			goto nla_put_failure;

		if (lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
			goto nla_put_failure;
	} else if (READ_ONCE(rt->fib6_nsiblings)) {
		struct fib6_info *sibling;
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		/* the route's own nexthop goes first, then every sibling */
		if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
				    rt->fib6_nh->fib_nh_weight, AF_INET6,
				    0) < 0)
			goto nla_put_failure;

		rcu_read_lock();

		list_for_each_entry_rcu(sibling, &rt->fib6_siblings,
					fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
					    sibling->fib6_nh->fib_nh_weight,
					    AF_INET6, 0) < 0) {
				rcu_read_unlock();

				goto nla_put_failure;
			}
		}

		rcu_read_unlock();

		nla_nest_end(skb, mp);
	} else if (rt->nh) {
		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
			goto nla_put_failure;

		if (nexthop_is_blackhole(rt->nh))
			rtm->rtm_type = RTN_BLACKHOLE;

		/* the expanded nexthop attributes are only emitted when the
		 * nexthop compat sysctl is set
		 */
		if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) &&
		    rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	} else {
		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	/* expiry is reported relative to the current jiffies value */
	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? READ_ONCE(dst->expires) : rt->expires;
		expires -= jiffies;
	}

	/* hardware offload state is only reported for FIB entries, i.e.
	 * when no dst was supplied
	 */
	if (!dst) {
		if (READ_ONCE(rt->offload))
			rtm->rtm_flags |= RTM_F_OFFLOAD;
		if (READ_ONCE(rt->trap))
			rtm->rtm_flags |= RTM_F_TRAP;
		if (READ_ONCE(rt->offload_failed))
			rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ?
dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	/* drop the partially built message from the skb */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
 * Callback for nexthop_for_each_fib6_nh(): returns 1 when @nh uses the
 * net_device passed in @arg, 0 otherwise.
 */
static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
{
	const struct net_device *dev = arg;

	if (nh->fib_nh_dev == dev)
		return 1;

	return 0;
}

/*
 * Report whether any nexthop of @f6i uses @dev.
 *
 * For a route backed by a nexthop object every fib6_nh of that object is
 * checked; otherwise the route's own nexthop and, when fib6_nsiblings is
 * non-zero, each sibling's nexthop are checked.
 */
static bool fib6_info_uses_dev(const struct fib6_info *f6i,
			       const struct net_device *dev)
{
	if (f6i->nh) {
		/* the iterator callback takes a non-const void pointer */
		struct net_device *_dev = (struct net_device *)dev;

		return !!nexthop_for_each_fib6_nh(f6i->nh,
						  fib6_info_nh_uses_dev,
						  _dev);
	}

	if (f6i->fib6_nh->fib_nh_dev == dev)
		return true;

	if (READ_ONCE(f6i->fib6_nsiblings)) {
		const struct fib6_info *sibling;

		rcu_read_lock();
		list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
					fib6_siblings) {
			if (sibling->fib6_nh->fib_nh_dev == dev) {
				rcu_read_unlock();
				return true;
			}
			/* stop walking once the sibling count reads zero */
			if (!READ_ONCE(f6i->fib6_nsiblings))
				break;
		}
		rcu_read_unlock();
	}
	return false;
}

/* State shared between rt6_dump_route() and rt6_nh_dump_exceptions() */
struct fib6_nh_exception_dump_walker {
	struct rt6_rtnl_dump_arg *dump;	/* dump context (net, skb, cb) */
	struct fib6_info *rt;		/* route whose exceptions are dumped */
	unsigned int flags;		/* netlink message flags to use */
	unsigned int skip;		/* entries still to be skipped */
	unsigned int count;		/* entries handled so far */
};

/*
 * Dump the exception entries of one fib6_nh; @arg is a
 * struct fib6_nh_exception_dump_walker.
 *
 * Returns 0 when the nexthop has no exception bucket or all buckets were
 * walked, otherwise the non-zero result of rt6_fill_node().
 */
static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
{
	struct fib6_nh_exception_dump_walker *w = arg;
	struct rt6_rtnl_dump_arg *dump = w->dump;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i, err;

	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
	if (!bucket)
		return 0;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			if (w->skip) {
				w->skip--;
				continue;
			}

			/* Expiration of entries doesn't bump sernum, insertion
			 * does. Removal is triggered by insertion, so we can
			 * rely on the fact that if entries change between two
			 * partial dumps, this node is scanned again completely,
			 * see rt6_insert_exception() and fib6_dump_table().
			 *
			 * Count expired entries we go through as handled
			 * entries that we'll skip next time, in case of partial
			 * node dump. Otherwise, if entries expire meanwhile,
			 * we'll skip the wrong amount.
			 */
			if (rt6_check_expired(rt6_ex->rt6i)) {
				w->count++;
				continue;
			}

			err = rt6_fill_node(dump->net, dump->skb, w->rt,
					    &rt6_ex->rt6i->dst, NULL, NULL, 0,
					    RTM_NEWROUTE,
					    NETLINK_CB(dump->cb->skb).portid,
					    dump->cb->nlh->nlmsg_seq, w->flags);
			if (err)
				return err;

			w->count++;
		}
		bucket++;
	}

	return 0;
}

/* Return -1 if done with node, number of handled routes on partial dump
 *
 * @p_arg is a struct rt6_rtnl_dump_arg.  @skip is the number of entries to
 * pass over before emitting; when routes are being dumped the route itself
 * takes the first skip slot and the remainder applies to its exceptions.
 * Routes rejected by the dump filter count as done (-1).
 */
int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;
	int count = 0;

	if (rt == net->ipv6.fib6_null_entry)
		return -1;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return -1;
	}
	/* type, device and protocol filters: any mismatch skips the route */
	if (filter->filter_set &&
	    ((filter->rt_type  && rt->fib6_type != filter->rt_type) ||
	     (filter->dev      && !fib6_info_uses_dev(rt, filter->dev)) ||
	     (filter->protocol && rt->fib6_protocol != filter->protocol))) {
		return -1;
	}

	/* flag the dump as filtered whenever it is not a complete dump of
	 * both routes and exceptions
	 */
	if (filter->filter_set ||
	    !filter->dump_routes || !filter->dump_exceptions) {
		flags |= NLM_F_DUMP_FILTERED;
	}

	if (filter->dump_routes) {
		if (skip) {
			skip--;
		} else {
			/* a fill failure here reports 0 handled entries */
			if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
					  0, RTM_NEWROUTE,
					  NETLINK_CB(arg->cb->skb).portid,
					  arg->cb->nlh->nlmsg_seq, flags)) {
				return 0;
			}
			count++;
		}
	}

	if (filter->dump_exceptions) {
		struct fib6_nh_exception_dump_walker w = { .dump = arg,
							   .rt = rt,
							   .flags = flags,
							   .skip = skip,
							   .count = 0 };
		int err;

		rcu_read_lock();
		if (rt->nh) {
			err = nexthop_for_each_fib6_nh(rt->nh,
						       rt6_nh_dump_exceptions,
						       &w);
		} else {
			err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
		}
		rcu_read_unlock();

		/* partial dump: report how many entries were handled */
		if (err)
			return count + w.count;
	}

	return -1;
}

/*
 * Validate a get-route request and parse its attributes into @tb.
 *
 * Without strict checking on the socket only the header size is verified
 * and the attributes are parsed with the deprecated (lenient) parser.  With
 * strict checking the header fields, flags, attribute set and flow label are
 * all verified as shown below.
 *
 * Returns 0 on success, -EINVAL or the parser's error otherwise.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	rtm =
nlmsg_payload(nlh, sizeof(*rtm));
	if (!rtm) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	/* lenient path: no further header or attribute checks */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv6_policy, extack);

	/* prefix lengths must be 0 or 128; the remaining header fields
	 * must be zero
	 */
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	/* RTM_F_FIB_MATCH is the only flag accepted */
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv6_policy, extack);
	if (err)
		return err;

	/* an address attribute requires the matching non-zero length */
	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	if (tb[RTA_FLOWLABEL] &&
	    (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) {
		NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL],
				    "Invalid flow label");
		return -EINVAL;
	}

	/* reject every attribute that is not in the supported set */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
		case RTA_FLOWLABEL:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}

/*
 * Netlink "doit" handler for get-route requests.
 *
 * Validates the request, builds a flowi6 from its attributes, performs an
 * input lookup (when RTA_IIF is given) or an output lookup, and unicasts a
 * RTM_NEWROUTE reply describing the result back to the requester.  With
 * RTM_F_FIB_MATCH the reply describes the matching FIB entry rather than
 * the looked-up dst.
 *
 * Returns 0 or the result of rtnl_unicast() on success, a negative error
 * otherwise.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	__be32 flowlabel;
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	/* default error for the attribute length checks that follow */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fibmatch = !!(rtm->rtm_flags &
RTM_F_FIB_MATCH);

	/* address attributes shorter than an in6_addr fail with the
	 * -EINVAL preset by the caller-side code above this point
	 */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	/* without RTA_UID: input lookups use INVALID_UID, output lookups
	 * use the caller's uid
	 */
	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel);

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		/* input lookup: the device is only valid under RCU, so the
		 * lookup happens inside the same read-side section
		 */
		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = dst_rt6_info(dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* NOTE(review): this branch is only reachable when dst.error is 0,
	 * in which case err is set to 0 as well - confirm it is intended.
	 */
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* the dst is attached to the reply skb from here on */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (from) {
		/* fibmatch describes the FIB entry only; otherwise the
		 * looked-up dst and the request addresses are included
		 */
		if (fibmatch)
			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
					    iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
		else
			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
					    &fl6.saddr, iif, RTM_NEWROUTE,
NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
	} else {
		/* the route no longer points at a FIB entry */
		err = -ENETUNREACH;
	}
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

/*
 * Send a route notification for @rt to the RTNLGRP_IPV6_ROUTE group.
 *
 * @event is used as the netlink message type and @nlm_flags as its flags;
 * the port id and (when present) the sequence number come from @info.  The
 * message is sized with rt6_nlmsg_size() and built with rt6_fill_node();
 * when filling fails with -EMSGSIZE the buffer is enlarged and the build is
 * retried.  Any other failure is reported through rtnl_set_sk_err().
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct net *net = info->nl_net;
	struct sk_buff *skb;
	size_t sz;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	rcu_read_lock();
	sz = rt6_nlmsg_size(rt);
retry:
	/* NOTE(review): when this allocation fails on a retry pass, err
	 * still holds -EMSGSIZE from the previous fill rather than
	 * -ENOBUFS - confirm whether that is intended.
	 */
	skb = nlmsg_new(sz, GFP_ATOMIC);
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		kfree_skb(skb);
		/* -EMSGSIZE implies needed space grew under us. */
		if (err == -EMSGSIZE) {
			/* grow to the fresh estimate or double, whichever
			 * is larger
			 */
			sz = max(rt6_nlmsg_size(rt), sz << 1);
			goto retry;
		}
		goto errout;
	}

	rcu_read_unlock();

	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, GFP_ATOMIC);
	return;
errout:
	rcu_read_unlock();
	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

/*
 * Send a RTM_NEWROUTE notification with NLM_F_REPLACE for @rt to the
 * RTNLGRP_IPV6_ROUTE group.  Unlike inet6_rt_notify() there is no resize
 * and retry: an -EMSGSIZE from rt6_fill_node() triggers a WARN_ON().
 */
void fib6_rt_update(struct net *net, struct fib6_info *rt,
		    struct nl_info *info)
{
	u32 seq = info->nlh ?
info->nlh->nlmsg_seq : 0;
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

/*
 * Update the hardware-offload state flags of @f6i and, depending on the
 * fib_notify_on_flag_change sysctl, notify the RTNLGRP_IPV6_ROUTE group.
 *
 * Nothing happens when all three flags already have the requested values.
 * Sysctl value 0 suppresses the notification, value 2 limits it to changes
 * of @offload_failed.  No notification is sent for a route that is no longer
 * attached to a FIB node.
 */
void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
			    bool offload, bool trap, bool offload_failed)
{
	u8 fib_notify_on_flag_change;
	struct sk_buff *skb;
	int err;

	if (READ_ONCE(f6i->offload) == offload &&
	    READ_ONCE(f6i->trap) == trap &&
	    READ_ONCE(f6i->offload_failed) == offload_failed)
		return;

	WRITE_ONCE(f6i->offload, offload);
	WRITE_ONCE(f6i->trap, trap);

	fib_notify_on_flag_change = READ_ONCE(net->ipv6.sysctl.fib_notify_on_flag_change);
	/* 2 means send notifications only if offload_failed was changed. */
	if (fib_notify_on_flag_change == 2 &&
	    READ_ONCE(f6i->offload_failed) == offload_failed)
		return;

	WRITE_ONCE(f6i->offload_failed, offload_failed);

	if (!rcu_access_pointer(f6i->fib6_node))
		/* The route was removed from the tree, do not send
		 * notification.
		 */
		return;

	if (!fib_notify_on_flag_change)
		return;

	skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0,
			    0, 0);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL);
	return;

errout:
	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
EXPORT_SYMBOL(fib6_info_hw_flags_set);

/*
 * Netdevice notifier callback.  Only loopback devices are handled: on
 * NETDEV_REGISTER the namespace's special route entries are pointed at the
 * device and take an inet6_dev reference; on NETDEV_UNREGISTER those
 * references are dropped and cleared.  Always returns NOTIFY_OK.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
/*
 * seq_file show callback: prints the namespace's seven IPv6 routing
 * statistics as 4-digit (minimum) hexadecimal fields on one line.
 * seq->private is the struct net.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

/*
 * Write-only sysctl handler: stores the written integer through
 * proc_dointvec() and then runs fib6_run_gc() for the namespace found in
 * ctl->extra1, using the stored flush_delay.  Reads fail with -EINVAL.
 */
static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write,
			      void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;
	if (!write)
		return -EINVAL;

	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	net = (struct net *)ctl->extra1;
	delay = READ_ONCE(net->ipv6.sysctl.flush_delay);
	/* a non-positive delay is passed on as 0 with the last argument
	 * false; a positive delay is passed through with it true
	 */
	fib6_run_gc(delay <= 0 ?
0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

/*
 * Template for the per-namespace IPv6 route sysctl table.  The .data
 * pointers here refer to init_net (or the dst ops template);
 * ipv6_route_sysctl_init() rewrites them by array index for each namespace,
 * so the order of the entries must stay in step with that function.
 */
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* write-only: a write triggers a garbage-collection run */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, with a different
		 * conversion handler
		 */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(u8),
		.mode		=	0644,
		.proc_handler	=	proc_dou8vec_minmax,
		.extra1		=	SYSCTL_ZERO,
		.extra2		=	SYSCTL_ONE,
	},
};

/*
 * Allocate the route sysctl table for @net: a copy of
 * ipv6_route_table_template with every .data pointer redirected to the
 * namespace's own variables.  Returns the table, or NULL when the
 * allocation fails.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table =
kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		/* indexes follow the entry order of the template */
		table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.flush_delay;
		/* the flush handler reads its namespace from extra1 */
		table[2].extra1 = net;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
	}

	return table;
}

/*
 * Number of entries of the route sysctl table to use for @net: the whole
 * template for namespaces owned by init_user_ns, 1 otherwise.
 */
size_t ipv6_route_sysctl_table_size(struct net *net)
{
	/* Don't export sysctls to unprivileged users */
	if (net->user_ns != &init_user_ns)
		return 1;

	return ARRAY_SIZE(ipv6_route_table_template);
}
#endif

/*
 * Per-namespace init: set up the namespace's dst ops, allocate its special
 * route entries as copies of the templates, and set the route sysctl
 * defaults.
 *
 * Returns 0 on success or -ENOMEM; on failure everything allocated so far
 * is released through the unwind labels at the end of the function.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;
	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
	       sizeof(*net->ipv6.fib6_null_entry));

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	/* the copy must use this namespace's dst ops, not the template's */
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto
out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
#ifdef CONFIG_IPV6_SUBTREES
	net->ipv6.fib6_routes_require_src = 0;
#endif
#endif

	/* per-namespace defaults for the route sysctls */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = INT_MAX;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);

	ret = 0;
out:
	return ret;

	/* error unwinding: each label releases one resource and falls
	 * through to the labels for the resources acquired before it
	 */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

/*
 * Per-namespace exit: free the special route entries allocated by
 * ip6_route_net_init() and destroy the namespace's dst entry accounting.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

/*
 * Late per-namespace init: with CONFIG_PROC_FS, create the "ipv6_route" and
 * "rt6_stats" entries under the namespace's proc directory.  Returns 0 on
 * success or -ENOMEM; a failure of the second entry removes the first.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	if (!proc_create_net("ipv6_route", 0, net->proc_net,
			     &ipv6_route_seq_ops,
			     sizeof(struct ipv6_route_iter)))
		return
-ENOMEM;

	if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
				    rt6_stats_seq_show, NULL)) {
		/* undo the first entry so nothing is left half-registered */
		remove_proc_entry("ipv6_route", net->proc_net);
		return -ENOMEM;
	}
#endif
	return 0;
}

/* Late per-namespace exit: remove the proc entries created at late init. */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

/* Per-namespace setup/teardown of the special route entries and sysctl
 * defaults.
 */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

/*
 * Allocate and initialise the namespace's inet_peer_base and publish it in
 * net->ipv6.peers.  Returns 0 or -ENOMEM.
 */
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc_obj(*bp);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

/*
 * Detach the namespace's inet_peer_base, invalidate its tree and free it.
 * The pointer in the namespace is cleared before the tree is torn down.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

/* Per-namespace setup/teardown of the IPv6 inet peer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

/* Per-namespace setup/teardown of the route proc entries. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

/* Netdevice notifier for loopback register/unregister events; its priority
 * is set 10 below ADDRCONF_NOTIFY_PRIORITY.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

/*
 * Point init_net's special route entries at init_net's loopback device and
 * take the corresponding inet6_dev references by hand; see the comment in
 * the body for why the notifier does not do this for init_net.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
} #if IS_BUILTIN(CONFIG_IPV6) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) BTF_ID_LIST_SINGLE(btf_fib6_info_id, struct, fib6_info) static const struct bpf_iter_seq_info ipv6_route_seq_info = { .seq_ops = &ipv6_route_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), }; static struct bpf_iter_reg ipv6_route_reg_info = { .target = "ipv6_route", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__ipv6_route, rt), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &ipv6_route_seq_info, }; static int __init bpf_iter_register(void) { ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; return bpf_iter_reg_target(&ipv6_route_reg_info); } static void bpf_iter_unregister(void) { bpf_iter_unreg_target(&ipv6_route_reg_info); } #endif #endif static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE, .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE, .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; int __init ip6_route_init(void) { int ret; int cpu; ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!ip6_dst_ops_template.kmem_cachep) goto out; ret = dst_entries_init(&ip6_dst_blackhole_ops); if (ret) goto out_kmem_cache; ret = register_pernet_subsys(&ipv6_inetpeer_ops); if (ret) goto out_dst_entries; ret = register_pernet_subsys(&ip6_route_net_ops); if (ret) goto out_register_inetpeer; ip6_dst_blackhole_ops.kmem_cachep = 
ip6_dst_ops_template.kmem_cachep; ret = fib6_init(); if (ret) goto out_register_subsys; ret = xfrm6_init(); if (ret) goto out_fib6_init; ret = fib6_rules_init(); if (ret) goto xfrm6_init; ret = register_pernet_subsys(&ip6_route_net_late_ops); if (ret) goto fib6_rules_init; ret = rtnl_register_many(ip6_route_rtnl_msg_handlers); if (ret < 0) goto out_register_late_subsys; ret = register_netdevice_notifier(&ip6_route_dev_notifier); if (ret) goto out_register_late_subsys; #if IS_BUILTIN(CONFIG_IPV6) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) ret = bpf_iter_register(); if (ret) goto out_register_late_subsys; #endif #endif for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); spin_lock_init(&ul->lock); } out: return ret; out_register_late_subsys: rtnl_unregister_all(PF_INET6); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_init: fib6_rules_cleanup(); xfrm6_init: xfrm6_fini(); out_fib6_init: fib6_gc_cleanup(); out_register_subsys: unregister_pernet_subsys(&ip6_route_net_ops); out_register_inetpeer: unregister_pernet_subsys(&ipv6_inetpeer_ops); out_dst_entries: dst_entries_destroy(&ip6_dst_blackhole_ops); out_kmem_cache: kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); goto out; } void ip6_route_cleanup(void) { #if IS_BUILTIN(CONFIG_IPV6) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_unregister(); #endif #endif unregister_netdevice_notifier(&ip6_route_dev_notifier); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_cleanup(); xfrm6_fini(); fib6_gc_cleanup(); unregister_pernet_subsys(&ipv6_inetpeer_ops); unregister_pernet_subsys(&ip6_route_net_ops); dst_entries_destroy(&ip6_dst_blackhole_ops); kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); }
140 15 203 46 14 7 14 21 13 8 249 237 44 37 36 71 416 384 33 260 222 148 60 11 12 197 22 11 11 57 4 15 9 16 16 20 12 36 36 36 2 13 13 13 1 28 22 14 4 24 11 2 2 2 3 165 125 125 22 22 3 16 9 16 16 8 4 22 22 3 2 22 22 21 3 3 24 22 22 26 1 1 24 24 23 24 24 5 24 2 24 6 1 1 5 5 5 5 5 2 3 11 11 8 11 11 11 11 11 11 11 9 2 1 4 4 4 4 2 2 31 8 20 3 20 6 2 16 5 10 19 21 21 5 16 9 4 3 5 17 4 14 7 21 15 14 5 5 1 4 5 5 1 4 1 4 5 5 9 12 12 10 13 2 2 12 10 6 4 10 13 13 13 13 13 1 10 13 13 10 156 2 2410 2407 381 379 381 256 156 379 4 4 2 3 3 3 11 3 4 4 5 4 5 4 5 4 64 64 102 4 98 1 4 8 10 11 2 52 10 2 22 60 55 7 48 21 5 15 5 1 1 26 7 4 1 1 1 2 2 2 1 3 3 1 2 3 3 1 1 22 1 21 3 3 1 3 2 2 1 2 3 1 1 1 2 3 1 3 3 3 3 2 2 1 2 2 2 2 2 2 3 3 3 3 1 2 3 15 15 44 7 38 30 7 12 8 19 5 6 16 2 13 1 1 8 8 1 7 7 1 1 1 8 1 1 1 1 2 2 2 1 4 2 16 5 4 4 8 3 9 2 11 2 3 5 4 5 4 4 5 9 34 2 27 5 44 28 3 9 3 1 31 31 14 2 2 16 1 8 6 1 8 6 5 1 1 8 125 125 125 125 125 125 125 125 125 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 
291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 
791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 
1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 
1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 
2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 
2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 
2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 
3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 
3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 
4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 
4432 4433 4434 4435 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS An implementation of the IP virtual server support for the * LINUX operating system. IPVS is now implemented as a module * over the NetFilter framework. IPVS can be used to build a * high-performance and highly available server based on a * cluster of servers. * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * Julian Anastasov <ja@ssi.bg> * * Changes: */ #define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/sysctl.h> #include <linux/proc_fs.h> #include <linux/workqueue.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/mutex.h> #include <net/net_namespace.h> #include <linux/nsproxy.h> #include <net/ip.h> #ifdef CONFIG_IP_VS_IPV6 #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif #include <net/route.h> #include <net/sock.h> #include <net/genetlink.h> #include <linux/uaccess.h> #include <net/ip_vs.h> MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); static struct lock_class_key __ipvs_service_key; /* sysctl variables */ #ifdef CONFIG_IP_VS_DEBUG static int sysctl_ip_vs_debug_level = 0; int ip_vs_get_debug_level(void) { return sysctl_ip_vs_debug_level; } #endif /* Protos */ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? 
*/ static bool __ip_vs_addr_is_local_v6(struct net *net, const struct in6_addr *addr) { struct flowi6 fl6 = { .daddr = *addr, }; struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); bool is_local; is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); dst_release(dst); return is_local; } #endif #ifdef CONFIG_SYSCTL /* * update_defense_level is called from keventd and from sysctl, * so it needs to protect itself from softirqs */ static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; int availmem; int amemthresh; int nomem; int to_change = -1; /* we only count free and buffered memory (in pages) */ si_meminfo(&i); availmem = i.freeram + i.bufferram; /* however in linux 2.5 the i.bufferram is total page cache size, we need adjust it */ /* si_swapinfo(&i); */ /* availmem = availmem - (i.totalswap - i.freeswap); */ amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0); nomem = (availmem < amemthresh); local_bh_disable(); /* drop_entry */ spin_lock(&ipvs->dropentry_lock); switch (ipvs->sysctl_drop_entry) { case 0: atomic_set(&ipvs->dropentry, 0); break; case 1: if (nomem) { atomic_set(&ipvs->dropentry, 1); ipvs->sysctl_drop_entry = 2; } else { atomic_set(&ipvs->dropentry, 0); } break; case 2: if (nomem) { atomic_set(&ipvs->dropentry, 1); } else { atomic_set(&ipvs->dropentry, 0); ipvs->sysctl_drop_entry = 1; } break; case 3: atomic_set(&ipvs->dropentry, 1); break; } spin_unlock(&ipvs->dropentry_lock); /* drop_packet */ spin_lock(&ipvs->droppacket_lock); switch (ipvs->sysctl_drop_packet) { case 0: ipvs->drop_rate = 0; break; case 1: if (nomem) { ipvs->drop_counter = amemthresh / (amemthresh - availmem); ipvs->drop_rate = ipvs->drop_counter; ipvs->sysctl_drop_packet = 2; } else { ipvs->drop_rate = 0; } break; case 2: if (nomem) { ipvs->drop_counter = amemthresh / (amemthresh - availmem); ipvs->drop_rate = ipvs->drop_counter; } else { ipvs->drop_rate = 0; ipvs->sysctl_drop_packet = 1; } break; case 3: ipvs->drop_rate = 
ipvs->sysctl_am_droprate; break; } spin_unlock(&ipvs->droppacket_lock); /* secure_tcp */ spin_lock(&ipvs->securetcp_lock); switch (ipvs->sysctl_secure_tcp) { case 0: if (ipvs->old_secure_tcp >= 2) to_change = 0; break; case 1: if (nomem) { if (ipvs->old_secure_tcp < 2) to_change = 1; ipvs->sysctl_secure_tcp = 2; } else { if (ipvs->old_secure_tcp >= 2) to_change = 0; } break; case 2: if (nomem) { if (ipvs->old_secure_tcp < 2) to_change = 1; } else { if (ipvs->old_secure_tcp >= 2) to_change = 0; ipvs->sysctl_secure_tcp = 1; } break; case 3: if (ipvs->old_secure_tcp < 2) to_change = 1; break; } ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(ipvs, ipvs->sysctl_secure_tcp > 1); spin_unlock(&ipvs->securetcp_lock); local_bh_enable(); } /* Handler for delayed work for expiring no * destination connections */ static void expire_nodest_conn_handler(struct work_struct *work) { struct netns_ipvs *ipvs; ipvs = container_of(work, struct netns_ipvs, expire_nodest_conn_work.work); ip_vs_expire_nodest_conn_flush(ipvs); } /* * Timer for checking the defense */ #define DEFENSE_TIMER_PERIOD 1*HZ static void defense_work_handler(struct work_struct *work) { struct netns_ipvs *ipvs = container_of(work, struct netns_ipvs, defense_work.work); update_defense_level(ipvs); if (atomic_read(&ipvs->dropentry)) ip_vs_random_dropentry(ipvs); queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); } #endif static void est_reload_work_handler(struct work_struct *work) { struct netns_ipvs *ipvs = container_of(work, struct netns_ipvs, est_reload_work.work); int genid_done = atomic_read(&ipvs->est_genid_done); unsigned long delay = HZ / 10; /* repeat startups after failure */ bool repeat = false; int genid; int id; mutex_lock(&ipvs->est_mutex); genid = atomic_read(&ipvs->est_genid); for (id = 0; id < ipvs->est_kt_count; id++) { struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; /* netns clean up started, abort delayed work */ 
if (!READ_ONCE(ipvs->enable)) goto unlock; if (!kd) continue; /* New config ? Stop kthread tasks */ if (genid != genid_done) ip_vs_est_kthread_stop(kd); if (!kd->task && !ip_vs_est_stopped(ipvs)) { /* Do not start kthreads above 0 in calc phase */ if ((!id || !ipvs->est_calc_phase) && ip_vs_est_kthread_start(ipvs, kd) < 0) repeat = true; } } atomic_set(&ipvs->est_genid_done, genid); if (repeat) queue_delayed_work(system_long_wq, &ipvs->est_reload_work, delay); unlock: mutex_unlock(&ipvs->est_mutex); } int ip_vs_use_count_inc(void) { return try_module_get(THIS_MODULE); } void ip_vs_use_count_dec(void) { module_put(THIS_MODULE); } /* * Returns hash value for virtual service */ static inline unsigned int ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; __u32 ahash; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif ahash = ntohl(addr_fold); ahash ^= ((size_t) ipvs >> 8); return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & IP_VS_SVC_TAB_MASK; } /* * Returns hash value of fwmark for virtual service lookup */ static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) { return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } /* * Hashes a service in the svc_table by <netns,proto,addr,port> * or by fwmark. * Should be called with locked tables. 
*/ static int ip_vs_svc_hash(struct ip_vs_service *svc) { unsigned int hash; if (svc->flags & IP_VS_SVC_F_HASHED) { pr_err("%s(): request for already hashed, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } if (svc->fwmark == 0) { /* * Hash it by <netns,protocol,addr,port> */ hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, &svc->addr, svc->port); } else { /* * Hash it by fwmark */ hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); } hlist_add_head_rcu(&svc->s_list, &svc->ipvs->svc_table[hash]); svc->flags |= IP_VS_SVC_F_HASHED; /* increase its refcnt because it is referenced by the svc table */ atomic_inc(&svc->refcnt); return 1; } /* * Unhashes a service from svc_table. * Should be called with locked tables. */ static int ip_vs_svc_unhash(struct ip_vs_service *svc) { if (!(svc->flags & IP_VS_SVC_F_HASHED)) { pr_err("%s(): request for unhash flagged, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } /* Remove it from svc_table */ hlist_del_rcu(&svc->s_list); svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); return 1; } /* * Get service by {netns, proto,addr,port} in the service table. */ static inline struct ip_vs_service * __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { unsigned int hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { if (svc->af == af && ip_vs_addr_equal(af, &svc->addr, vaddr) && svc->port == vport && svc->protocol == protocol && !svc->fwmark) { /* HIT */ return svc; } } return NULL; } /* * Get service by {fwmark} in the service table. 
*/ static inline struct ip_vs_service * __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) { unsigned int hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); hlist_for_each_entry_rcu(svc, &ipvs->svc_table[hash], s_list) { if (svc->fwmark == fwmark && svc->af == af) { /* HIT */ return svc; } } return NULL; } /* Find service, called under RCU lock */ struct ip_vs_service * ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc = NULL; int af_id = ip_vs_af_index(af); /* * Check the table hashed by fwmark first */ if (fwmark && atomic_read(&ipvs->fwm_services[af_id])) { svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark); if (svc) goto out; } if (!atomic_read(&ipvs->nonfwm_services[af_id])) goto out; /* * Check the table hashed by <protocol,addr,port> * for "full" addressed entries */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); if (svc) goto out; if (protocol == IPPROTO_TCP && atomic_read(&ipvs->ftpsvc_counter[af_id]) && (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) { /* * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT); if (svc) goto out; } if (atomic_read(&ipvs->nullsvc_counter[af_id])) { /* * Check if the catch-all port (port zero) exists */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0); } out: IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", fwmark, ip_vs_proto_name(protocol), IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), svc ? 
"hit" : "not hit"); return svc; } static inline void __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) { atomic_inc(&svc->refcnt); rcu_assign_pointer(dest->svc, svc); } static void ip_vs_service_free(struct ip_vs_service *svc) { ip_vs_stats_release(&svc->stats); kfree(svc); } static void ip_vs_service_rcu_free(struct rcu_head *head) { struct ip_vs_service *svc; svc = container_of(head, struct ip_vs_service, rcu_head); ip_vs_service_free(svc); } static void __ip_vs_svc_put(struct ip_vs_service *svc) { if (atomic_dec_and_test(&svc->refcnt)) { IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); } } /* * Returns hash value for real service */ static inline unsigned int ip_vs_rs_hashkey(int af, const union nf_inet_addr *addr, __be16 port) { unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) & IP_VS_RTAB_MASK; } /* Hash ip_vs_dest in rs_table by <proto,addr,port>. */ static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { unsigned int hash; __be16 port; if (dest->in_rs_table) return; switch (IP_VS_DFWD_METHOD(dest)) { case IP_VS_CONN_F_MASQ: port = dest->port; break; case IP_VS_CONN_F_TUNNEL: switch (dest->tun_type) { case IP_VS_CONN_F_TUNNEL_TYPE_GUE: port = dest->tun_port; break; case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: case IP_VS_CONN_F_TUNNEL_TYPE_GRE: port = 0; break; default: return; } break; default: return; } /* * Hash by proto,addr,port, * which are the parameters of the real service. */ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port); hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); dest->in_rs_table = 1; } /* Unhash ip_vs_dest from rs_table. 
*/ static void ip_vs_rs_unhash(struct ip_vs_dest *dest) { /* * Remove it from the rs_table table. */ if (dest->in_rs_table) { hlist_del_rcu(&dest->d_list); dest->in_rs_table = 0; } } /* Check if real service by <proto,addr,port> is present */ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { unsigned int hash; struct ip_vs_dest *dest; /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->protocol == protocol || dest->vfwmark) && IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return true; } } return false; } /* Find real service record by <proto,addr,port>. * In case of multiple records with the same <proto,addr,port>, only * the first found record is returned. * * To be called under RCU lock. */ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { unsigned int hash; struct ip_vs_dest *dest; /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->protocol == protocol || dest->vfwmark) && IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return dest; } } return NULL; } /* Find real service record by <af,addr,tun_port>. * In case of multiple records with the same <af,addr,tun_port>, only * the first found record is returned. * * To be called under RCU lock. 
*/
struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
				     const union nf_inet_addr *daddr,
				     __be16 tun_port)
{
	struct ip_vs_dest *rs;
	unsigned int bucket;

	/* Check for "full" addressed entries */
	bucket = ip_vs_rs_hashkey(af, daddr, tun_port);

	hlist_for_each_entry_rcu(rs, &ipvs->rs_table[bucket], d_list) {
		if (rs->tun_port != tun_port || rs->af != af)
			continue;
		if (!ip_vs_addr_equal(af, &rs->addr, daddr))
			continue;
		if (IP_VS_DFWD_METHOD(rs) != IP_VS_CONN_F_TUNNEL)
			continue;
		/* HIT */
		return rs;
	}

	return NULL;
}

/* Lookup destination by {addr,port} in the given service
 * Called under RCU lock.
 */
static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
		  const union nf_inet_addr *daddr, __be16 dport)
{
	struct ip_vs_dest *cur;

	/* Walk the service's destination list for an exact match */
	list_for_each_entry_rcu(cur, &svc->destinations, n_list) {
		if (cur->af != dest_af)
			continue;
		if (!ip_vs_addr_equal(dest_af, &cur->addr, daddr))
			continue;
		if (cur->port != dport)
			continue;
		/* HIT */
		return cur;
	}

	return NULL;
}

/*
 * Find destination by {daddr,dport,vaddr,protocol}
 * Created to be used in ip_vs_process_message() in
 * the backup synchronization daemon. It finds the
 * destination to be bound to the received connection
 * on the backup.
 * Called under RCU lock, no refcnt is returned.
*/ struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, __u32 fwmark, __u32 flags) { struct ip_vs_dest *dest; struct ip_vs_service *svc; __be16 port = dport; svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) port = 0; dest = ip_vs_lookup_dest(svc, dest_af, daddr, port); if (!dest) dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport); return dest; } void ip_vs_dest_dst_rcu_free(struct rcu_head *head) { struct ip_vs_dest_dst *dest_dst = container_of(head, struct ip_vs_dest_dst, rcu_head); dst_release(dest_dst->dst_cache); kfree(dest_dst); } /* Release dest_dst and dst_cache for dest in user context */ static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) { struct ip_vs_dest_dst *old; old = rcu_dereference_protected(dest->dest_dst, 1); if (old) { RCU_INIT_POINTER(dest->dest_dst, NULL); call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); } } /* * Lookup dest by {svc,addr,port} in the destination trash. * The destination trash is used to hold the destinations that are removed * from the service table but are still referenced by some conn entries. * The reason to add the destination trash is when the dest is temporary * down (either by administrator or by monitor program), the dest can be * picked back from the trash, the remaining connections to the dest can * continue, and the counting information of the dest is also useful for * scheduling. 
*/ static struct ip_vs_dest * ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; struct netns_ipvs *ipvs = svc->ipvs; /* * Find the destination in trash */ spin_lock_bh(&ipvs->dest_trash_lock); list_for_each_entry(dest, &ipvs->dest_trash, t_list) { IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), refcount_read(&dest->refcnt)); if (dest->af == dest_af && ip_vs_addr_equal(dest_af, &dest->addr, daddr) && dest->port == dport && dest->vfwmark == svc->fwmark && dest->protocol == svc->protocol && (svc->fwmark || (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && dest->vport == svc->port))) { /* HIT */ list_del(&dest->t_list); goto out; } } dest = NULL; out: spin_unlock_bh(&ipvs->dest_trash_lock); return dest; } static void ip_vs_dest_rcu_free(struct rcu_head *head) { struct ip_vs_dest *dest; dest = container_of(head, struct ip_vs_dest, rcu_head); ip_vs_stats_release(&dest->stats); ip_vs_dest_put_and_free(dest); } static void ip_vs_dest_free(struct ip_vs_dest *dest) { struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); __ip_vs_svc_put(svc); call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); } /* * Clean up all the destinations in the trash * Called by the ip_vs_control_cleanup() * * When the ip_vs_control_clearup is activated by ipvs module exit, * the service tables must have been flushed and all the connections * are expired, and the refcnt of each destination in the trash must * be 1, so we simply release them here. 
*/ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_dest *dest, *nxt; timer_delete_sync(&ipvs->dest_trash_timer); /* No need to use dest_trash_lock */ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { list_del(&dest->t_list); ip_vs_dest_free(dest); } } static void ip_vs_stats_rcu_free(struct rcu_head *head) { struct ip_vs_stats_rcu *rs = container_of(head, struct ip_vs_stats_rcu, rcu_head); ip_vs_stats_release(&rs->s); kfree(rs); } static void ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c spin_lock(&src->lock); IP_VS_SHOW_STATS_COUNTER(conns); IP_VS_SHOW_STATS_COUNTER(inpkts); IP_VS_SHOW_STATS_COUNTER(outpkts); IP_VS_SHOW_STATS_COUNTER(inbytes); IP_VS_SHOW_STATS_COUNTER(outbytes); ip_vs_read_estimator(dst, src); spin_unlock(&src->lock); } static void ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) { dst->conns = (u32)src->conns; dst->inpkts = (u32)src->inpkts; dst->outpkts = (u32)src->outpkts; dst->inbytes = src->inbytes; dst->outbytes = src->outbytes; dst->cps = (u32)src->cps; dst->inpps = (u32)src->inpps; dst->outpps = (u32)src->outpps; dst->inbps = (u32)src->inbps; dst->outbps = (u32)src->outbps; } static void ip_vs_zero_stats(struct ip_vs_stats *stats) { spin_lock(&stats->lock); /* get current counters as zero point, rates are zeroed */ #define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c IP_VS_ZERO_STATS_COUNTER(conns); IP_VS_ZERO_STATS_COUNTER(inpkts); IP_VS_ZERO_STATS_COUNTER(outpkts); IP_VS_ZERO_STATS_COUNTER(inbytes); IP_VS_ZERO_STATS_COUNTER(outbytes); ip_vs_zero_estimator(stats); spin_unlock(&stats->lock); } /* Allocate fields after kzalloc */ int ip_vs_stats_init_alloc(struct ip_vs_stats *s) { int i; spin_lock_init(&s->lock); s->cpustats = alloc_percpu(struct ip_vs_cpu_stats); if (!s->cpustats) return -ENOMEM; for_each_possible_cpu(i) { struct ip_vs_cpu_stats *cs = 
per_cpu_ptr(s->cpustats, i); u64_stats_init(&cs->syncp); } return 0; } struct ip_vs_stats *ip_vs_stats_alloc(void) { struct ip_vs_stats *s = kzalloc_obj(*s); if (s && ip_vs_stats_init_alloc(s) >= 0) return s; kfree(s); return NULL; } void ip_vs_stats_release(struct ip_vs_stats *stats) { free_percpu(stats->cpustats); } void ip_vs_stats_free(struct ip_vs_stats *stats) { if (stats) { ip_vs_stats_release(stats); kfree(stats); } } /* * Update a destination in the given service */ static void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_service *old_svc; struct ip_vs_scheduler *sched; int conn_flags; /* We cannot modify an address and change the address family */ BUG_ON(!add && udest->af != dest->af); if (add && udest->af != svc->af) ipvs->mixed_address_family_dests++; /* keep the last_weight with latest non-0 weight */ if (add || udest->weight != 0) atomic_set(&dest->last_weight, udest->weight); /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; /* Need to rehash? */ if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_DFWD_METHOD(dest) || udest->tun_type != dest->tun_type || udest->tun_port != dest->tun_port) ip_vs_rs_unhash(dest); /* set the tunnel info */ dest->tun_type = udest->tun_type; dest->tun_port = udest->tun_port; dest->tun_flags = udest->tun_flags; /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; } else { /* FTP-NAT requires conntrack for mangling */ if (svc->port == FTPPORT) ip_vs_register_conntrack(svc); } atomic_set(&dest->conn_flags, conn_flags); /* Put the real service in rs_table if not present. 
*/ ip_vs_rs_hash(ipvs, dest); /* bind the service */ old_svc = rcu_dereference_protected(dest->svc, 1); if (!old_svc) { __ip_vs_bind_svc(dest, svc); } else { if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); __ip_vs_svc_put(old_svc); } } /* set the dest status flags */ dest->flags |= IP_VS_DEST_F_AVAILABLE; if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) dest->flags &= ~IP_VS_DEST_F_OVERLOAD; dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; dest->af = udest->af; if (add) { list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->add_dest) sched->add_dest(svc, dest); } else { spin_lock_bh(&dest->dst_lock); __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->upd_dest) sched->upd_dest(svc, dest); } } /* * Create a destination for the given service */ static int ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; unsigned int atype; int ret; #ifdef CONFIG_IP_VS_IPV6 if (udest->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) return -EINVAL; ret = nf_defrag_ipv6_enable(svc->ipvs->net); if (ret) return ret; } else #endif { atype = inet_addr_type(svc->ipvs->net, udest->addr.ip); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; } dest = kzalloc_obj(struct ip_vs_dest); if (dest == NULL) return -ENOMEM; ret = ip_vs_stats_init_alloc(&dest->stats); if (ret < 0) goto err_alloc; ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); if (ret < 0) goto err_stats; dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; dest->vport = svc->port; dest->vfwmark = svc->fwmark; ip_vs_addr_copy(udest->af, 
&dest->addr, &udest->addr); dest->port = udest->port; atomic_set(&dest->activeconns, 0); atomic_set(&dest->inactconns, 0); atomic_set(&dest->persistconns, 0); refcount_set(&dest->refcnt, 1); INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); __ip_vs_update_dest(svc, dest, udest, 1); return 0; err_stats: ip_vs_stats_release(&dest->stats); err_alloc: kfree(dest); return ret; } /* * Add a destination into an existing service */ static int ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; union nf_inet_addr daddr; __be16 dport = udest->port; int ret; if (udest->weight < 0) { pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; } if (udest->l_threshold > udest->u_threshold) { pr_err("%s(): lower threshold is higher than upper threshold\n", __func__); return -ERANGE; } if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { if (udest->tun_port == 0) { pr_err("%s(): tunnel port is zero\n", __func__); return -EINVAL; } } ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest != NULL) { IP_VS_DBG(1, "%s(): dest already exists\n", __func__); return -EEXIST; } /* * Check if the dest already exists in the trash and * is from the same service */ dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport); if (dest != NULL) { IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " "dest->refcnt=%d, service %u/%s:%u\n", IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), refcount_read(&dest->refcnt), dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); if (ret < 0) return ret; __ip_vs_update_dest(svc, dest, udest, 1); } else { /* * Allocate and initialize the dest structure */ ret = ip_vs_new_dest(svc, udest); } return ret; } /* * Edit a destination in the given service */ static int 
ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; union nf_inet_addr daddr; __be16 dport = udest->port; if (udest->weight < 0) { pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; } if (udest->l_threshold > udest->u_threshold) { pr_err("%s(): lower threshold is higher than upper threshold\n", __func__); return -ERANGE; } if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { if (udest->tun_port == 0) { pr_err("%s(): tunnel port is zero\n", __func__); return -EINVAL; } } ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); return -ENOENT; } __ip_vs_update_dest(svc, dest, udest, 0); return 0; } /* * Delete a destination (must be already unlinked from the service) */ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, bool cleanup) { ip_vs_stop_estimator(ipvs, &dest->stats); /* * Remove it from the d-linked list with the real services. */ ip_vs_rs_unhash(dest); spin_lock_bh(&ipvs->dest_trash_lock); IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), refcount_read(&dest->refcnt)); if (list_empty(&ipvs->dest_trash) && !cleanup) mod_timer(&ipvs->dest_trash_timer, jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); /* dest lives in trash with reference */ list_add(&dest->t_list, &ipvs->dest_trash); dest->idle_start = 0; spin_unlock_bh(&ipvs->dest_trash_lock); /* Queue up delayed work to expire all no destination connections. * No-op when CONFIG_SYSCTL is disabled. 
*/ if (!cleanup) ip_vs_enqueue_expire_nodest_conns(ipvs); } /* * Unlink a destination from the given service */ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, int svcupd) { dest->flags &= ~IP_VS_DEST_F_AVAILABLE; spin_lock_bh(&dest->dst_lock); __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); /* * Remove it from the d-linked destination list. */ list_del_rcu(&dest->n_list); svc->num_dests--; if (dest->af != svc->af) svc->ipvs->mixed_address_family_dests--; if (svcupd) { struct ip_vs_scheduler *sched; sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->del_dest) sched->del_dest(svc, dest); } } /* * Delete a destination server in the given service */ static int ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; __be16 dport = udest->port; /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): destination not found!\n", __func__); return -ENOENT; } /* * Unlink dest from the service */ __ip_vs_unlink_dest(svc, dest, 1); /* * Delete the destination */ __ip_vs_del_dest(svc->ipvs, dest, false); return 0; } static void ip_vs_dest_trash_expire(struct timer_list *t) { struct netns_ipvs *ipvs = timer_container_of(ipvs, t, dest_trash_timer); struct ip_vs_dest *dest, *next; unsigned long now = jiffies; spin_lock(&ipvs->dest_trash_lock); list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { if (refcount_read(&dest->refcnt) > 1) continue; if (dest->idle_start) { if (time_before(now, dest->idle_start + IP_VS_DEST_TRASH_PERIOD)) continue; } else { dest->idle_start = max(1UL, now); continue; } IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", dest->vfwmark, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); list_del(&dest->t_list); ip_vs_dest_free(dest); } if (!list_empty(&ipvs->dest_trash)) 
mod_timer(&ipvs->dest_trash_timer, jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); spin_unlock(&ipvs->dest_trash_lock); } /* * Add a service into the service hash table */ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { int ret = 0; struct ip_vs_scheduler *sched = NULL; int af_id = ip_vs_af_index(u->af); struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; int ret_hooks = -1; /* increase the module use count */ if (!ip_vs_use_count_inc()) return -ENOPROTOOPT; /* Lookup the scheduler by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { sched = ip_vs_scheduler_get(u->sched_name); if (!sched) { pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); ret = -ENOENT; goto out_err; } } if (u->pe_name && *u->pe_name) { pe = ip_vs_pe_getbyname(u->pe_name); if (pe == NULL) { pr_info("persistence engine module ip_vs_pe_%s " "not found\n", u->pe_name); ret = -ENOENT; goto out_err; } } #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6) { __u32 plen = (__force __u32) u->netmask; if (plen < 1 || plen > 128) { ret = -EINVAL; goto out_err; } ret = nf_defrag_ipv6_enable(ipvs->net); if (ret) goto out_err; } #endif if (!atomic_read(&ipvs->num_services[af_id])) { ret = ip_vs_register_hooks(ipvs, u->af); if (ret < 0) goto out_err; ret_hooks = ret; } svc = kzalloc_obj(struct ip_vs_service); if (svc == NULL) { IP_VS_DBG(1, "%s(): no memory\n", __func__); ret = -ENOMEM; goto out_err; } ret = ip_vs_stats_init_alloc(&svc->stats); if (ret < 0) goto out_err; /* I'm the first user of the service */ atomic_set(&svc->refcnt, 0); svc->af = u->af; svc->protocol = u->protocol; ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); svc->port = u->port; svc->fwmark = u->fwmark; svc->flags = u->flags & ~IP_VS_SVC_F_HASHED; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; svc->ipvs = ipvs; INIT_LIST_HEAD(&svc->destinations); spin_lock_init(&svc->sched_lock); /* Bind the scheduler */ if (sched) { ret = 
ip_vs_bind_scheduler(svc, sched); if (ret) goto out_err; sched = NULL; } ret = ip_vs_start_estimator(ipvs, &svc->stats); if (ret < 0) goto out_err; /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter[af_id]); else if (!svc->port && !svc->fwmark) atomic_inc(&ipvs->nullsvc_counter[af_id]); if (pe && pe->conn_out) atomic_inc(&ipvs->conn_out_counter[af_id]); /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); pe = NULL; if (svc->fwmark) atomic_inc(&ipvs->fwm_services[af_id]); else atomic_inc(&ipvs->nonfwm_services[af_id]); atomic_inc(&ipvs->num_services[af_id]); /* Hash the service into the service table */ ip_vs_svc_hash(svc); *svc_p = svc; if (!READ_ONCE(ipvs->enable)) { /* Now there is a service - full throttle */ WRITE_ONCE(ipvs->enable, 1); /* Start estimation for first time */ ip_vs_est_reload_start(ipvs); } return 0; out_err: if (ret_hooks >= 0) ip_vs_unregister_hooks(ipvs, u->af); if (svc != NULL) { ip_vs_unbind_scheduler(svc, sched); ip_vs_service_free(svc); } ip_vs_scheduler_put(sched); ip_vs_pe_put(pe); /* decrease the module use count */ ip_vs_use_count_dec(); return ret; } /* * Edit a service and bind it with a new scheduler */ static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; bool new_pe_conn_out, old_pe_conn_out; struct netns_ipvs *ipvs = svc->ipvs; int af_id = ip_vs_af_index(svc->af); /* * Lookup the scheduler, by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { sched = ip_vs_scheduler_get(u->sched_name); if (!sched) { pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); return -ENOENT; } } old_sched = sched; if (u->pe_name && *u->pe_name) { pe = ip_vs_pe_getbyname(u->pe_name); if (pe == NULL) { pr_info("persistence engine module ip_vs_pe_%s " "not found\n", u->pe_name); ret = -ENOENT; goto out; } old_pe = pe; } #ifdef 
CONFIG_IP_VS_IPV6 if (u->af == AF_INET6) { __u32 plen = (__force __u32) u->netmask; if (plen < 1 || plen > 128) { ret = -EINVAL; goto out; } } #endif old_sched = rcu_dereference_protected(svc->scheduler, 1); if (sched != old_sched) { if (old_sched) { ip_vs_unbind_scheduler(svc, old_sched); RCU_INIT_POINTER(svc->scheduler, NULL); /* Wait all svc->sched_data users */ synchronize_rcu(); } /* Bind the new scheduler */ if (sched) { ret = ip_vs_bind_scheduler(svc, sched); if (ret) { ip_vs_scheduler_put(sched); goto out; } } } /* * Set the flags and timeout value */ svc->flags = u->flags | IP_VS_SVC_F_HASHED; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; old_pe = rcu_dereference_protected(svc->pe, 1); if (pe != old_pe) { rcu_assign_pointer(svc->pe, pe); /* check for optional methods in new pe */ new_pe_conn_out = (pe && pe->conn_out) ? true : false; old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; if (new_pe_conn_out && !old_pe_conn_out) atomic_inc(&ipvs->conn_out_counter[af_id]); if (old_pe_conn_out && !new_pe_conn_out) atomic_dec(&ipvs->conn_out_counter[af_id]); } out: ip_vs_scheduler_put(old_sched); ip_vs_pe_put(old_pe); return ret; } /* * Delete a service from the service list * - The service must be unlinked, unlocked and not referenced! 
* - We are called under _bh lock */
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
{
	struct ip_vs_dest *dest, *nxt;
	struct ip_vs_scheduler *old_sched;
	struct ip_vs_pe *old_pe;
	struct netns_ipvs *ipvs = svc->ipvs;
	int af_id = ip_vs_af_index(svc->af);

	/* last service of this address family: drop the hooks */
	atomic_dec(&ipvs->num_services[af_id]);
	if (!atomic_read(&ipvs->num_services[af_id]))
		ip_vs_unregister_hooks(ipvs, svc->af);
	if (svc->fwmark)
		atomic_dec(&ipvs->fwm_services[af_id]);
	else
		atomic_dec(&ipvs->nonfwm_services[af_id]);

	ip_vs_stop_estimator(svc->ipvs, &svc->stats);

	/* Unbind scheduler */
	old_sched = rcu_dereference_protected(svc->scheduler, 1);
	ip_vs_unbind_scheduler(svc, old_sched);
	ip_vs_scheduler_put(old_sched);

	/* Unbind persistence engine, keep svc->pe */
	old_pe = rcu_dereference_protected(svc->pe, 1);
	if (old_pe && old_pe->conn_out)
		atomic_dec(&ipvs->conn_out_counter[af_id]);
	ip_vs_pe_put(old_pe);

	/*
	 *    Unlink the whole destination list
	 *    (svcupd == 0: the scheduler's del_dest hook is not invoked)
	 */
	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
		__ip_vs_unlink_dest(svc, dest, 0);
		__ip_vs_del_dest(svc->ipvs, dest, cleanup);
	}

	/*
	 *    Update the virtual service counters
	 */
	if (svc->port == FTPPORT)
		atomic_dec(&ipvs->ftpsvc_counter[af_id]);
	else if (!svc->port && !svc->fwmark)
		atomic_dec(&ipvs->nullsvc_counter[af_id]);

	/*
	 *    Free the service if nobody refers to it
	 */
	__ip_vs_svc_put(svc);

	/* decrease the module use count */
	ip_vs_use_count_dec();
}

/*
 * Unlink a service from list and try to delete it if its refcnt reached 0
 *
 * @cleanup is passed through to __ip_vs_del_service().
 */
static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
{
	ip_vs_unregister_conntrack(svc);
	/* Hold svc to avoid double release from dest_trash */
	atomic_inc(&svc->refcnt);
	/*
	 * Unhash it from the service table
	 */
	ip_vs_svc_unhash(svc);

	__ip_vs_del_service(svc, cleanup);
}

/*
 *	Delete a service from the service list
 *
 *	Returns 0 on success, or -EEXIST when @svc is NULL.
 *	NOTE(review): -EEXIST for a missing service looks unusual; it is
 *	left unchanged here because the value may be visible to callers.
 */
static int ip_vs_del_service(struct ip_vs_service *svc)
{
	if (svc == NULL)
		return -EEXIST;
	ip_vs_unlink_service(svc, false);

	return 0;
}


/*
 *	Flush all the virtual services
*/ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) { int idx; struct ip_vs_service *svc; struct hlist_node *n; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ipvs->svc_table[idx], s_list) ip_vs_unlink_service(svc, cleanup); } return 0; } /* * Delete service by {netns} in the service table. * Called by __ip_vs_batch_cleanup() */ void ip_vs_service_nets_cleanup(struct list_head *net_list) { struct netns_ipvs *ipvs; struct net *net; /* Check for "full" addressed entries */ list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); mutex_lock(&ipvs->service_mutex); ip_vs_flush(ipvs, true); mutex_unlock(&ipvs->service_mutex); } } /* Put all references for device (dst_cache) */ static inline void ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) { struct ip_vs_dest_dst *dest_dst; spin_lock_bh(&dest->dst_lock); dest_dst = rcu_dereference_protected(dest->dest_dst, 1); if (dest_dst && dest_dst->dst_cache->dev == dev) { IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", dev->name, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), refcount_read(&dest->refcnt)); __ip_vs_dst_cache_reset(dest); } spin_unlock_bh(&dest->dst_lock); } /* Netdev event receiver * Currently only NETDEV_DOWN is handled to release refs to cached dsts */ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_service *svc; struct ip_vs_dest *dest; unsigned int idx; if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); rcu_read_lock(); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) list_for_each_entry_rcu(dest, &svc->destinations, n_list) ip_vs_forget_dev(dest, dev); } rcu_read_unlock(); return NOTIFY_DONE; } /* * Zero 
counters in a service or all services */
/* Zero the stats of every destination of @svc, then of @svc itself. */
static int ip_vs_zero_service(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest;

	list_for_each_entry(dest, &svc->destinations, n_list) {
		ip_vs_zero_stats(&dest->stats);
	}
	ip_vs_zero_stats(&svc->stats);
	return 0;
}

/* Zero the stats of every service in the table and the netns totals. */
static int ip_vs_zero_all(struct netns_ipvs *ipvs)
{
	int idx;
	struct ip_vs_service *svc;

	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list)
			ip_vs_zero_service(svc);
	}

	ip_vs_zero_stats(&ipvs->tot_stats->s);
	return 0;
}

#ifdef CONFIG_SYSCTL

/*
 * Handler for the defense-mode sysctls.  proc_dointvec() works on a local
 * copy; a changed value is stored only when it is in 0..3 (otherwise
 * -EINVAL), and update_defense_level() is called after storing it.
 */
static int
proc_do_defense_mode(const struct ctl_table *table, int write,
		     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *valp = table->data;
	int val = *valp;
	int rc;

	/* temporary table so that parsing never touches the live value */
	struct ctl_table tmp = {
		.data = &val,
		.maxlen = sizeof(int),
		.mode = table->mode,
	};

	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
	if (write && (*valp != val)) {
		if (val < 0 || val > 3) {
			rc = -EINVAL;
		} else {
			*valp = val;
			update_defense_level(ipvs);
		}
	}
	return rc;
}

/*
 * Handler for "sync_threshold" (a pair of ints), run under sync_mutex.
 * A written pair is rejected with -EINVAL when either value is negative or
 * when val[0] >= val[1] while val[1] is non-zero.
 */
static int
proc_do_sync_threshold(const struct ctl_table *table, int write,
		       void *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *valp = table->data;
	int val[2];
	int rc;
	struct ctl_table tmp = {
		.data = &val,
		.maxlen = table->maxlen,
		.mode = table->mode,
	};

	mutex_lock(&ipvs->sync_mutex);
	/* start from the current pair so a read reports the live values */
	memcpy(val, valp, sizeof(val));
	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
	if (write) {
		if (val[0] < 0 || val[1] < 0 ||
		    (val[0] >= val[1] && val[1]))
			rc = -EINVAL;
		else
			memcpy(valp, val, sizeof(val));
	}
	mutex_unlock(&ipvs->sync_mutex);
	return rc;
}

/*
 * Handler for "sync_ports": a changed value is accepted only when it is
 * at least 1 and a power of two, otherwise -EINVAL.
 */
static int
proc_do_sync_ports(const struct ctl_table *table, int write,
		   void *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val = *valp;
	int rc;

	struct ctl_table tmp = {
		.data = &val,
		.maxlen = sizeof(int),
		.mode = table->mode,
	};

	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
	if (write && (*valp != val)) {
		if (val < 1 || !is_power_of_2(val))
			rc = -EINVAL;
		else
			*valp = val;
	}
	return rc;
}

/*
 * Parse a CPU list from @buffer and install it as the estimator cpumask.
 * The stored mask is allocated on first use, the parsed list is ANDed with
 * current->cpus_mask, and an estimator reload is requested.
 * Returns 0, -ENOMEM, or the error from cpulist_parse().
 */
static int ipvs_proc_est_cpumask_set(const struct ctl_table *table,
				     void *buffer)
{
	struct netns_ipvs *ipvs = table->extra2;
	cpumask_var_t *valp = table->data;
	cpumask_var_t newmask;
	int ret;

	if (!zalloc_cpumask_var(&newmask, GFP_KERNEL))
		return -ENOMEM;

	ret = cpulist_parse(buffer, newmask);
	if (ret)
		goto out;

	mutex_lock(&ipvs->est_mutex);

	if (!ipvs->est_cpulist_valid) {
		if (!zalloc_cpumask_var(valp, GFP_KERNEL)) {
			ret = -ENOMEM;
			goto unlock;
		}
		ipvs->est_cpulist_valid = 1;
	}
	cpumask_and(newmask, newmask, &current->cpus_mask);
	cpumask_copy(*valp, newmask);
	/* est_max_threads may depend on cpulist size */
	ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
	ipvs->est_calc_phase = 1;
	ip_vs_est_reload_start(ipvs);

unlock:
	mutex_unlock(&ipvs->est_mutex);

out:
	free_cpumask_var(newmask);
	return ret;
}

/*
 * Format the estimator CPU list into @buffer (at most @size bytes).  When
 * no list has been set, the HK_TYPE_KTHREAD housekeeping mask is shown.
 * Returns the value of scnprintf().
 */
static int ipvs_proc_est_cpumask_get(const struct ctl_table *table,
				     void *buffer, size_t size)
{
	struct netns_ipvs *ipvs = table->extra2;
	cpumask_var_t *valp = table->data;
	struct cpumask *mask;
	int ret;

	mutex_lock(&ipvs->est_mutex);

	if (ipvs->est_cpulist_valid)
		mask = *valp;
	else
		mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
	ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));

	mutex_unlock(&ipvs->est_mutex);

	return ret;
}

/*
 * Handler for "est_cpulist".  Only accesses at offset 0 with a non-zero
 * length are served; anything else reports zero bytes.
 */
static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write,
				 void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	/* Ignore both read and write(append) if *ppos not 0 */
	if (*ppos || !*lenp) {
		*lenp = 0;
		return 0;
	}
	if (write) {
		/* proc_sys_call_handler() appends terminator */
		ret = ipvs_proc_est_cpumask_set(table, buffer);
		if (ret >= 0)
			*ppos += *lenp;
	} else {
		/* proc_sys_call_handler() allocates 1 byte for terminator */
		ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1);
		if (ret >= 0) {
			/* report the formatted length, return success */
			*lenp = ret;
			*ppos += *lenp;
			ret = 0;
		}
	}
	return ret;
}

/*
 * Handler for "est_nice": values outside MIN_NICE..MAX_NICE are rejected
 * with -EINVAL; a changed value is stored under est_mutex and triggers an
 * estimator reload.
 */
static int ipvs_proc_est_nice(const struct ctl_table *table, int write,
			      void *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *valp = table->data;
	int val = *valp;
	int ret;

	struct ctl_table tmp_table = {
		.data = &val,
		.maxlen = sizeof(int),
		.mode = table->mode,
	};

	ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
	if (write && ret >= 0) {
		if (val < MIN_NICE || val > MAX_NICE) {
			ret = -EINVAL;
		} else {
			mutex_lock(&ipvs->est_mutex);
			if (*valp != val) {
				*valp = val;
				ip_vs_est_reload_start(ipvs);
			}
			mutex_unlock(&ipvs->est_mutex);
		}
	}
	return ret;
}

/*
 * Handler for "run_estimation": a changed value is stored under est_mutex
 * and triggers an estimator reload.  No range check is applied here.
 */
static int ipvs_proc_run_estimation(const struct ctl_table *table, int write,
				    void *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *valp = table->data;
	int val = *valp;
	int ret;

	struct ctl_table tmp_table = {
		.data = &val,
		.maxlen = sizeof(int),
		.mode = table->mode,
	};

	ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
	if (write && ret >= 0) {
		mutex_lock(&ipvs->est_mutex);
		if (*valp != val) {
			*valp = val;
			ip_vs_est_reload_start(ipvs);
		}
		mutex_unlock(&ipvs->est_mutex);
	}
	return ret;
}

/*
 *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
 *	Do not change the order or insert new entries without updating
 *	the matching netns init in ip_vs_control_net_init().
 *
 *	Most entries carry no .data pointer in this template; only
 *	"debug_level" points at a global variable.
 */

static struct ctl_table vs_vars[] = {
	{
		.procname	= "amemthresh",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "am_droprate",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "drop_entry",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_defense_mode,
	},
	{
		.procname	= "drop_packet",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_defense_mode,
	},
#ifdef CONFIG_IP_VS_NFCT
	{
		.procname	= "conntrack",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
#endif
	{
		.procname	= "secure_tcp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_defense_mode,
	},
	{
		.procname	= "snat_reroute",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* limited to 0..1 by proc_dointvec_minmax */
		.procname	= "sync_version",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "sync_ports",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_sync_ports,
	},
	{
		.procname	= "sync_persist_mode",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "sync_qlen_max",
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{
		.procname	= "sync_sock_size",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "cache_bypass",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "expire_nodest_conn",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "sloppy_tcp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "sloppy_sctp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "expire_quiescent_template",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* sized from the netns field; see proc_do_sync_threshold() */
		.procname	= "sync_threshold",
		.maxlen		=
			sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
		.mode		= 0644,
		.proc_handler	= proc_do_sync_threshold,
	},
	{
		.procname	= "sync_refresh_period",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* limited to 0..3 by proc_dointvec_minmax */
		.procname	= "sync_retries",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_THREE,
	},
	{
		.procname	= "nat_icmp_send",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "pmtu_disc",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "backup_only",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "conn_reuse_mode",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "schedule_icmp",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "ignore_tunneled",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "run_estimation",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= ipvs_proc_run_estimation,
	},
	{
		.procname	= "est_cpulist",
		.maxlen		= NR_CPUS,	/* unused */
		.mode		= 0644,
		.proc_handler	= ipvs_proc_est_cpulist,
	},
	{
		.procname	= "est_nice",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= ipvs_proc_est_nice,
	},
#ifdef CONFIG_IP_VS_DEBUG
	{
		.procname	= "debug_level",
		.data		= &sysctl_ip_vs_debug_level,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
#endif
};

#endif

#ifdef CONFIG_PROC_FS

/* Iterator state for the seq_file that dumps the service table. */
struct ip_vs_iter {
	struct seq_net_private p;  /* Do not move this, netns depends upon it*/
	int bucket;		/* svc_table bucket of the current entry */
};

/*
 *	Write the contents of the VS rule table to a PROCfs file.
 *	(It is kept just for backward compatibility)
 */
/* Map the forwarding-method bits of @flags to a display name. */
static inline const char *ip_vs_fwd_name(unsigned int flags)
{
	switch (flags & IP_VS_CONN_F_FWD_MASK) {
	case IP_VS_CONN_F_LOCALNODE:
		return "Local";
	case IP_VS_CONN_F_TUNNEL:
		return "Tunnel";
	case IP_VS_CONN_F_DROUTE:
		return "Route";
	default:
		/* every other value of the masked bits is shown as "Masq" */
		return "Masq";
	}
}


/*
 * Return the service at zero-based position @pos when walking svc_table
 * bucket by bucket, recording its bucket in the iterator; NULL when @pos
 * is past the last service.
 */
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
	struct net *net = seq_file_net(seq);
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct ip_vs_iter *iter = seq->private;
	int idx;
	struct ip_vs_service *svc;

	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		hlist_for_each_entry_rcu(svc, &ipvs->svc_table[idx], s_list) {
			if (pos-- == 0) {
				iter->bucket = idx;
				return svc;
			}
		}
	}
	return NULL;
}

static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return *pos ? 
/*
 * seq_file ->next callback for the service listing.
 *
 * Advances *pos, then returns:
 *  - the first service (position 0) if @v is the start token,
 *  - otherwise the next entry in the current hash chain,
 *  - otherwise the first entry of the next non-empty bucket,
 *  - or NULL once all buckets have been visited.
 * The current bucket is tracked in the iterator stored in seq->private.
 */
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq));
	struct ip_vs_iter *iter = seq->private;
	struct ip_vs_service *cur = v;
	struct hlist_node *next;

	*pos += 1;
	if (v == SEQ_START_TOKEN)
		return ip_vs_info_array(seq, 0);

	/* Still inside the same chain? */
	next = rcu_dereference(hlist_next_rcu(&cur->s_list));
	if (next)
		return hlist_entry(next, struct ip_vs_service, s_list);

	/* Chain exhausted: move on to the head of the next non-empty bucket */
	for (iter->bucket++; iter->bucket < IP_VS_SVC_TAB_SIZE; iter->bucket++) {
		hlist_for_each_entry_rcu(cur, &ipvs->svc_table[iter->bucket],
					 s_list)
			return cur;
	}

	return NULL;
}
sched->name : "none"; if (!svc->fwmark) { #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) seq_printf(seq, "%s [%pI6]:%04X %s ", ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), sched_name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", svc->fwmark, sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } if (svc->flags & IP_VS_SVC_F_PERSISTENT) seq_printf(seq, "persistent %d %08X\n", svc->timeout, ntohl(svc->netmask)); else seq_putc(seq, '\n'); list_for_each_entry_rcu(dest, &svc->destinations, n_list) { #ifdef CONFIG_IP_VS_IPV6 if (dest->af == AF_INET6) seq_printf(seq, " -> [%pI6]:%04X" " %-7s %-6d %-10d %-10d\n", &dest->addr.in6, ntohs(dest->port), ip_vs_fwd_name(atomic_read(&dest->conn_flags)), atomic_read(&dest->weight), atomic_read(&dest->activeconns), atomic_read(&dest->inactconns)); else #endif seq_printf(seq, " -> %08X:%04X " "%-7s %-6d %-10d %-10d\n", ntohl(dest->addr.ip), ntohs(dest->port), ip_vs_fwd_name(atomic_read(&dest->conn_flags)), atomic_read(&dest->weight), atomic_read(&dest->activeconns), atomic_read(&dest->inactconns)); } } return 0; } static const struct seq_operations ip_vs_info_seq_ops = { .start = ip_vs_info_seq_start, .next = ip_vs_info_seq_next, .stop = ip_vs_info_seq_stop, .show = ip_vs_info_seq_show, }; static int ip_vs_stats_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); struct ip_vs_kstats show; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Total Incoming Outgoing Incoming Outgoing\n"); seq_puts(seq, " Conns Packets Packets Bytes Bytes\n"); ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)show.conns, (unsigned long long)show.inpkts, (unsigned long long)show.outpkts, (unsigned long 
long)show.inbytes, (unsigned long long)show.outbytes); /* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", (unsigned long long)show.cps, (unsigned long long)show.inpps, (unsigned long long)show.outpps, (unsigned long long)show.inbps, (unsigned long long)show.outbps); return 0; } static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct ip_vs_kstats kstats; int i; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Total Incoming Outgoing Incoming Outgoing\n"); seq_puts(seq, "CPU Conns Packets Packets Bytes Bytes\n"); for_each_possible_cpu(i) { struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); unsigned int start; u64 conns, inpkts, outpkts, inbytes, outbytes; do { start = u64_stats_fetch_begin(&u->syncp); conns = u64_stats_read(&u->cnt.conns); inpkts = u64_stats_read(&u->cnt.inpkts); outpkts = u64_stats_read(&u->cnt.outpkts); inbytes = u64_stats_read(&u->cnt.inbytes); outbytes = u64_stats_read(&u->cnt.outbytes); } while (u64_stats_fetch_retry(&u->syncp, start)); seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", i, (u64)conns, (u64)inpkts, (u64)outpkts, (u64)inbytes, (u64)outbytes); } ip_vs_copy_stats(&kstats, tot_stats); seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)kstats.conns, (unsigned long long)kstats.inpkts, (unsigned long long)kstats.outpkts, (unsigned long long)kstats.inbytes, (unsigned long long)kstats.outbytes); /* ... 
01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", kstats.cps, kstats.inpps, kstats.outpps, kstats.inbps, kstats.outbps); return 0; } #endif /* * Set timeout values for tcp tcpfin udp in the timeout_table. */ static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; #endif IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", u->tcp_timeout, u->tcp_fin_timeout, u->udp_timeout); #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) || u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) { return -EINVAL; } #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ)) return -EINVAL; #endif #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] = u->tcp_timeout * HZ; } if (u->tcp_fin_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] = u->tcp_fin_timeout * HZ; } #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); pd->timeout_table[IP_VS_UDP_S_NORMAL] = u->udp_timeout * HZ; } #endif return 0; } #define CMDID(cmd) (cmd - IP_VS_BASE_CTL) struct ip_vs_svcdest_user { struct ip_vs_service_user s; struct ip_vs_dest_user d; }; static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = { [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user), [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user), [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user), [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user), [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user), [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct 
/*
 * Handler for the IPVS setsockopt() commands (installed as .set in
 * ip_vs_sockopts).
 *
 * @sk:  socket the option was set on; its netns selects the IPVS instance
 * @cmd: one of the IP_VS_SO_SET_* commands
 * @ptr: user argument
 * @len: argument length; must equal set_arglen[CMDID(cmd)] exactly
 *
 * The caller needs CAP_NET_ADMIN in the user namespace owning the socket's
 * netns.  Daemon start/stop commands are handled before service_mutex is
 * taken; all other commands run with ipvs->service_mutex held.
 *
 * Returns 0 on success or a negative errno: -EPERM (no capability),
 * -EINVAL (bad command, length, scheduler name or daemon interface name),
 * -EFAULT (copy failure, and also an unsupported protocol), -ESRCH (service
 * not found for a non-ADD command), -EEXIST (ADD of an existing service),
 * or whatever the invoked helper returns.
 */
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
{
	struct net *net = sock_net(sk);
	int ret;
	unsigned char arg[MAX_SET_ARGLEN];
	struct ip_vs_service_user *usvc_compat;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_service *svc;
	struct ip_vs_dest_user *udest_compat;
	struct ip_vs_dest_user_kern udest;
	struct netns_ipvs *ipvs = net_ipvs(net);

	/* set_arglen[] stores lengths as unsigned char, so they must fit */
	BUILD_BUG_ON(sizeof(arg) > 255);
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
		return -EINVAL;
	/* The exact-length check also guarantees len <= sizeof(arg) */
	if (len != set_arglen[CMDID(cmd)]) {
		IP_VS_DBG(1, "set_ctl: len %u != %u\n",
			  len, set_arglen[CMDID(cmd)]);
		return -EINVAL;
	}

	if (copy_from_sockptr(arg, ptr, len) != 0)
		return -EFAULT;

	/* Handle daemons since they have another lock */
	if (cmd == IP_VS_SO_SET_STARTDAEMON ||
	    cmd == IP_VS_SO_SET_STOPDAEMON) {
		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;

		if (cmd == IP_VS_SO_SET_STARTDAEMON) {
			struct ipvs_sync_daemon_cfg cfg;

			memset(&cfg, 0, sizeof(cfg));
			ret = -EINVAL;
			/* Reject an empty or over-long interface name */
			if (strscpy(cfg.mcast_ifn, dm->mcast_ifn,
				    sizeof(cfg.mcast_ifn)) <= 0)
				return ret;
			cfg.syncid = dm->syncid;
			ret = start_sync_thread(ipvs, &cfg, dm->state);
		} else {
			ret = stop_sync_thread(ipvs, dm->state);
		}
		return ret;
	}

	mutex_lock(&ipvs->service_mutex);
	if (cmd == IP_VS_SO_SET_FLUSH) {
		/* Flush the virtual service */
		ret = ip_vs_flush(ipvs, false);
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
		/* Set timeout values for (tcp tcpfin udp) */
		ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
		goto out_unlock;
	} else if (!len) {
		/* No more commands with len == 0 below */
		ret = -EINVAL;
		goto out_unlock;
	}

	usvc_compat = (struct ip_vs_service_user *)arg;
	/*
	 * The destination part, if any, directly follows the service part.
	 * NOTE(review): for commands whose argument is only a struct
	 * ip_vs_service_user this points at bytes of arg[] that were not
	 * filled by copy_from_sockptr(); udest is only consumed by the
	 * *DEST commands below - confirm no other path reads it.
	 */
	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);

	/* We only use the new structs internally, so copy userspace compat
	 * structs to extended internal versions */
	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
	ip_vs_copy_udest_compat(&udest, udest_compat);

	if (cmd == IP_VS_SO_SET_ZERO) {
		/* if no service address is set, zero counters in all */
		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
			ret = ip_vs_zero_all(ipvs);
			goto out_unlock;
		}
	}

	/* The scheduler name comes from userspace and must be NUL-terminated */
	if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
	    strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
	    IP_VS_SCHEDNAME_MAXLEN) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
	    usvc.protocol != IPPROTO_SCTP) {
		pr_err("set_ctl: invalid protocol: %d %pI4:%d\n",
		       usvc.protocol, &usvc.addr.ip,
		       ntohs(usvc.port));
		/* Note: reported to userspace as -EFAULT, not -EINVAL */
		ret = -EFAULT;
		goto out_unlock;
	}

	/* Lookup the exact service by <protocol, addr, port> or fwmark */
	rcu_read_lock();
	if (usvc.fwmark == 0)
		svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol,
					   &usvc.addr, usvc.port);
	else
		svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark);
	rcu_read_unlock();

	/* Every command except ADD needs an existing, protocol-matching service */
	if (cmd != IP_VS_SO_SET_ADD
	    && (svc == NULL || svc->protocol != usvc.protocol)) {
		ret = -ESRCH;
		goto out_unlock;
	}

	switch (cmd) {
	case IP_VS_SO_SET_ADD:
		if (svc != NULL)
			ret = -EEXIST;
		else
			ret = ip_vs_add_service(ipvs, &usvc, &svc);
		break;
	case IP_VS_SO_SET_EDIT:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IP_VS_SO_SET_DEL:
		ret = ip_vs_del_service(svc);
		/* Both outcomes end up at out_unlock */
		if (!ret)
			goto out_unlock;
		break;
	case IP_VS_SO_SET_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	case IP_VS_SO_SET_ADDDEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_EDITDEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_DELDEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	default:
		/* All valid commands were handled above */
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		break;
	}

  out_unlock:
	mutex_unlock(&ipvs->service_mutex);
	return ret;
}
sched->name : "none"; dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; dst->num_dests = src->num_dests; ip_vs_copy_stats(&kstats, &src->stats); ip_vs_export_stats_user(&dst->stats, &kstats); } static inline int __ip_vs_get_service_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { int idx, count=0; struct ip_vs_service *svc; struct ip_vs_service_entry entry; int ret = 0; for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ipvs->svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET) continue; if (count >= get->num_services) goto out; memset(&entry, 0, sizeof(entry)); ip_vs_copy_service(&entry, svc); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; goto out; } count++; } } out: return ret; } static inline int __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, struct ip_vs_get_dests __user *uptr) { struct ip_vs_service *svc; union nf_inet_addr addr = { .ip = get->addr }; int ret = 0; rcu_read_lock(); if (get->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); else svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, get->port); rcu_read_unlock(); if (svc) { int count = 0; struct ip_vs_dest *dest; struct ip_vs_dest_entry entry; struct ip_vs_kstats kstats; memset(&entry, 0, sizeof(entry)); list_for_each_entry(dest, &svc->destinations, n_list) { if (count >= get->num_dests) break; /* Cannot expose heterogeneous members via sockopt * interface */ if (dest->af != svc->af) continue; entry.addr = dest->addr.ip; entry.port = dest->port; entry.conn_flags = atomic_read(&dest->conn_flags); entry.weight = atomic_read(&dest->weight); entry.u_threshold = 
dest->u_threshold; entry.l_threshold = dest->l_threshold; entry.activeconns = atomic_read(&dest->activeconns); entry.inactconns = atomic_read(&dest->inactconns); entry.persistconns = atomic_read(&dest->persistconns); ip_vs_copy_stats(&kstats, &dest->stats); ip_vs_export_stats_user(&entry.stats, &kstats); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; break; } count++; } } else ret = -ESRCH; return ret; } static inline void __ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; #endif memset(u, 0, sizeof (*u)); #ifdef CONFIG_IP_VS_PROTO_TCP pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; #endif #ifdef CONFIG_IP_VS_PROTO_UDP pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); u->udp_timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; #endif } static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = { [CMDID(IP_VS_SO_GET_VERSION)] = 64, [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo), [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services), [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry), [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests), [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user), }; union ip_vs_get_arglen { char field_IP_VS_SO_GET_VERSION[64]; struct ip_vs_getinfo field_IP_VS_SO_GET_INFO; struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES; struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE; struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS; struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT; struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2]; }; #define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen) static int 
do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { unsigned char arg[MAX_GET_ARGLEN]; int ret = 0; unsigned int copylen; struct net *net = sock_net(sk); struct netns_ipvs *ipvs = net_ipvs(net); BUG_ON(!net); BUILD_BUG_ON(sizeof(arg) > 255); if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) return -EINVAL; copylen = get_arglen[CMDID(cmd)]; if (*len < (int) copylen) { IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen); return -EINVAL; } if (copy_from_user(arg, user, copylen) != 0) return -EFAULT; /* * Handle daemons first since it has its own locking */ if (cmd == IP_VS_SO_GET_DAEMON) { struct ip_vs_daemon_user d[2]; memset(&d, 0, sizeof(d)); mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, sizeof(d[0].mcast_ifn)); d[0].syncid = ipvs->mcfg.syncid; } if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, sizeof(d[1].mcast_ifn)); d[1].syncid = ipvs->bcfg.syncid; } if (copy_to_user(user, &d, sizeof(d)) != 0) ret = -EFAULT; mutex_unlock(&ipvs->sync_mutex); return ret; } mutex_lock(&ipvs->service_mutex); switch (cmd) { case IP_VS_SO_GET_VERSION: { char buf[64]; sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); if (copy_to_user(user, buf, strlen(buf)+1) != 0) { ret = -EFAULT; goto out; } *len = strlen(buf)+1; } break; case IP_VS_SO_GET_INFO: { struct ip_vs_getinfo info; info.version = IP_VS_VERSION_CODE; info.size = ip_vs_conn_tab_size; info.num_services = atomic_read(&ipvs->num_services[IP_VS_AF_INET]); if (copy_to_user(user, &info, sizeof(info)) != 0) ret = -EFAULT; } break; case IP_VS_SO_GET_SERVICES: { struct ip_vs_get_services *get; size_t size; get = (struct ip_vs_get_services *)arg; size = struct_size(get, entrytable, 
get->num_services); if (*len != size) { pr_err("length: %u != %zu\n", *len, size); ret = -EINVAL; goto out; } ret = __ip_vs_get_service_entries(ipvs, get, user); } break; case IP_VS_SO_GET_SERVICE: { struct ip_vs_service_entry *entry; struct ip_vs_service *svc; union nf_inet_addr addr; entry = (struct ip_vs_service_entry *)arg; addr.ip = entry->addr; rcu_read_lock(); if (entry->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); else svc = __ip_vs_service_find(ipvs, AF_INET, entry->protocol, &addr, entry->port); rcu_read_unlock(); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) ret = -EFAULT; } else ret = -ESRCH; } break; case IP_VS_SO_GET_DESTS: { struct ip_vs_get_dests *get; size_t size; get = (struct ip_vs_get_dests *)arg; size = struct_size(get, entrytable, get->num_dests); if (*len != size) { pr_err("length: %u != %zu\n", *len, size); ret = -EINVAL; goto out; } ret = __ip_vs_get_dest_entries(ipvs, get, user); } break; case IP_VS_SO_GET_TIMEOUT: { struct ip_vs_timeout_user t; __ip_vs_get_timeouts(ipvs, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; } break; default: ret = -EINVAL; } out: mutex_unlock(&ipvs->service_mutex); return ret; } static struct nf_sockopt_ops ip_vs_sockopts = { .pf = PF_INET, .set_optmin = IP_VS_BASE_CTL, .set_optmax = IP_VS_SO_SET_MAX+1, .set = do_ip_vs_set_ctl, .get_optmin = IP_VS_BASE_CTL, .get_optmax = IP_VS_SO_GET_MAX+1, .get = do_ip_vs_get_ctl, .owner = THIS_MODULE, }; /* * Generic Netlink interface */ /* IPVS genetlink family */ static struct genl_family ip_vs_genl_family; /* Policy used for first-level command attributes */ static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, 
/*
 * Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE.
 *
 * Passed to nla_parse_nested_deprecated() by ip_vs_genl_parse_service().
 * IPVS_SVC_ATTR_ADDR and IPVS_SVC_ATTR_FLAGS are binary blobs bounded by
 * the size of the structure they are copied into.
 * NOTE(review): the scheduler name limit is IP_VS_SCHEDNAME_MAXLEN - 1
 * while the PE name limit is IP_VS_PENAME_MAXLEN (no "- 1") - confirm the
 * difference is intentional.
 */
static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
					    .len = sizeof(union nf_inet_addr) },
	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
					    .len = IP_VS_SCHEDNAME_MAXLEN - 1 },
	[IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
					    .len = IP_VS_PENAME_MAXLEN },
	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
					    .len = sizeof(struct ip_vs_flags) },
	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
};
= { .type = NLA_U32 }, [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, struct ip_vs_kstats *kstats) { struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); if (!nl_stats) return -EMSGSIZE; if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, IPVS_STATS_ATTR_PAD) || nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) goto nla_put_failure; nla_nest_end(skb, nl_stats); return 0; nla_put_failure: nla_nest_cancel(skb, nl_stats); return -EMSGSIZE; } static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, struct ip_vs_kstats *kstats) { struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); if (!nl_stats) return -EMSGSIZE; if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, 
IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps, IPVS_STATS_ATTR_PAD)) goto nla_put_failure; nla_nest_end(skb, nl_stats); return 0; nla_put_failure: nla_nest_cancel(skb, nl_stats); return -EMSGSIZE; } static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_service *svc) { struct ip_vs_scheduler *sched; struct ip_vs_pe *pe; struct nlattr *nl_service; struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; struct ip_vs_kstats kstats; char *sched_name; nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE); if (!nl_service) return -EMSGSIZE; if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) goto nla_put_failure; if (svc->fwmark) { if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) goto nla_put_failure; } else { if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) goto nla_put_failure; } sched = rcu_dereference(svc->scheduler); sched_name = sched ? 
/*
 * Tell whether @af is an address family this build accepts for services:
 * AF_INET always; AF_INET6 only when CONFIG_IP_VS_IPV6 is set and
 * ipv6_mod_enabled() reports true.  Everything else is rejected.
 */
static bool ip_vs_is_af_valid(int af)
{
	switch (af) {
	case AF_INET:
		return true;
#ifdef CONFIG_IP_VS_IPV6
	case AF_INET6:
		return ipv6_mod_enabled();
#endif
	default:
		return false;
	}
}
full_entry, struct ip_vs_service **ret_svc) { struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; struct ip_vs_service *svc; /* Parse mandatory identifying service fields first */ if (nla == NULL || nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL)) return -EINVAL; nla_af = attrs[IPVS_SVC_ATTR_AF]; nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; nla_port = attrs[IPVS_SVC_ATTR_PORT]; nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) return -EINVAL; memset(usvc, 0, sizeof(*usvc)); usvc->af = nla_get_u16(nla_af); if (!ip_vs_is_af_valid(usvc->af)) return -EAFNOSUPPORT; if (nla_fwmark) { usvc->protocol = IPPROTO_TCP; usvc->fwmark = nla_get_u32(nla_fwmark); } else { usvc->protocol = nla_get_u16(nla_protocol); nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); usvc->port = nla_get_be16(nla_port); usvc->fwmark = 0; } if (usvc->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark); else svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol, &usvc->addr, usvc->port); *ret_svc = svc; /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, *nla_netmask; struct ip_vs_flags flags; nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) return -EINVAL; nla_memcpy(&flags, nla_flags, sizeof(flags)); /* prefill flags from service if it already exists */ if (svc) usvc->flags = svc->flags; /* set new flags from userland */ usvc->flags = (usvc->flags & ~flags.mask) | (flags.flags & flags.mask); usvc->sched_name = nla_data(nla_sched); usvc->pe_name = nla_pe ? 
nla_data(nla_pe) : NULL; usvc->timeout = nla_get_u32(nla_timeout); usvc->netmask = nla_get_be32(nla_netmask); } return 0; } static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs, struct nlattr *nla) { struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; int ret; ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc); return ret ? ERR_PTR(ret) : svc; } static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) { struct nlattr *nl_dest; struct ip_vs_kstats kstats; nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST); if (!nl_dest) return -EMSGSIZE; if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, (atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK)) || nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)) || nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE, dest->tun_type) || nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, dest->tun_port) || nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, dest->tun_flags) || nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, atomic_read(&dest->activeconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, atomic_read(&dest->inactconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, atomic_read(&dest->persistconns)) || nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) goto nla_put_failure; ip_vs_copy_stats(&kstats, &dest->stats); if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats)) goto nla_put_failure; if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_dest); return 0; nla_put_failure: nla_nest_cancel(skb, nl_dest); return -EMSGSIZE; } static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, struct netlink_callback *cb) { void *hdr; 
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DEST); if (!hdr) return -EMSGSIZE; if (ip_vs_genl_fill_dest(skb, dest) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0; int start = cb->args[0]; struct ip_vs_service *svc; struct ip_vs_dest *dest; struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); rcu_read_lock(); /* Try to find the service for which to dump destinations */ if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack)) goto out_err; svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR_OR_NULL(svc)) goto out_err; /* Dump the destinations */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { idx--; goto nla_put_failure; } } nla_put_failure: cb->args[0] = idx; out_err: rcu_read_unlock(); return skb->len; } static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, struct nlattr *nla, bool full_entry) { struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; struct nlattr *nla_addr, *nla_port; struct nlattr *nla_addr_family; /* Parse mandatory identifying destination fields first */ if (nla == NULL || nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL)) return -EINVAL; nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; nla_port = attrs[IPVS_DEST_ATTR_PORT]; nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; if (!(nla_addr && nla_port)) return -EINVAL; memset(udest, 0, sizeof(*udest)); nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); udest->af = nla_get_u16_default(nla_addr_family, 0); /* If a full entry was requested, check for 
the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, *nla_l_thresh, *nla_tun_type, *nla_tun_port, *nla_tun_flags; nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) return -EINVAL; udest->conn_flags = nla_get_u32(nla_fwd) & IP_VS_CONN_F_FWD_MASK; udest->weight = nla_get_u32(nla_weight); udest->u_threshold = nla_get_u32(nla_u_thresh); udest->l_threshold = nla_get_u32(nla_l_thresh); if (nla_tun_type) udest->tun_type = nla_get_u8(nla_tun_type); if (nla_tun_port) udest->tun_port = nla_get_be16(nla_tun_port); if (nla_tun_flags) udest->tun_flags = nla_get_u16(nla_tun_flags); } return 0; } static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, struct ipvs_sync_daemon_cfg *c) { struct nlattr *nl_daemon; nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON); if (!nl_daemon) return -EMSGSIZE; if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) goto nla_put_failure; #ifdef CONFIG_IP_VS_IPV6 if (c->mcast_af == AF_INET6) { if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, &c->mcast_group.in6)) goto nla_put_failure; } else #endif if (c->mcast_af == AF_INET && nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, c->mcast_group.ip)) goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; nla_put_failure: nla_nest_cancel(skb, nl_daemon); return -EMSGSIZE; } static int 
ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, struct ipvs_sync_daemon_cfg *c, struct netlink_callback *cb) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DAEMON); if (!hdr) return -EMSGSIZE; if (ip_vs_genl_fill_daemon(skb, state, c)) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ip_vs_genl_dump_daemons(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&ipvs->sync_mutex); if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, &ipvs->mcfg, cb) < 0) goto nla_put_failure; cb->args[0] = 1; } if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, &ipvs->bcfg, cb) < 0) goto nla_put_failure; cb->args[1] = 1; } nla_put_failure: mutex_unlock(&ipvs->sync_mutex); return skb->len; } static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { struct ipvs_sync_daemon_cfg c; struct nlattr *a; int ret; memset(&c, 0, sizeof(c)); if (!(attrs[IPVS_DAEMON_ATTR_STATE] && attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), sizeof(c.mcast_ifn)); c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; if (a) c.sync_maxlen = nla_get_u16(a); a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; if (a) { c.mcast_af = AF_INET; c.mcast_group.ip = nla_get_in_addr(a); if (!ipv4_is_multicast(c.mcast_group.ip)) return -EINVAL; } else { a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; if (a) { #ifdef CONFIG_IP_VS_IPV6 int addr_type; c.mcast_af = AF_INET6; c.mcast_group.in6 = nla_get_in6_addr(a); addr_type = ipv6_addr_type(&c.mcast_group.in6); if (!(addr_type & 
IPV6_ADDR_MULTICAST)) return -EINVAL; #else return -EAFNOSUPPORT; #endif } } a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; if (a) c.mcast_port = nla_get_u16(a); a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; if (a) c.mcast_ttl = nla_get_u8(a); /* The synchronization protocol is incompatible with mixed family * services */ if (ipvs->mixed_address_family_dests > 0) return -EINVAL; ret = start_sync_thread(ipvs, &c, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); return ret; } static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { int ret; if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; ret = stop_sync_thread(ipvs, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); return ret; } static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs) { struct ip_vs_timeout_user t; __ip_vs_get_timeouts(ipvs, &t); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) t.tcp_fin_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); return ip_vs_set_timeout(ipvs, &t); } static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) { int ret = -EINVAL, cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack)) goto out; if (cmd == IPVS_CMD_NEW_DAEMON) ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs); else ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs); } out: return ret; } static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) { bool need_full_svc = false, need_full_dest = false; struct 
ip_vs_service *svc = NULL; struct ip_vs_service_user_kern usvc; struct ip_vs_dest_user_kern udest; int ret = 0, cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; mutex_lock(&ipvs->service_mutex); if (cmd == IPVS_CMD_FLUSH) { ret = ip_vs_flush(ipvs, false); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { ret = ip_vs_genl_set_config(ipvs, info->attrs); goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { ret = ip_vs_zero_all(ipvs); goto out; } /* All following commands require a service argument, so check if we * received a valid one. We need a full service specification when * adding / editing a service. Only identifying members otherwise. */ if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) need_full_svc = true; /* We use function that requires RCU lock (hlist_bl) */ rcu_read_lock(); ret = ip_vs_genl_parse_service(ipvs, &usvc, info->attrs[IPVS_CMD_ATTR_SERVICE], need_full_svc, &svc); rcu_read_unlock(); if (ret) goto out; /* Unless we're adding a new service, the service must already exist */ if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { ret = -ESRCH; goto out; } /* Destination commands require a valid destination argument. For * adding / editing a destination, we need a full destination * specification. */ if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || cmd == IPVS_CMD_DEL_DEST) { if (cmd != IPVS_CMD_DEL_DEST) need_full_dest = true; ret = ip_vs_genl_parse_dest(&udest, info->attrs[IPVS_CMD_ATTR_DEST], need_full_dest); if (ret) goto out; /* Old protocols did not allow the user to specify address * family, so we set it to zero instead. We also didn't * allow heterogeneous pools in the old code, so it's safe * to assume that this will have the same address family as * the service. 
*/ if (udest.af == 0) udest.af = svc->af; if (!ip_vs_is_af_valid(udest.af)) { ret = -EAFNOSUPPORT; goto out; } if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { /* The synchronization protocol is incompatible * with mixed family services */ if (ipvs->sync_state) { ret = -EINVAL; goto out; } /* Which connection types do we support? */ switch (udest.conn_flags) { case IP_VS_CONN_F_TUNNEL: /* We are able to forward this */ break; default: ret = -EINVAL; goto out; } } } switch (cmd) { case IPVS_CMD_NEW_SERVICE: if (svc == NULL) ret = ip_vs_add_service(ipvs, &usvc, &svc); else ret = -EEXIST; break; case IPVS_CMD_SET_SERVICE: ret = ip_vs_edit_service(svc, &usvc); break; case IPVS_CMD_DEL_SERVICE: ret = ip_vs_del_service(svc); /* do not use svc, it can be freed */ break; case IPVS_CMD_NEW_DEST: ret = ip_vs_add_dest(svc, &udest); break; case IPVS_CMD_SET_DEST: ret = ip_vs_edit_dest(svc, &udest); break; case IPVS_CMD_DEL_DEST: ret = ip_vs_del_dest(svc, &udest); break; case IPVS_CMD_ZERO: ret = ip_vs_zero_service(svc); break; default: ret = -EINVAL; } out: mutex_unlock(&ipvs->service_mutex); return ret; } static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; void *reply; int ret, cmd, reply_cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_GET_SERVICE) reply_cmd = IPVS_CMD_NEW_SERVICE; else if (cmd == IPVS_CMD_GET_INFO) reply_cmd = IPVS_CMD_SET_INFO; else if (cmd == IPVS_CMD_GET_CONFIG) reply_cmd = IPVS_CMD_SET_CONFIG; else { pr_err("unknown Generic Netlink command\n"); return -EINVAL; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rcu_read_lock(); reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); if (reply == NULL) goto nla_put_failure; switch (cmd) { case IPVS_CMD_GET_SERVICE: { struct ip_vs_service *svc; svc = ip_vs_genl_find_service(ipvs, info->attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc)) { 
ret = PTR_ERR(svc); goto out_err; } else if (svc) { ret = ip_vs_genl_fill_service(msg, svc); if (ret) goto nla_put_failure; } else { ret = -ESRCH; goto out_err; } break; } case IPVS_CMD_GET_CONFIG: { struct ip_vs_timeout_user t; __ip_vs_get_timeouts(ipvs, &t); #ifdef CONFIG_IP_VS_PROTO_TCP if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout) || nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, t.tcp_fin_timeout)) goto nla_put_failure; #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) goto nla_put_failure; #endif break; } case IPVS_CMD_GET_INFO: if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE) || nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, ip_vs_conn_tab_size)) goto nla_put_failure; break; } genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); goto out; nla_put_failure: pr_err("not enough space in Netlink message\n"); ret = -EMSGSIZE; out_err: nlmsg_free(msg); out: rcu_read_unlock(); return ret; } static const struct genl_small_ops ip_vs_genl_ops[] = { { .cmd = IPVS_CMD_NEW_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_SET_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_DEL_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, .dumpit = ip_vs_genl_dump_services, }, { .cmd = IPVS_CMD_NEW_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_SET_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { 
.cmd = IPVS_CMD_DEL_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .dumpit = ip_vs_genl_dump_dests, }, { .cmd = IPVS_CMD_NEW_DAEMON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_DEL_DAEMON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_GET_DAEMON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .dumpit = ip_vs_genl_dump_daemons, }, { .cmd = IPVS_CMD_SET_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, }, { .cmd = IPVS_CMD_GET_INFO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, }, { .cmd = IPVS_CMD_ZERO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, }; static struct genl_family ip_vs_genl_family __ro_after_init = { .hdrsize = 0, .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, .maxattr = IPVS_CMD_ATTR_MAX, .policy = ip_vs_cmd_policy, .netnsok = true, /* Make ipvsadm to work on netns */ .module = THIS_MODULE, .small_ops = ip_vs_genl_ops, .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops), .resv_start_op = IPVS_CMD_FLUSH + 1, .parallel_ops = 1, }; static int __init ip_vs_genl_register(void) { return 
genl_register_family(&ip_vs_genl_family); } static void ip_vs_genl_unregister(void) { genl_unregister_family(&ip_vs_genl_family); } /* End of Generic Netlink interface definitions */ /* * per netns intit/exit func. */ #ifdef CONFIG_SYSCTL static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; struct ctl_table *tbl; int idx, ret; size_t ctl_table_size = ARRAY_SIZE(vs_vars); bool unpriv = net->user_ns != &init_user_ns; atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); spin_lock_init(&ipvs->droppacket_lock); spin_lock_init(&ipvs->securetcp_lock); INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, expire_nodest_conn_handler); ipvs->est_stopped = 0; if (!net_eq(net, &init_net)) { tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; } else tbl = vs_vars; /* Initialize sysctl defaults */ for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) { if (tbl[idx].proc_handler == proc_do_defense_mode) tbl[idx].extra2 = ipvs; } idx = 0; ipvs->sysctl_amemthresh = 1024; tbl[idx++].data = &ipvs->sysctl_amemthresh; ipvs->sysctl_am_droprate = 10; tbl[idx++].data = &ipvs->sysctl_am_droprate; tbl[idx++].data = &ipvs->sysctl_drop_entry; tbl[idx++].data = &ipvs->sysctl_drop_packet; #ifdef CONFIG_IP_VS_NFCT tbl[idx++].data = &ipvs->sysctl_conntrack; #endif tbl[idx++].data = &ipvs->sysctl_secure_tcp; ipvs->sysctl_snat_reroute = 1; tbl[idx++].data = &ipvs->sysctl_snat_reroute; ipvs->sysctl_sync_ver = 1; tbl[idx++].data = &ipvs->sysctl_sync_ver; ipvs->sysctl_sync_ports = 1; tbl[idx++].data = &ipvs->sysctl_sync_ports; tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; if (unpriv) tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; ipvs->sysctl_sync_sock_size = 0; if (unpriv) tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_sock_size; tbl[idx++].data = 
&ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; tbl[idx++].data = &ipvs->sysctl_sloppy_sctp; tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; tbl[idx].extra2 = ipvs; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); tbl[idx++].data = &ipvs->sysctl_sync_retries; tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; ipvs->sysctl_pmtu_disc = 1; tbl[idx++].data = &ipvs->sysctl_pmtu_disc; tbl[idx++].data = &ipvs->sysctl_backup_only; ipvs->sysctl_conn_reuse_mode = 1; tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_run_estimation = 1; if (unpriv) tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_run_estimation; ipvs->est_cpulist_valid = 0; if (unpriv) tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_cpulist; ipvs->sysctl_est_nice = IPVS_EST_NICE; if (unpriv) tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_nice; #ifdef CONFIG_IP_VS_DEBUG /* Global sysctls must be ro in non-init netns */ if (!net_eq(net, &init_net)) tbl[idx++].mode = 0444; #endif ret = -ENOMEM; ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl, ctl_table_size); if (!ipvs->sysctl_hdr) goto err; ipvs->sysctl_tbl = tbl; ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); if (ret < 0) goto err; /* Schedule defense work */ queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); return 0; err: unregister_net_sysctl_table(ipvs->sysctl_hdr); if 
(!net_eq(net, &init_net)) kfree(tbl); return ret; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work); cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); if (ipvs->est_cpulist_valid) free_cpumask_var(ipvs->sysctl_est_cpulist); if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); } #else static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { } #endif static struct notifier_block ip_vs_dst_notifier = { .notifier_call = ip_vs_dst_event, #ifdef CONFIG_IP_VS_IPV6 .priority = ADDRCONF_NOTIFY_PRIORITY + 5, #endif }; int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { int ret = -ENOMEM; int idx; /* Initialize service_mutex, svc_table per netns */ __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) INIT_HLIST_HEAD(&ipvs->svc_table[idx]); /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) INIT_HLIST_HEAD(&ipvs->rs_table[idx]); INIT_LIST_HEAD(&ipvs->dest_trash); spin_lock_init(&ipvs->dest_trash_lock); timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0); for (idx = 0; idx < IP_VS_AF_MAX; idx++) { atomic_set(&ipvs->num_services[idx], 0); atomic_set(&ipvs->fwm_services[idx], 0); atomic_set(&ipvs->nonfwm_services[idx], 0); atomic_set(&ipvs->ftpsvc_counter[idx], 0); atomic_set(&ipvs->nullsvc_counter[idx], 0); atomic_set(&ipvs->conn_out_counter[idx], 0); } INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); /* procfs stats */ ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats); if (!ipvs->tot_stats) goto out; if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) goto err_tot_stats; 
#ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter))) goto err_vs; if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, ip_vs_stats_show, NULL)) goto err_stats; if (!proc_create_net_single("ip_vs_stats_percpu", 0, ipvs->net->proc_net, ip_vs_stats_percpu_show, NULL)) goto err_percpu; #endif ret = ip_vs_control_net_init_sysctl(ipvs); if (ret < 0) goto err; return 0; err: #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); err_percpu: remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); err_stats: remove_proc_entry("ip_vs", ipvs->net->proc_net); err_vs: #endif ip_vs_stats_release(&ipvs->tot_stats->s); err_tot_stats: kfree(ipvs->tot_stats); out: return ret; } void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); cancel_delayed_work_sync(&ipvs->est_reload_work); #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); #endif call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free); } int __init ip_vs_register_nl_ioctl(void) { int ret; ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); goto err_sock; } ret = ip_vs_genl_register(); if (ret) { pr_err("cannot register Generic Netlink interface.\n"); goto err_genl; } return 0; err_genl: nf_unregister_sockopt(&ip_vs_sockopts); err_sock: return ret; } void ip_vs_unregister_nl_ioctl(void) { ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); } int __init ip_vs_control_init(void) { int ret; ret = register_netdevice_notifier(&ip_vs_dst_notifier); if (ret < 0) return ret; return 0; } void ip_vs_control_cleanup(void) { unregister_netdevice_notifier(&ip_vs_dst_notifier); /* relying on common rcu_barrier() in ip_vs_cleanup() */ }
2002 3 1999 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
/*
 * NOTE(review): the numeric line above looks like report residue (hit
 * counters followed by a 1-41 line gutter) rather than part of the header;
 * it is kept verbatim - confirm before building from this text.
 */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_PGTABLE_INVERT_H
#define _ASM_PGTABLE_INVERT_H 1

#ifndef __ASSEMBLER__

/*
 * A clear pte value is special, and doesn't get inverted.
 *
 * Note that even users that only pass a pgprot_t (rather
 * than a full pte) won't trigger the special zero case,
 * because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED
 * set. So the all zero case really is limited to just the
 * cleared page table entry case.
 *
 * @val: raw 64-bit entry value to test.
 *
 * Returns true when @val is non-zero and does not have _PAGE_PRESENT
 * set, false otherwise (including the all-zero value).
 */
static inline bool __pte_needs_invert(u64 val)
{
	return val && !(val & _PAGE_PRESENT);
}

/*
 * Get a mask to xor with the page table entry to get the correct pfn.
 *
 * @val: raw 64-bit entry value.
 *
 * Returns all ones when __pte_needs_invert(@val) is true, 0 otherwise,
 * so xor-ing with the result either complements every bit or is a no-op.
 */
static inline u64 protnone_mask(u64 val)
{
	return __pte_needs_invert(val) ? ~0ull : 0;
}

/*
 * Complement the @mask bits of @val when the inversion state changes.
 *
 * @oldval: entry value before the change.
 * @val:    entry value after the change.
 * @mask:   bits of @val that are subject to inversion.
 *
 * Returns @val unchanged when __pte_needs_invert() gives the same answer
 * for @oldval and @val; otherwise returns @val with exactly the bits
 * selected by @mask complemented and all other bits preserved.
 */
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
{
	/*
	 * When a PTE transitions from NONE to !NONE or vice-versa
	 * invert the PFN part to stop speculation.
	 * pte_pfn undoes this when needed.
	 */
	if (__pte_needs_invert(oldval) != __pte_needs_invert(val))
		/* Keep the bits outside @mask, complement the bits inside it. */
		val = (val & ~mask) | (~val & mask);
	return val;
}

#endif /* __ASSEMBLER__ */

#endif
25 2 17 13 25 4 24 25 12 12 13 4 13 12 13 25 12 25 12 25 25 12 12 25 25 14 12 12 25 9 9 9 9 9 9 9 9 9 9 24 23 25 9 24 17 5 12 4 12 16 17 5 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 
492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 
992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 
1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 
1793 1794 1795 1796 1797 // SPDX-License-Identifier: GPL-2.0-or-later /* Keyring handling * * Copyright (C) 2004-2005, 2008, 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/seq_file.h> #include <linux/err.h> #include <linux/user_namespace.h> #include <linux/nsproxy.h> #include <keys/keyring-type.h> #include <keys/user-type.h> #include <linux/assoc_array_priv.h> #include <linux/uaccess.h> #include <net/net_namespace.h> #include "internal.h" /* * When plumbing the depths of the key tree, this sets a hard limit * set on how deep we're willing to go. */ #define KEYRING_SEARCH_MAX_DEPTH 6 /* * We mark pointers we pass to the associative array with bit 1 set if * they're keyrings and clear otherwise. */ #define KEYRING_PTR_SUBTYPE 0x2UL static inline bool keyring_ptr_is_keyring(const struct assoc_array_ptr *x) { return (unsigned long)x & KEYRING_PTR_SUBTYPE; } static inline struct key *keyring_ptr_to_key(const struct assoc_array_ptr *x) { void *object = assoc_array_ptr_to_leaf(x); return (struct key *)((unsigned long)object & ~KEYRING_PTR_SUBTYPE); } static inline void *keyring_key_to_ptr(struct key *key) { if (key->type == &key_type_keyring) return (void *)((unsigned long)key | KEYRING_PTR_SUBTYPE); return key; } static DEFINE_RWLOCK(keyring_name_lock); /* * Clean up the bits of user_namespace that belong to us. */ void key_free_user_ns(struct user_namespace *ns) { write_lock(&keyring_name_lock); list_del_init(&ns->keyring_name_list); write_unlock(&keyring_name_lock); key_put(ns->user_keyring_register); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif } /* * The keyring key type definition. Keyrings are simply keys of this type and * can be treated as ordinary keys in addition to having their own special * operations. 
*/ static int keyring_preparse(struct key_preparsed_payload *prep); static void keyring_free_preparse(struct key_preparsed_payload *prep); static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep); static void keyring_revoke(struct key *keyring); static void keyring_destroy(struct key *keyring); static void keyring_describe(const struct key *keyring, struct seq_file *m); static long keyring_read(const struct key *keyring, char *buffer, size_t buflen); struct key_type key_type_keyring = { .name = "keyring", .def_datalen = 0, .preparse = keyring_preparse, .free_preparse = keyring_free_preparse, .instantiate = keyring_instantiate, .revoke = keyring_revoke, .destroy = keyring_destroy, .describe = keyring_describe, .read = keyring_read, }; EXPORT_SYMBOL(key_type_keyring); /* * Semaphore to serialise link/link calls to prevent two link calls in parallel * introducing a cycle. */ static DEFINE_MUTEX(keyring_serialise_link_lock); /* * Publish the name of a keyring so that it can be found by name (if it has * one and it doesn't begin with a dot). */ static void keyring_publish_name(struct key *keyring) { struct user_namespace *ns = current_user_ns(); if (keyring->description && keyring->description[0] && keyring->description[0] != '.') { write_lock(&keyring_name_lock); list_add_tail(&keyring->name_link, &ns->keyring_name_list); write_unlock(&keyring_name_lock); } } /* * Preparse a keyring payload */ static int keyring_preparse(struct key_preparsed_payload *prep) { return prep->datalen != 0 ? -EINVAL : 0; } /* * Free a preparse of a user defined key payload */ static void keyring_free_preparse(struct key_preparsed_payload *prep) { } /* * Initialise a keyring. * * Returns 0 on success, -EINVAL if given any data. 
*/
static int keyring_instantiate(struct key *keyring,
			       struct key_preparsed_payload *prep)
{
	/* Set up the associative array that indexes this keyring's links. */
	assoc_array_init(&keyring->keys);
	/* make the keyring available by name if it has one */
	keyring_publish_name(keyring);
	return 0;
}

/*
 * Multiply 64-bits by 32-bits to 96-bits and fold back to 64-bit.  Ideally we'd
 * fold the carry back too, but that requires inline asm.
 *
 * The product is formed from two 32x32->64 partial products; the bits of the
 * high partial product that overflow bit 63 are added back into the low end.
 */
static u64 mult_64x32_and_fold(u64 x, u32 y)
{
	u64 hi = (u64)(u32)(x >> 32) * y;
	u64 lo = (u64)(u32)(x) * y;

	return lo + ((u64)(u32)hi << 32) + (u32)(hi >> 32);
}

/*
 * Hash a key type and description.
 *
 * Mixes the type pointer, the description length, the domain tag pointer and
 * the description bytes into index_key->hash, then adjusts the lowest segment
 * of the hash so that keyrings and non-keyrings land in different parts of the
 * index tree.
 */
static void hash_key_type_and_desc(struct keyring_index_key *index_key)
{
	const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP;
	const unsigned long fan_mask = ASSOC_ARRAY_FAN_MASK;
	const char *description = index_key->description;
	unsigned long hash, type;
	u32 piece;
	u64 acc;
	int n, desc_len = index_key->desc_len;

	/* Seed the accumulator from the type pointer and description length. */
	type = (unsigned long)index_key->type;
	acc = mult_64x32_and_fold(type, desc_len + 13);
	acc = mult_64x32_and_fold(acc, 9207);
	/* Only the low 32 bits of the domain tag pointer fit in piece. */
	piece = (unsigned long)index_key->domain_tag;
	acc = mult_64x32_and_fold(acc, piece);
	acc = mult_64x32_and_fold(acc, 9207);

	/* Mix in the description up to four bytes at a time; a short final
	 * piece is zero-padded because piece is cleared before the copy.
	 */
	for (;;) {
		n = desc_len;
		if (n <= 0)
			break;
		if (n > 4)
			n = 4;
		piece = 0;
		memcpy(&piece, description, n);
		description += n;
		desc_len -= n;
		acc = mult_64x32_and_fold(acc, piece);
		acc = mult_64x32_and_fold(acc, 9207);
	}

	/* Fold the hash down to 32 bits if need be. */
	hash = acc;
	if (ASSOC_ARRAY_KEY_CHUNK_SIZE == 32)
		hash ^= acc >> 32;

	/* Squidge all the keyrings into a separate part of the tree to
	 * ordinary keys by making sure the lowest level segment in the hash is
	 * zero for keyrings and non-zero otherwise.
	 */
	if (index_key->type != &key_type_keyring && (hash & fan_mask) == 0)
		hash |= (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1;
	else if (index_key->type == &key_type_keyring && (hash & fan_mask) != 0)
		hash = (hash + (hash << level_shift)) & ~fan_mask;
	index_key->hash = hash;
}

/*
 * Finalise an index key to include a part of the description actually in the
 * index key, to set the domain tag and to calculate the hash.
 */
void key_set_index_key(struct keyring_index_key *index_key)
{
	static struct key_tag default_domain_tag = { .usage = REFCOUNT_INIT(1), };
	/* Copy no more of the description than the inline desc[] field holds. */
	size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc));

	memcpy(index_key->desc, index_key->description, n);

	/* If the caller supplied no domain tag, pick the network namespace's
	 * tag for KEY_TYPE_NET_DOMAIN types and the shared default otherwise.
	 */
	if (!index_key->domain_tag) {
		if (index_key->type->flags & KEY_TYPE_NET_DOMAIN)
			index_key->domain_tag = current->nsproxy->net_ns->key_domain;
		else
			index_key->domain_tag = &default_domain_tag;
	}

	hash_key_type_and_desc(index_key);
}

/**
 * key_put_tag - Release a ref on a tag.
 * @tag: The tag to release.
 *
 * This releases a reference on the given tag and returns true if that ref was
 * the last one.  When the last reference goes, the tag is freed via
 * kfree_rcu().
 */
bool key_put_tag(struct key_tag *tag)
{
	if (refcount_dec_and_test(&tag->usage)) {
		kfree_rcu(tag, rcu);
		return true;
	}

	return false;
}

/**
 * key_remove_domain - Kill off a key domain and gc its keys
 * @domain_tag: The domain tag to release.
 *
 * This marks a domain tag as being dead and releases a ref on it.  If that
 * wasn't the last reference, the garbage collector is poked to try and delete
 * all keys that were in the domain.
 */
void key_remove_domain(struct key_tag *domain_tag)
{
	domain_tag->removed = true;
	if (!key_put_tag(domain_tag))
		key_schedule_gc_links();
}

/*
 * Build the next index key chunk.
 *
 * We return it one word-sized chunk at a time.
*/
static unsigned long keyring_get_key_chunk(const void *data, int level)
{
	const struct keyring_index_key *index_key = data;
	unsigned long chunk = 0;
	const u8 *d;
	int desc_len = index_key->desc_len, n = sizeof(chunk);

	/* Convert the incoming level into a chunk number. */
	level /= ASSOC_ARRAY_KEY_CHUNK_SIZE;
	switch (level) {
	case 0:
		return index_key->hash;
	case 1:
		return index_key->x;
	case 2:
		return (unsigned long)index_key->type;
	case 3:
		return (unsigned long)index_key->domain_tag;
	default:
		/* Remaining chunks come from the part of the description that
		 * lies beyond the inline desc[] field.
		 */
		level -= 4;
		if (desc_len <= sizeof(index_key->desc))
			return 0;

		d = index_key->description + sizeof(index_key->desc);
		d += level * sizeof(long);
		/* NOTE(review): desc_len is reduced by the inline part only,
		 * not by the level offset just applied to d, so for level > 0
		 * the loop below may read up to n bytes even when fewer remain
		 * in the description - confirm callers never ask that deep.
		 */
		desc_len -= sizeof(index_key->desc);
		if (desc_len > n)
			desc_len = n;
		/* Pack the bytes into the chunk, first byte most significant. */
		do {
			chunk <<= 8;
			chunk |= *d++;
		} while (--desc_len > 0);
		return chunk;
	}
}

/*
 * Get a key chunk for an object stored in the array: recover the key from the
 * stored pointer and use its index key.
 */
static unsigned long keyring_get_object_key_chunk(const void *object, int level)
{
	const struct key *key = keyring_ptr_to_key(object);

	return keyring_get_key_chunk(&key->index_key, level);
}

/*
 * Compare a stored object against an index key.  They match when type, domain
 * tag, description length and the full description bytes are all equal.
 */
static bool keyring_compare_object(const void *object, const void *data)
{
	const struct keyring_index_key *index_key = data;
	const struct key *key = keyring_ptr_to_key(object);

	return key->index_key.type == index_key->type &&
		key->index_key.domain_tag == index_key->domain_tag &&
		key->index_key.desc_len == index_key->desc_len &&
		memcmp(key->index_key.description, index_key->description,
		       index_key->desc_len) == 0;
}

/*
 * Compare the index keys of a pair of objects and determine the bit position
 * at which they differ - if they differ.
 *
 * Returns the bit position of the first difference, or -1 if the index keys
 * are the same.  'level' is tracked in bytes and converted to bits at the end.
 */
static int keyring_diff_objects(const void *object, const void *data)
{
	const struct key *key_a = keyring_ptr_to_key(object);
	const struct keyring_index_key *a = &key_a->index_key;
	const struct keyring_index_key *b = data;
	unsigned long seg_a, seg_b;
	int level, i;

	level = 0;
	seg_a = a->hash;
	seg_b = b->hash;
	if ((seg_a ^ seg_b) != 0)
		goto differ;
	level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8;

	/* The number of bits contributed by the hash is controlled by a
	 * constant in the assoc_array headers.  Everything else thereafter we
	 * can deal with as being machine word-size dependent.
	 */
	seg_a = a->x;
	seg_b = b->x;
	if ((seg_a ^ seg_b) != 0)
		goto differ;
	level += sizeof(unsigned long);

	/* The next bit may not work on big endian */
	seg_a = (unsigned long)a->type;
	seg_b = (unsigned long)b->type;
	if ((seg_a ^ seg_b) != 0)
		goto differ;
	level += sizeof(unsigned long);

	seg_a = (unsigned long)a->domain_tag;
	seg_b = (unsigned long)b->domain_tag;
	if ((seg_a ^ seg_b) != 0)
		goto differ;
	level += sizeof(unsigned long);

	/* Compare, byte by byte, the description beyond the inline desc[]
	 * part.  Only a->desc_len bounds the loop; x has already been compared
	 * equal by this point.
	 */
	i = sizeof(a->desc);
	if (a->desc_len <= i)
		goto same;

	for (; i < a->desc_len; i++) {
		seg_a = *(unsigned char *)(a->description + i);
		seg_b = *(unsigned char *)(b->description + i);
		if ((seg_a ^ seg_b) != 0)
			goto differ_plus_i;
	}

same:
	return -1;

differ_plus_i:
	level += i;
differ:
	/* Byte offset to bit offset, plus the lowest differing bit. */
	i = level * 8 + __ffs(seg_a ^ seg_b);
	return i;
}

/*
 * Free an object after stripping the keyring flag off of the pointer.
 */
static void keyring_free_object(void *object)
{
	key_put(keyring_ptr_to_key(object));
}

/*
 * Operations for keyring management by the index-tree routines.
 */
static const struct assoc_array_ops keyring_assoc_array_ops = {
	.get_key_chunk		= keyring_get_key_chunk,
	.get_object_key_chunk	= keyring_get_object_key_chunk,
	.compare_object		= keyring_compare_object,
	.diff_objects		= keyring_diff_objects,
	.free_object		= keyring_free_object,
};

/*
 * Clean up a keyring when it is destroyed.  Unpublish its name if it had one
 * and dispose of its data.
 *
 * The garbage collector detects the final key_put(), removes the keyring from
 * the serial number tree and then does RCU synchronisation before coming here,
 * so we shouldn't need to worry about code poking around here with the RCU
 * readlock held by this time.
*/ static void keyring_destroy(struct key *keyring) { if (keyring->description) { write_lock(&keyring_name_lock); if (keyring->name_link.next != NULL && !list_empty(&keyring->name_link)) list_del(&keyring->name_link); write_unlock(&keyring_name_lock); } if (keyring->restrict_link) { struct key_restriction *keyres = keyring->restrict_link; key_put(keyres->key); kfree(keyres); } assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops); } /* * Describe a keyring for /proc. */ static void keyring_describe(const struct key *keyring, struct seq_file *m) { if (keyring->description) seq_puts(m, keyring->description); else seq_puts(m, "[anon]"); if (key_is_positive(keyring)) { if (keyring->keys.nr_leaves_on_tree != 0) seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree); else seq_puts(m, ": empty"); } } struct keyring_read_iterator_context { size_t buflen; size_t count; key_serial_t *buffer; }; static int keyring_read_iterator(const void *object, void *data) { struct keyring_read_iterator_context *ctx = data; const struct key *key = keyring_ptr_to_key(object); kenter("{%s,%d},,{%zu/%zu}", key->type->name, key->serial, ctx->count, ctx->buflen); if (ctx->count >= ctx->buflen) return 1; *ctx->buffer++ = key->serial; ctx->count += sizeof(key->serial); return 0; } /* * Read a list of key IDs from the keyring's contents in binary form * * The keyring's semaphore is read-locked by the caller. This prevents someone * from modifying it under us - which could cause us to read key IDs multiple * times. 
*/ static long keyring_read(const struct key *keyring, char *buffer, size_t buflen) { struct keyring_read_iterator_context ctx; long ret; kenter("{%d},,%zu", key_serial(keyring), buflen); if (buflen & (sizeof(key_serial_t) - 1)) return -EINVAL; /* Copy as many key IDs as fit into the buffer */ if (buffer && buflen) { ctx.buffer = (key_serial_t *)buffer; ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { kleave(" = %ld [iterate]", ret); return ret; } } /* Return the size of the buffer needed */ ret = keyring->keys.nr_leaves_on_tree * sizeof(key_serial_t); if (ret <= buflen) kleave("= %ld [ok]", ret); else kleave("= %ld [buffer too small]", ret); return ret; } /* * Allocate a keyring and link into the destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, uid, gid, cred, perm, flags, restrict_link); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { key_put(keyring); keyring = ERR_PTR(ret); } } return keyring; } EXPORT_SYMBOL(keyring_alloc); /** * restrict_link_reject - Give -EPERM to restrict link * @keyring: The keyring being added to. * @type: The type of key being added. * @payload: The payload of the key intended to be added. * @restriction_key: Keys providing additional data for evaluating restriction. * * Reject the addition of any links to a keyring. It can be overridden by * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when * adding a key to a keyring. * * This is meant to be stored in a key_restriction structure which is passed * in the restrict_link parameter to keyring_alloc(). 
*/ int restrict_link_reject(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return -EPERM; } /* * By default, we keys found by getting an exact match on their descriptions. */ bool key_default_cmp(const struct key *key, const struct key_match_data *match_data) { return strcmp(key->description, match_data->raw_data) == 0; } /* * Iteration function to consider each key found. */ static int keyring_search_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); unsigned long kflags = READ_ONCE(key->flags); short state = READ_ONCE(key->state); kenter("{%d}", key->serial); /* ignore keys not of this type */ if (key->type != ctx->index_key.type) { kleave(" = 0 [!type]"); return 0; } /* skip invalidated, revoked and expired keys */ if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { time64_t expiry = READ_ONCE(key->expiry); if (kflags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { ctx->result = ERR_PTR(-EKEYREVOKED); kleave(" = %d [invrev]", ctx->skipped_ret); goto skipped; } if (expiry && ctx->now >= expiry) { if (!(ctx->flags & KEYRING_SEARCH_SKIP_EXPIRED)) ctx->result = ERR_PTR(-EKEYEXPIRED); kleave(" = %d [expire]", ctx->skipped_ret); goto skipped; } } /* keys that don't match */ if (!ctx->match_data.cmp(key, &ctx->match_data)) { kleave(" = 0 [!match]"); return 0; } /* key must have search permissions */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) { ctx->result = ERR_PTR(-EACCES); kleave(" = %d [!perm]", ctx->skipped_ret); goto skipped; } if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { /* we set a different error code if we pass a negative key */ if (state < 0) { ctx->result = ERR_PTR(state); kleave(" = %d [neg]", ctx->skipped_ret); goto skipped; } } /* Found */ ctx->result = make_key_ref(key, 
ctx->possessed); kleave(" = 1 [found]"); return 1; skipped: return ctx->skipped_ret; } /* * Search inside a keyring for a key. We can search by walking to it * directly based on its index-key or we can iterate over the entire * tree looking for it, based on the match function. */ static int search_keyring(struct key *keyring, struct keyring_search_context *ctx) { if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_DIRECT) { const void *object; object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, &ctx->index_key); return object ? ctx->iterator(object, ctx) : 0; } return assoc_array_iterate(&keyring->keys, ctx->iterator, ctx); } /* * Search a tree of keyrings that point to other keyrings up to the maximum * depth. */ static bool search_nested_keyrings(struct key *keyring, struct keyring_search_context *ctx) { struct { struct key *keyring; struct assoc_array_node *node; int slot; } stack[KEYRING_SEARCH_MAX_DEPTH]; struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *ptr; struct key *key; int sp = 0, slot; kenter("{%d},{%s,%s}", keyring->serial, ctx->index_key.type->name, ctx->index_key.description); #define STATE_CHECKS (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_DO_STATE_CHECK) BUG_ON((ctx->flags & STATE_CHECKS) == 0 || (ctx->flags & STATE_CHECKS) == STATE_CHECKS); if (ctx->index_key.description) key_set_index_key(&ctx->index_key); /* Check to see if this top-level keyring is what we are looking for * and whether it is valid or not. 
*/ if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_ITERATE || keyring_compare_object(keyring, &ctx->index_key)) { ctx->skipped_ret = 2; switch (ctx->iterator(keyring_key_to_ptr(keyring), ctx)) { case 1: goto found; case 2: return false; default: break; } } ctx->skipped_ret = 0; /* Start processing a new keyring */ descend_to_keyring: kdebug("descend to %d", keyring->serial); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto not_this_keyring; /* Search through the keys in this keyring before its searching its * subtrees. */ if (search_keyring(keyring, ctx)) goto found; /* Then manually iterate through the keyrings nested in this one. * * Start from the root node of the index tree. Because of the way the * hash function has been set up, keyrings cluster on the leftmost * branch of the root node (root slot 0) or in the root node itself. * Non-keyrings avoid the leftmost branch of the root entirely (root * slots 1-15). */ if (!(ctx->flags & KEYRING_SEARCH_RECURSE)) goto not_this_keyring; ptr = READ_ONCE(keyring->keys.root); if (!ptr) goto not_this_keyring; if (assoc_array_ptr_is_shortcut(ptr)) { /* If the root is a shortcut, either the keyring only contains * keyring pointers (everything clusters behind root slot 0) or * doesn't contain any keyring pointers. */ shortcut = assoc_array_ptr_to_shortcut(ptr); if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0) goto not_this_keyring; ptr = READ_ONCE(shortcut->next_node); node = assoc_array_ptr_to_node(ptr); goto begin_node; } node = assoc_array_ptr_to_node(ptr); ptr = node->slots[0]; if (!assoc_array_ptr_is_meta(ptr)) goto begin_node; descend_to_node: /* Descend to a more distal node in this keyring's content tree and go * through that. 
*/ kdebug("descend"); if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->next_node); BUG_ON(!assoc_array_ptr_is_node(ptr)); } node = assoc_array_ptr_to_node(ptr); begin_node: kdebug("begin_node"); slot = 0; ascend_to_node: /* Go through the slots in a node */ for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); if (assoc_array_ptr_is_meta(ptr)) { if (node->back_pointer || assoc_array_ptr_is_shortcut(ptr)) goto descend_to_node; } if (!keyring_ptr_is_keyring(ptr)) continue; key = keyring_ptr_to_key(ptr); if (sp >= KEYRING_SEARCH_MAX_DEPTH) { if (ctx->flags & KEYRING_SEARCH_DETECT_TOO_DEEP) { ctx->result = ERR_PTR(-ELOOP); return false; } goto not_this_keyring; } /* Search a nested keyring */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) continue; /* stack the current position */ stack[sp].keyring = keyring; stack[sp].node = node; stack[sp].slot = slot; sp++; /* begin again with the new keyring */ keyring = key; goto descend_to_keyring; } /* We've dealt with all the slots in the current node, so now we need * to ascend to the parent and continue processing there. */ ptr = READ_ONCE(node->back_pointer); slot = node->parent_slot; if (ptr && assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->back_pointer); slot = shortcut->parent_slot; } if (!ptr) goto not_this_keyring; node = assoc_array_ptr_to_node(ptr); slot++; /* If we've ascended to the root (zero backpointer), we must have just * finished processing the leftmost branch rather than the root slots - * so there can't be any more keyrings for us to find. */ if (node->back_pointer) { kdebug("ascend %d", slot); goto ascend_to_node; } /* The keyring we're looking at was disqualified or didn't contain a * matching key. 
*/ not_this_keyring: kdebug("not_this_keyring %d", sp); if (sp <= 0) { kleave(" = false"); return false; } /* Resume the processing of a keyring higher up in the tree */ sp--; keyring = stack[sp].keyring; node = stack[sp].node; slot = stack[sp].slot + 1; kdebug("ascend to %d [%d]", keyring->serial, slot); goto ascend_to_node; /* We found a viable match */ found: key = key_ref_to_ptr(ctx->result); key_check(key); if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) { key->last_used_at = ctx->now; keyring->last_used_at = ctx->now; while (sp > 0) stack[--sp].keyring->last_used_at = ctx->now; } kleave(" = true"); return true; } /** * keyring_search_rcu - Search a keyring tree for a matching key under RCU * @keyring_ref: A pointer to the keyring with possession indicator. * @ctx: The keyring search context. * * Search the supplied keyring tree for a key that matches the criteria given. * The root keyring and any linked keyrings must grant Search permission to the * caller to be searchable and keys can only be found if they too grant Search * to the caller. The possession flag on the root keyring pointer controls use * of the possessor bits in permissions checking of the entire tree. In * addition, the LSM gets to forbid keyring searches and key matches. * * The search is performed as a breadth-then-depth search up to the prescribed * limit (KEYRING_SEARCH_MAX_DEPTH). The caller must hold the RCU read lock to * prevent keyrings from being destroyed or rearranged whilst they are being * searched. * * Keys are matched to the type provided and are then filtered by the match * function, which is given the description to use in any way it sees fit. The * match function may use any attributes of a key that it wishes to * determine the match. Normally the match function from the key type would be * used. * * RCU can be used to prevent the keyring key lists from disappearing without * the need to take lots of locks. 
* * Returns a pointer to the found key and increments the key usage count if * successful; -EAGAIN if no matching keys were found, or if expired or revoked * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the * specified keyring wasn't a keyring. * * In the case of a successful return, the possession attribute from * @keyring_ref is propagated to the returned key reference. */ key_ref_t keyring_search_rcu(key_ref_t keyring_ref, struct keyring_search_context *ctx) { struct key *keyring; long err; ctx->iterator = keyring_search_iterator; ctx->possessed = is_key_possessed(keyring_ref); ctx->result = ERR_PTR(-EAGAIN); keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return ERR_PTR(-ENOTDIR); if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM)) { err = key_task_permission(keyring_ref, ctx->cred, KEY_NEED_SEARCH); if (err < 0) return ERR_PTR(err); } ctx->now = ktime_get_real_seconds(); if (search_nested_keyrings(keyring, ctx)) __key_get(key_ref_to_ptr(ctx->result)); return ctx->result; } /** * keyring_search - Search the supplied keyring tree for a matching key * @keyring: The root of the keyring tree to be searched. * @type: The type of keyring we want to find. * @description: The name of the keyring we want to find. * @recurse: True to search the children of @keyring also * * As keyring_search_rcu() above, but using the current task's credentials and * type's default matching function and preferred search method. 
*/ key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description, bool recurse) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; key_ref_t key; int ret; if (recurse) ctx.flags |= KEYRING_SEARCH_RECURSE; if (type->match_preparse) { ret = type->match_preparse(&ctx.match_data); if (ret < 0) return ERR_PTR(ret); } rcu_read_lock(); key = keyring_search_rcu(keyring, &ctx); rcu_read_unlock(); if (type->match_free) type->match_free(&ctx.match_data); return key; } EXPORT_SYMBOL(keyring_search); static struct key_restriction *keyring_restriction_alloc( key_restrict_link_func_t check) { struct key_restriction *keyres = kzalloc_obj(struct key_restriction); if (!keyres) return ERR_PTR(-ENOMEM); keyres->check = check; return keyres; } /* * Semaphore to serialise restriction setup to prevent reference count * cycles through restriction key pointers. */ static DECLARE_RWSEM(keyring_serialise_restrict_sem); /* * Check for restriction cycles that would prevent keyring garbage collection. * keyring_serialise_restrict_sem must be held. */ static bool keyring_detect_restriction_cycle(const struct key *dest_keyring, struct key_restriction *keyres) { while (keyres && keyres->key && keyres->key->type == &key_type_keyring) { if (keyres->key == dest_keyring) return true; keyres = keyres->key->restrict_link; } return false; } /** * keyring_restrict - Look up and apply a restriction to a keyring * @keyring_ref: The keyring to be restricted * @type: The key type that will provide the restriction checker. * @restriction: The restriction options to apply to the keyring * * Look up a keyring and apply a restriction to it. 
* The restriction is managed by the specific key type, but can be configured
 * by the options specified in the restriction string.
 *
 * Returns 0 on success; -ENOTDIR if @keyring_ref is not a keyring; -ENOENT if
 * the named type has no lookup_restriction op; -EEXIST if the keyring already
 * has a restriction; -EDEADLK if the restriction would create a cycle; or the
 * error from the type lookup or restriction lookup/allocation.
 */
int keyring_restrict(key_ref_t keyring_ref, const char *type,
		     const char *restriction)
{
	struct key *keyring;
	struct key_type *restrict_type = NULL;
	struct key_restriction *restrict_link;
	int ret = 0;

	keyring = key_ref_to_ptr(keyring_ref);
	key_check(keyring);

	if (keyring->type != &key_type_keyring)
		return -ENOTDIR;

	if (!type) {
		/* No type given: install a restriction that rejects all links. */
		restrict_link = keyring_restriction_alloc(restrict_link_reject);
	} else {
		restrict_type = key_type_lookup(type);

		if (IS_ERR(restrict_type))
			return PTR_ERR(restrict_type);

		if (!restrict_type->lookup_restriction) {
			ret = -ENOENT;
			goto error;
		}

		restrict_link = restrict_type->lookup_restriction(restriction);
	}

	if (IS_ERR(restrict_link)) {
		ret = PTR_ERR(restrict_link);
		goto error;
	}

	/* Take the keyring's own semaphore first, then the global one that
	 * serialises restriction setup.
	 */
	down_write(&keyring->sem);
	down_write(&keyring_serialise_restrict_sem);

	if (keyring->restrict_link) {
		ret = -EEXIST;
	} else if (keyring_detect_restriction_cycle(keyring, restrict_link)) {
		ret = -EDEADLK;
	} else {
		keyring->restrict_link = restrict_link;
		notify_key(keyring, NOTIFY_KEY_SETATTR, 0);
	}

	up_write(&keyring_serialise_restrict_sem);
	up_write(&keyring->sem);

	/* The restriction was not installed, so dispose of it here. */
	if (ret < 0) {
		key_put(restrict_link->key);
		kfree(restrict_link);
	}

error:
	/* Drop the type looked up above (if any). */
	if (restrict_type)
		key_type_put(restrict_type);

	return ret;
}
EXPORT_SYMBOL(keyring_restrict);

/*
 * Search the given keyring for a key that might be updated.
 *
 * The caller must guarantee that the keyring is a keyring and that the
 * permission is granted to modify the keyring as no check is made here.  The
 * caller must also hold a lock on the keyring semaphore.
 *
 * Returns a pointer to the found key with usage count incremented if
 * successful and returns NULL if not found.  Revoked and invalidated keys are
 * skipped over.
 *
 * If successful, the possession indicator is propagated from the keyring ref
 * to the returned key reference.
*/
key_ref_t find_key_to_update(key_ref_t keyring_ref,
			     const struct keyring_index_key *index_key)
{
	struct key *keyring = key_ref_to_ptr(keyring_ref);
	const void *leaf;
	struct key *key;

	kenter("{%d},{%s,%s}",
	       keyring->serial, index_key->type->name, index_key->description);

	leaf = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops,
				index_key);
	if (!leaf) {
		kleave(" = NULL");
		return NULL;
	}

	key = keyring_ptr_to_key(leaf);

	/* An invalidated or revoked key is not a candidate for updating. */
	if (key->flags & ((1 << KEY_FLAG_INVALIDATED) |
			  (1 << KEY_FLAG_REVOKED))) {
		kleave(" = NULL [x]");
		return NULL;
	}

	__key_get(key);
	kleave(" = {%d}", key->serial);
	return make_key_ref(key, is_key_possessed(keyring_ref));
}

/*
 * Find a keyring with the specified name.
 *
 * Only keyrings that have nonzero refcount, are not revoked, and are owned by a
 * user in the current user namespace are considered.  If @uid_keyring is %true,
 * the keyring additionally must have been allocated as a user or user session
 * keyring; otherwise, it must grant Search permission directly to the caller.
 *
 * Returns a pointer to the keyring with the keyring's refcount having been
 * incremented on success.  -ENOKEY is returned if a key could not be found.
*/ struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct user_namespace *ns = current_user_ns(); struct key *keyring; if (!name) return ERR_PTR(-EINVAL); read_lock(&keyring_name_lock); /* Search this hash bucket for a keyring with a matching name that * grants Search permission and that hasn't been revoked */ list_for_each_entry(keyring, &ns->keyring_name_list, name_link) { if (!kuid_has_mapping(ns, keyring->user->uid)) continue; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) continue; if (strcmp(keyring->description, name) != 0) continue; if (uid_keyring) { if (!test_bit(KEY_FLAG_UID_KEYRING, &keyring->flags)) continue; } else { if (key_permission(make_key_ref(keyring, 0), KEY_NEED_SEARCH) < 0) continue; } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' * (ie. it has a zero usage count) */ if (!refcount_inc_not_zero(&keyring->usage)) continue; keyring->last_used_at = ktime_get_real_seconds(); goto out; } keyring = ERR_PTR(-ENOKEY); out: read_unlock(&keyring_name_lock); return keyring; } static int keyring_detect_cycle_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); kenter("{%d}", key->serial); /* We might get a keyring with matching index-key that is nonetheless a * different keyring. */ if (key != ctx->match_data.raw_data) return 0; ctx->result = ERR_PTR(-EDEADLK); return 1; } /* * See if a cycle will be created by inserting acyclic tree B in acyclic * tree A at the topmost level (ie: as a direct child of A). * * Since we are adding B to A at the top level, checking for cycles should just * be a matter of seeing if node A is somewhere in tree B. 
*/
static int keyring_detect_cycle(struct key *A, struct key *B)
{
	/* Search context that looks for A itself, matched by pointer in the
	 * iterator, while recursing through nested keyrings.
	 */
	struct keyring_search_context ctx = {
		.index_key		= A->index_key,
		.match_data.raw_data	= A,
		.match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT,
		.iterator		= keyring_detect_cycle_iterator,
		.flags			= (KEYRING_SEARCH_NO_STATE_CHECK |
					   KEYRING_SEARCH_NO_UPDATE_TIME |
					   KEYRING_SEARCH_NO_CHECK_PERM |
					   KEYRING_SEARCH_DETECT_TOO_DEEP |
					   KEYRING_SEARCH_RECURSE),
	};

	rcu_read_lock();
	search_nested_keyrings(B, &ctx);
	rcu_read_unlock();

	/* An -EAGAIN result is reported as 0 (no cycle); any other result is
	 * passed back as its error code.
	 */
	return PTR_ERR(ctx.result) == -EAGAIN ? 0 : PTR_ERR(ctx.result);
}

/*
 * Lock keyring for link.
 *
 * Returns -ENOTDIR without taking any lock if @keyring is not a keyring,
 * otherwise 0 with keyring->sem write-locked and, when the key being linked
 * is itself a keyring, keyring_serialise_link_lock held.
 */
int __key_link_lock(struct key *keyring,
		    const struct keyring_index_key *index_key)
	__acquires(&keyring->sem)
	__acquires(&keyring_serialise_link_lock)
{
	if (keyring->type != &key_type_keyring)
		return -ENOTDIR;

	down_write(&keyring->sem);

	/* Serialise link/link calls to prevent parallel calls causing a cycle
	 * when linking two keyring in opposite orders.
	 */
	if (index_key->type == &key_type_keyring)
		mutex_lock(&keyring_serialise_link_lock);

	return 0;
}

/*
 * Lock keyrings for move (link/unlink combination).
 *
 * Returns -ENOTDIR without taking any lock if either argument is not a
 * keyring, otherwise 0 with both keyring semaphores write-locked.
 */
int __key_move_lock(struct key *l_keyring, struct key *u_keyring,
		    const struct keyring_index_key *index_key)
	__acquires(&l_keyring->sem)
	__acquires(&u_keyring->sem)
	__acquires(&keyring_serialise_link_lock)
{
	if (l_keyring->type != &key_type_keyring ||
	    u_keyring->type != &key_type_keyring)
		return -ENOTDIR;

	/* We have to be very careful here to take the keyring locks in the
	 * right order, lest we open ourselves to deadlocking against another
	 * move operation.
	 *
	 * The order is fixed by comparing the two struct key addresses: the
	 * lower address is locked first and the other as a nested lock.
	 */
	if (l_keyring < u_keyring) {
		down_write(&l_keyring->sem);
		down_write_nested(&u_keyring->sem, 1);
	} else {
		down_write(&u_keyring->sem);
		down_write_nested(&l_keyring->sem, 1);
	}

	/* Serialise link/link calls to prevent parallel calls causing a cycle
	 * when linking two keyring in opposite orders.
	 */
	if (index_key->type == &key_type_keyring)
		mutex_lock(&keyring_serialise_link_lock);

	return 0;
}

/*
 * Preallocate memory so that a key can be linked into a keyring.
 *
 * On success 0 is returned and *_edit holds the prepared edit script.  On
 * failure a negative error code is returned, *_edit is left NULL and any
 * edit script that was created has been cancelled.
 */
int __key_link_begin(struct key *keyring,
		     const struct keyring_index_key *index_key,
		     struct assoc_array_edit **_edit)
{
	struct assoc_array_edit *edit;
	int ret;

	kenter("%d,%s,%s,",
	       keyring->serial, index_key->type->name, index_key->description);

	BUG_ON(index_key->desc_len == 0);
	BUG_ON(*_edit != NULL);

	*_edit = NULL;

	ret = -EKEYREVOKED;
	if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
		goto error;

	/* Create an edit script that will insert/replace the key in the
	 * keyring tree.
	 */
	edit = assoc_array_insert(&keyring->keys,
				  &keyring_assoc_array_ops,
				  index_key,
				  NULL);
	if (IS_ERR(edit)) {
		ret = PTR_ERR(edit);
		goto error;
	}

	/* If we're not replacing a link in-place then we're going to need some
	 * extra quota.
	 */
	if (!edit->dead_leaf) {
		ret = key_payload_reserve(keyring,
					  keyring->datalen + KEYQUOTA_LINK_BYTES);
		if (ret < 0)
			goto error_cancel;
	}

	*_edit = edit;
	kleave(" = 0");
	return 0;

error_cancel:
	assoc_array_cancel_edit(edit);
error:
	kleave(" = %d", ret);
	return ret;
}

/*
 * Check already instantiated keys aren't going to be a problem.
 *
 * The caller must have called __key_link_begin(). Don't need to call this for
 * keys that were created since __key_link_begin() was called.
 *
 * Returns 0 for non-keyring keys, otherwise the result of the cycle check.
 */
int __key_link_check_live_key(struct key *keyring, struct key *key)
{
	if (key->type == &key_type_keyring)
		/* check that we aren't going to create a cycle by linking one
		 * keyring to another */
		return keyring_detect_cycle(keyring, key);
	return 0;
}

/*
 * Link a key into a keyring.
 *
 * Must be called with __key_link_begin() having been called.  Discards any
 * already extant link to matching key if there is one, so that each keyring
 * holds at most one link to any given key of a particular type+description
 * combination.
*/
void __key_link(struct key *keyring, struct key *key,
		struct assoc_array_edit **_edit)
{
	/* The keyring's link holds its own reference on the key. */
	__key_get(key);
	assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key));
	assoc_array_apply_edit(*_edit);
	/* The edit script has been applied; clear the caller's pointer so
	 * that __key_link_end() does not cancel it or give back quota.
	 */
	*_edit = NULL;
	notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key));
}

/*
 * Finish linking a key into a keyring.
 *
 * Must be called with __key_link_begin() having been called.
 *
 * A non-NULL @edit is an edit script that was never applied: the quota
 * reserved for it is given back (unless it was an in-place replacement) and
 * the script is cancelled.  The locks taken by __key_link_lock() or
 * __key_move_lock() for this keyring are then dropped.
 */
void __key_link_end(struct key *keyring,
		    const struct keyring_index_key *index_key,
		    struct assoc_array_edit *edit)
	__releases(&keyring->sem)
	__releases(&keyring_serialise_link_lock)
{
	BUG_ON(index_key->type == NULL);
	kenter("%d,%s,", keyring->serial, index_key->type->name);

	if (edit) {
		if (!edit->dead_leaf) {
			key_payload_reserve(keyring,
				keyring->datalen - KEYQUOTA_LINK_BYTES);
		}
		assoc_array_cancel_edit(edit);
	}
	up_write(&keyring->sem);

	/* Mirrors the conditional mutex_lock() in the lock helpers. */
	if (index_key->type == &key_type_keyring)
		mutex_unlock(&keyring_serialise_link_lock);
}

/*
 * Check addition of keys to restricted keyrings.
 *
 * Returns 0 if the keyring has no restriction or no check function,
 * otherwise whatever the restriction's check function returns.
 */
static int __key_link_check_restriction(struct key *keyring, struct key *key)
{
	if (!keyring->restrict_link || !keyring->restrict_link->check)
		return 0;
	return keyring->restrict_link->check(keyring, key->type, &key->payload,
					     keyring->restrict_link->key);
}

/**
 * key_link - Link a key to a keyring
 * @keyring: The keyring to make the link in.
 * @key: The key to link to.
 *
 * Make a link in a keyring to a key, such that the keyring holds a reference
 * on that key and the key can potentially be found by searching that keyring.
 *
 * This function will write-lock the keyring's semaphore and will consume some
 * of the user's key data quota to hold the link.
 *
 * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring,
 * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is
 * full, -EDQUOT if there is insufficient key data quota remaining to add
 * another link or -ENOMEM if there's insufficient memory.
*
 * It is assumed that the caller has checked that it is permitted for a link to
 * be made (the keyring should have Write permission and the key Link
 * permission).
 */
int key_link(struct key *keyring, struct key *key)
{
	struct assoc_array_edit *edit = NULL;
	int ret;

	kenter("{%d,%d}", keyring->serial, refcount_read(&keyring->usage));

	key_check(keyring);
	key_check(key);

	ret = __key_link_lock(keyring, &key->index_key);
	if (ret < 0)
		goto error;

	ret = __key_link_begin(keyring, &key->index_key, &edit);
	if (ret < 0)
		goto error_end;

	kdebug("begun {%d,%d}", keyring->serial, refcount_read(&keyring->usage));
	/* Each step only runs if the previous one returned 0. */
	ret = __key_link_check_restriction(keyring, key);
	if (ret == 0)
		ret = __key_link_check_live_key(keyring, key);
	if (ret == 0)
		__key_link(keyring, key, &edit);

error_end:
	/* edit is NULL here if the link was applied or never begun. */
	__key_link_end(keyring, &key->index_key, edit);
error:
	kleave(" = %d {%d,%d}", ret, keyring->serial, refcount_read(&keyring->usage));
	return ret;
}
EXPORT_SYMBOL(key_link);

/*
 * Lock a keyring for unlink.
 *
 * Returns -ENOTDIR without locking if @keyring is not a keyring, otherwise 0
 * with keyring->sem write-locked.
 */
static int __key_unlink_lock(struct key *keyring)
	__acquires(&keyring->sem)
{
	if (keyring->type != &key_type_keyring)
		return -ENOTDIR;

	down_write(&keyring->sem);
	return 0;
}

/*
 * Begin the process of unlinking a key from a keyring.
 *
 * On success 0 is returned and *_edit holds the deletion edit script.
 * Returns the error from assoc_array_delete() if it fails, or -ENOENT if it
 * produced no edit script.
 */
static int __key_unlink_begin(struct key *keyring, struct key *key,
			      struct assoc_array_edit **_edit)
{
	struct assoc_array_edit *edit;

	BUG_ON(*_edit != NULL);

	edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops,
				  &key->index_key);
	if (IS_ERR(edit))
		return PTR_ERR(edit);

	if (!edit)
		return -ENOENT;

	*_edit = edit;
	return 0;
}

/*
 * Apply an unlink change.
 *
 * Applies the edit script, clears the caller's pointer to it and reduces the
 * keyring's payload reservation by one link.
 */
static void __key_unlink(struct key *keyring, struct key *key,
			 struct assoc_array_edit **_edit)
{
	assoc_array_apply_edit(*_edit);
	notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key));
	*_edit = NULL;
	key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES);
}

/*
 * Finish unlinking a key from a keyring.
*/ static void __key_unlink_end(struct key *keyring, struct key *key, struct assoc_array_edit *edit) __releases(&keyring->sem) { if (edit) assoc_array_cancel_edit(edit); up_write(&keyring->sem); } /** * key_unlink - Unlink the first link to a key from a keyring. * @keyring: The keyring to remove the link from. * @key: The key the link is to. * * Remove a link from a keyring to a key. * * This function will write-lock the keyring's semaphore. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if * the key isn't linked to by the keyring or -ENOMEM if there's insufficient * memory. * * It is assumed that the caller has checked that it is permitted for a link to * be removed (the keyring should have Write permission; no permissions are * required on the key). */ int key_unlink(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; key_check(keyring); key_check(key); ret = __key_unlink_lock(keyring); if (ret < 0) return ret; ret = __key_unlink_begin(keyring, key, &edit); if (ret == 0) __key_unlink(keyring, key, &edit); __key_unlink_end(keyring, key, edit); return ret; } EXPORT_SYMBOL(key_unlink); /** * key_move - Move a key from one keyring to another * @key: The key to move * @from_keyring: The keyring to remove the link from. * @to_keyring: The keyring to make the link in. * @flags: Qualifying flags, such as KEYCTL_MOVE_EXCL. * * Make a link in @to_keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring * whilst simultaneously removing a link to the key from @from_keyring. * * This function will write-lock both keyring's semaphores and will consume * some of the user's key data quota to hold the link on @to_keyring. 
* * Returns 0 if successful, -ENOTDIR if either keyring isn't a keyring, * -EKEYREVOKED if either keyring has been revoked, -ENFILE if the second * keyring is full, -EDQUOT if there is insufficient key data quota remaining * to add another link or -ENOMEM if there's insufficient memory. If * KEYCTL_MOVE_EXCL is set, then -EEXIST will be returned if there's already a * matching key in @to_keyring. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_move(struct key *key, struct key *from_keyring, struct key *to_keyring, unsigned int flags) { struct assoc_array_edit *from_edit = NULL, *to_edit = NULL; int ret; kenter("%d,%d,%d", key->serial, from_keyring->serial, to_keyring->serial); if (from_keyring == to_keyring) return 0; key_check(key); key_check(from_keyring); key_check(to_keyring); ret = __key_move_lock(from_keyring, to_keyring, &key->index_key); if (ret < 0) goto out; ret = __key_unlink_begin(from_keyring, key, &from_edit); if (ret < 0) goto error; ret = __key_link_begin(to_keyring, &key->index_key, &to_edit); if (ret < 0) goto error; ret = -EEXIST; if (to_edit->dead_leaf && (flags & KEYCTL_MOVE_EXCL)) goto error; ret = __key_link_check_restriction(to_keyring, key); if (ret < 0) goto error; ret = __key_link_check_live_key(to_keyring, key); if (ret < 0) goto error; __key_unlink(from_keyring, key, &from_edit); __key_link(to_keyring, key, &to_edit); error: __key_link_end(to_keyring, &key->index_key, to_edit); __key_unlink_end(from_keyring, key, from_edit); out: kleave(" = %d", ret); return ret; } EXPORT_SYMBOL(key_move); /** * keyring_clear - Clear a keyring * @keyring: The keyring to clear. * * Clear the contents of the specified keyring. * * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring. 
*/ int keyring_clear(struct key *keyring) { struct assoc_array_edit *edit; int ret; if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (IS_ERR(edit)) { ret = PTR_ERR(edit); } else { if (edit) assoc_array_apply_edit(edit); notify_key(keyring, NOTIFY_KEY_CLEARED, 0); key_payload_reserve(keyring, 0); ret = 0; } up_write(&keyring->sem); return ret; } EXPORT_SYMBOL(keyring_clear); /* * Dispose of the links from a revoked keyring. * * This is called with the key sem write-locked. */ static void keyring_revoke(struct key *keyring) { struct assoc_array_edit *edit; edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (!IS_ERR(edit)) { if (edit) assoc_array_apply_edit(edit); key_payload_reserve(keyring, 0); } } static bool keyring_gc_select_iterator(void *object, void *iterator_data) { struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; if (key_is_dead(key, *limit)) return false; key_get(key); return true; } static int keyring_gc_check_iterator(const void *object, void *iterator_data) { const struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; key_check(key); return key_is_dead(key, *limit); } /* * Garbage collect pointers from a keyring. * * Not called with any locks held. The keyring's key struct will not be * deallocated under us as only our caller may deallocate it. 
*/
void keyring_gc(struct key *keyring, time64_t limit)
{
	int result;

	kenter("%x{%s}", keyring->serial, keyring->description ?: "");

	/* Invalidated or revoked keyrings are not collected here. */
	if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) |
			      (1 << KEY_FLAG_REVOKED)))
		goto dont_gc;

	/* scan the keyring looking for dead keys */
	rcu_read_lock();
	result = assoc_array_iterate(&keyring->keys,
				     keyring_gc_check_iterator, &limit);
	rcu_read_unlock();
	/* Only take the write lock if the scan reported a dead key. */
	if (result == true)
		goto do_gc;

dont_gc:
	kleave(" [no gc]");
	return;

do_gc:
	down_write(&keyring->sem);
	assoc_array_gc(&keyring->keys, &keyring_assoc_array_ops,
		       keyring_gc_select_iterator, &limit);
	up_write(&keyring->sem);
	kleave(" [gc]");
}

/*
 * Garbage collect restriction pointers from a keyring.
 *
 * Keyring restrictions are associated with a key type, and must be cleaned
 * up if the key type is unregistered. The restriction is altered to always
 * reject additional keys so a keyring cannot be opened up by unregistering
 * a key type.
 *
 * Not called with any keyring locks held. The keyring's key struct will not
 * be deallocated under us as only our caller may deallocate it.
 *
 * The caller is required to hold key_types_sem and dead_type->sem. This is
 * fulfilled by key_gc_keytype() holding the locks on behalf of
 * key_garbage_collector(), which it invokes on a workqueue.
 */
void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type)
{
	struct key_restriction *keyres;

	kenter("%x{%s}", keyring->serial, keyring->description ?: "");

	/*
	 * keyring->restrict_link is only assigned at key allocation time
	 * or with the key type locked, so the only values that could be
	 * concurrently assigned to keyring->restrict_link are for key
	 * types other than dead_type. Given this, it's ok to check
	 * the key type before acquiring keyring->sem.
	 */
	if (!dead_type || !keyring->restrict_link ||
	    keyring->restrict_link->keytype != dead_type) {
		kleave(" [no restriction gc]");
		return;
	}

	/* Lock the keyring to ensure that a link is not in progress */
	down_write(&keyring->sem);

	keyres = keyring->restrict_link;

	/* Swap in the reject check and drop the restriction's key reference
	 * and key type pointer.
	 */
	keyres->check = restrict_link_reject;

	key_put(keyres->key);
	keyres->key = NULL;
	keyres->keytype = NULL;

	up_write(&keyring->sem);

	kleave(" [restriction gc]");
}
6 6 6 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 
1018 1019 1020 1021 1022 1023 1024 // SPDX-License-Identifier: GPL-2.0 /* * Functions related to sysfs handling */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/blktrace_api.h> #include <linux/debugfs.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-wbt.h" #include "blk-cgroup.h" #include "blk-throttle.h" struct queue_sysfs_entry { struct attribute attr; ssize_t (*show)(struct gendisk *disk, char *page); ssize_t (*show_limit)(struct gendisk *disk, char *page); ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); int (*store_limit)(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim); }; static ssize_t queue_var_show(unsigned long var, char *page) { return sysfs_emit(page, "%lu\n", var); } static ssize_t queue_var_store(unsigned long *var, const char *page, size_t count) { int err; unsigned long v; err = kstrtoul(page, 10, &v); if (err || v > UINT_MAX) return -EINVAL; *var = v; return count; } static ssize_t queue_requests_show(struct gendisk *disk, char *page) { ssize_t ret; mutex_lock(&disk->queue->elevator_lock); ret = queue_var_show(disk->queue->nr_requests, page); mutex_unlock(&disk->queue->elevator_lock); return ret; } static ssize_t queue_requests_store(struct gendisk *disk, const char *page, size_t count) { struct request_queue *q = disk->queue; struct blk_mq_tag_set *set = q->tag_set; struct elevator_tags *et = NULL; unsigned int memflags; unsigned long nr; int ret; ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; /* * Serialize updating nr_requests with concurrent queue_requests_store() * and switching elevator. 
*/ down_write(&set->update_nr_hwq_lock); if (nr == q->nr_requests) goto unlock; if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; /* * Switching elevator is protected by update_nr_hwq_lock: * - read lock is held from elevator sysfs attribute; * - write lock is held from updating nr_hw_queues; * Hence it's safe to access q->elevator here with write lock held. */ if (nr <= set->reserved_tags || (q->elevator && nr > MAX_SCHED_RQ) || (!q->elevator && nr > set->queue_depth)) { ret = -EINVAL; goto unlock; } if (!blk_mq_is_shared_tags(set->flags) && q->elevator && nr > q->elevator->et->nr_requests) { /* * Tags will grow, allocate memory before freezing queue to * prevent deadlock. */ et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr); if (!et) { ret = -ENOMEM; goto unlock; } } memflags = blk_mq_freeze_queue(q); mutex_lock(&q->elevator_lock); et = blk_mq_update_nr_requests(q, et, nr); mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); if (et) blk_mq_free_sched_tags(et, set); unlock: up_write(&set->update_nr_hwq_lock); return ret; } static ssize_t queue_async_depth_show(struct gendisk *disk, char *page) { guard(mutex)(&disk->queue->elevator_lock); return queue_var_show(disk->queue->async_depth, page); } static ssize_t queue_async_depth_store(struct gendisk *disk, const char *page, size_t count) { struct request_queue *q = disk->queue; unsigned int memflags; unsigned long nr; int ret; if (!queue_is_mq(q)) return -EINVAL; ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; if (nr == 0) return -EINVAL; memflags = blk_mq_freeze_queue(q); scoped_guard(mutex, &q->elevator_lock) { if (q->elevator) { q->async_depth = min(q->nr_requests, nr); if (q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(q); } else { ret = -EINVAL; } } blk_mq_unfreeze_queue(q, memflags); return ret; } static ssize_t queue_ra_show(struct gendisk *disk, char *page) { ssize_t ret; mutex_lock(&disk->queue->limits_lock); ret = 
queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); mutex_unlock(&disk->queue->limits_lock); return ret; } static ssize_t queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; struct request_queue *q = disk->queue; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; /* * The ->ra_pages change below is protected by ->limits_lock because it * is usually calculated from the queue limits by * queue_limits_commit_update(). * * bdi->ra_pages reads are not serialized against bdi->ra_pages writes. * Use WRITE_ONCE() to write bdi->ra_pages once. */ mutex_lock(&q->limits_lock); WRITE_ONCE(disk->bdi->ra_pages, ra_kb >> (PAGE_SHIFT - 10)); mutex_unlock(&q->limits_lock); return ret; } #define QUEUE_SYSFS_LIMIT_SHOW(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return queue_var_show(disk->queue->limits._field, page); \ } QUEUE_SYSFS_LIMIT_SHOW(max_segments) QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments) QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments) QUEUE_SYSFS_LIMIT_SHOW(max_segment_size) QUEUE_SYSFS_LIMIT_SHOW(max_write_streams) QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity) QUEUE_SYSFS_LIMIT_SHOW(logical_block_size) QUEUE_SYSFS_LIMIT_SHOW(physical_block_size) QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors) QUEUE_SYSFS_LIMIT_SHOW(io_min) QUEUE_SYSFS_LIMIT_SHOW(io_opt) QUEUE_SYSFS_LIMIT_SHOW(discard_granularity) QUEUE_SYSFS_LIMIT_SHOW(zone_write_granularity) QUEUE_SYSFS_LIMIT_SHOW(virt_boundary_mask) QUEUE_SYSFS_LIMIT_SHOW(dma_alignment) QUEUE_SYSFS_LIMIT_SHOW(max_open_zones) QUEUE_SYSFS_LIMIT_SHOW(max_active_zones) QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_min) QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_max) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%llu\n", \ (unsigned long long)disk->queue->limits._field << \ SECTOR_SHIFT); \ } 
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_wzeroes_unmap_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_wzeroes_unmap_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors) #define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ { \ return queue_var_show(disk->queue->limits._field >> 1, page); \ } QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors) #define QUEUE_SYSFS_SHOW_CONST(_name, _val) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%d\n", _val); \ } /* deprecated fields */ QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0) QUEUE_SYSFS_SHOW_CONST(write_same_max, 0) QUEUE_SYSFS_SHOW_CONST(poll_delay, -1) static int queue_max_discard_sectors_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long max_discard_bytes; ssize_t ret; ret = queue_var_store(&max_discard_bytes, page, count); if (ret < 0) return ret; if (max_discard_bytes & (disk->queue->limits.discard_granularity - 1)) return -EINVAL; if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; lim->max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; return 0; } static int queue_max_wzeroes_unmap_sectors_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long max_zeroes_bytes, max_hw_zeroes_bytes; ssize_t ret; ret = queue_var_store(&max_zeroes_bytes, page, count); if (ret < 0) return ret; max_hw_zeroes_bytes = lim->max_hw_wzeroes_unmap_sectors << SECTOR_SHIFT; if 
(max_zeroes_bytes != 0 && max_zeroes_bytes != max_hw_zeroes_bytes) return -EINVAL; lim->max_user_wzeroes_unmap_sectors = max_zeroes_bytes >> SECTOR_SHIFT; return 0; } static int queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) { unsigned long max_sectors_kb; ssize_t ret; ret = queue_var_store(&max_sectors_kb, page, count); if (ret < 0) return ret; lim->max_user_sectors = max_sectors_kb << 1; return 0; } static ssize_t queue_feature_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim, blk_features_t feature) { unsigned long val; ssize_t ret; ret = queue_var_store(&val, page, count); if (ret < 0) return ret; if (val) lim->features |= feature; else lim->features &= ~feature; return 0; } #define QUEUE_SYSFS_FEATURE(_name, _feature) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%u\n", \ !!(disk->queue->limits.features & _feature)); \ } \ static int queue_##_name##_store(struct gendisk *disk, \ const char *page, size_t count, struct queue_limits *lim) \ { \ return queue_feature_store(disk, page, count, lim, _feature); \ } QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM) QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT) QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES); #define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ return sysfs_emit(page, "%u\n", \ !!(disk->queue->limits.features & _feature)); \ } QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA); QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); static ssize_t queue_poll_show(struct gendisk *disk, char *page) { if (queue_is_mq(disk->queue)) return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); return sysfs_emit(page, "%u\n", !!(disk->queue->limits.features & BLK_FEAT_POLL)); } static ssize_t queue_zoned_show(struct gendisk *disk, char 
*page)
{
	if (blk_queue_is_zoned(disk->queue))
		return sysfs_emit(page, "host-managed\n");
	return sysfs_emit(page, "none\n");
}

/* Show handler for "nr_zones": hands disk_nr_zones(disk) to queue_var_show(). */
static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page)
{
	return queue_var_show(disk_nr_zones(disk), page);
}

/*
 * Show handler for "iostats_passthrough": the result of
 * blk_queue_passthrough_stat() is normalised to 0 or 1 with "!!" before it
 * is handed to queue_var_show().
 */
static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page)
{
	return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page);
}

/*
 * Store handler for "iostats_passthrough".
 *
 * The input is parsed with queue_var_store(). A non-zero value sets
 * BLK_FLAG_IOSTATS_PASSTHROUGH in @lim->flags, zero clears it. Only @lim is
 * modified here.
 *
 * Returns 0 on success or the negative value returned by queue_var_store().
 */
static int queue_iostats_passthrough_store(struct gendisk *disk,
		const char *page, size_t count, struct queue_limits *lim)
{
	unsigned long ios;
	ssize_t ret;

	ret = queue_var_store(&ios, page, count);
	if (ret < 0)
		return ret;

	if (ios)
		lim->flags |= BLK_FLAG_IOSTATS_PASSTHROUGH;
	else
		lim->flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH;
	return 0;
}

/*
 * Show handler for "nomerges": combines two queue predicates into one
 * number, blk_queue_nomerges() shifted into bit 1 and blk_queue_noxmerges()
 * in bit 0.
 */
static ssize_t queue_nomerges_show(struct gendisk *disk, char *page)
{
	return queue_var_show((blk_queue_nomerges(disk->queue) << 1) |
			       blk_queue_noxmerges(disk->queue), page);
}

/*
 * Store handler for "nomerges".
 *
 * Both merge flags are cleared first. The value 2 then sets
 * QUEUE_FLAG_NOMERGES, any other non-zero value sets QUEUE_FLAG_NOXMERGES,
 * and 0 leaves both flags clear.
 *
 * Returns whatever queue_var_store() returned (negative on parse failure,
 * in which case no flag is touched).
 */
static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page,
				    size_t count)
{
	unsigned long nm;
	struct request_queue *q = disk->queue;
	ssize_t ret = queue_var_store(&nm, page, count);

	if (ret < 0)
		return ret;

	blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
	blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
	if (nm == 2)
		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
	else if (nm)
		blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);

	return ret;
}

/*
 * Show handler for "rq_affinity".
 *
 * "set << force" yields 0 when QUEUE_FLAG_SAME_COMP is clear, 1 when only
 * QUEUE_FLAG_SAME_COMP is set and 2 when QUEUE_FLAG_SAME_FORCE is set as
 * well.
 */
static ssize_t queue_rq_affinity_show(struct gendisk *disk, char *page)
{
	bool set = test_bit(QUEUE_FLAG_SAME_COMP, &disk->queue->queue_flags);
	bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &disk->queue->queue_flags);

	return queue_var_show(set << force, page);
}

/*
 * Store handler for "rq_affinity".
 *
 * Without CONFIG_SMP the body is compiled out and the write always fails
 * with -EINVAL. With CONFIG_SMP, 2 sets both QUEUE_FLAG_SAME_COMP and
 * QUEUE_FLAG_SAME_FORCE, 1 sets SAME_COMP and clears SAME_FORCE, and 0
 * clears both. Any other parsed value changes no flag but is still reported
 * with queue_var_store()'s return value.
 */
static ssize_t
queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count)
{
	ssize_t ret = -EINVAL;
#ifdef CONFIG_SMP
	struct request_queue *q = disk->queue;
	unsigned long val;

	ret = queue_var_store(&val, page, count);
	if (ret < 0)
		return ret;

	/*
	 * Here we update two queue flags each using atomic bitops, although
	 * updating two flags isn't atomic it should be harmless as those flags
	 * are accessed individually using atomic test_bit operation. So we
	 * don't grab any lock while updating these flags.
	 */
	if (val == 2) {
		blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
		blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
	} else if (val == 1) {
		blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
		blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
	} else if (val == 0) {
		blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
		blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
	}
#endif
	return ret;
}

/*
 * Store handler for "io_poll_delay": the written data is not looked at and
 * the write is reported as fully consumed.
 */
static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page,
				size_t count)
{
	return count;
}

/*
 * Store handler for "io_poll".
 *
 * Fails with -EINVAL when BLK_FEAT_POLL is not set in q->limits.features.
 * Otherwise the written value is ignored: two rate-limited info messages
 * are logged and @count is returned.
 */
static ssize_t queue_poll_store(struct gendisk *disk, const char *page,
				size_t count)
{
	ssize_t ret = count;
	struct request_queue *q = disk->queue;

	if (!(q->limits.features & BLK_FEAT_POLL)) {
		ret = -EINVAL;
		goto out;
	}

	pr_info_ratelimited("writes to the poll attribute are ignored.\n");
	pr_info_ratelimited("please use driver specific parameters instead.\n");
out:
	return ret;
}

/*
 * Show handler for "io_timeout": q->rq_timeout is read once with
 * READ_ONCE() and converted with jiffies_to_msecs() before being emitted.
 */
static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page)
{
	return sysfs_emit(page, "%u\n",
			jiffies_to_msecs(READ_ONCE(disk->queue->rq_timeout)));
}

/*
 * Store handler for "io_timeout".
 *
 * The input is parsed as a base-10 u32. A parse error or the value 0 is
 * rejected with -EINVAL; otherwise the value is converted with
 * msecs_to_jiffies() and passed to blk_queue_rq_timeout().
 *
 * Returns @count on success.
 */
static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page,
				  size_t count)
{
	unsigned int val;
	int err;
	struct request_queue *q = disk->queue;

	err = kstrtou32(page, 10, &val);
	if (err || val == 0)
		return -EINVAL;

	blk_queue_rq_timeout(q, msecs_to_jiffies(val));

	return count;
}

/*
 * Show handler for "write_cache": emits "write back" when
 * blk_queue_write_cache() is true for the queue, "write through" otherwise.
 */
static ssize_t queue_wc_show(struct gendisk *disk, char *page)
{
	if (blk_queue_write_cache(disk->queue))
		return sysfs_emit(page, "write back\n");
	return sysfs_emit(page, "write through\n");
}

/*
 * Store handler for "write_cache".
 *
 * Matching is by prefix (strncmp with the literal's length), so trailing
 * characters after the keyword are not examined. Input starting with
 * "write back" clears BLK_FLAG_WRITE_CACHE_DISABLED in @lim->flags; input
 * starting with "write through" or "none" sets it. Anything else fails
 * with -EINVAL.
 *
 * Returns 0 on success.
 */
static int queue_wc_store(struct gendisk *disk, const char *page,
		size_t count, struct queue_limits *lim)
{
	bool disable;

	if (!strncmp(page, "write back", 10)) {
		disable = false;
	} else if (!strncmp(page, "write through", 13) ||
		   !strncmp(page, "none", 4)) {
		disable = true;
	} else {
		return -EINVAL;
	}

	if (disable)
		lim->flags |= BLK_FLAG_WRITE_CACHE_DISABLED;
	else
		lim->flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED;
	return 0;
}

/*
 * Helpers that define a static struct queue_sysfs_entry named
 * <_prefix>_entry whose sysfs file name is _name.
 *
 * The RO variants use mode 0444 and wire up only <_prefix>_show; the RW
 * variants use mode 0644 and additionally wire up <_prefix>_store. The
 * plain variants fill .show/.store, the LIM variants fill
 * .show_limit/.store_limit instead.
 *
 * The expansions of QUEUE_RO_ENTRY and QUEUE_RW_ENTRY already end in ';',
 * the LIM variants rely on the ';' written at the point of use.
 */
#define QUEUE_RO_ENTRY(_prefix, _name)				\
static struct queue_sysfs_entry _prefix##_entry = {		\
	.attr = { .name = _name, .mode = 0444 },		\
	.show = _prefix##_show,					\
};

#define QUEUE_RW_ENTRY(_prefix, _name)				\
static struct queue_sysfs_entry _prefix##_entry = {		\
	.attr = { .name = _name, .mode = 0644 },		\
	.show = _prefix##_show,					\
	.store = _prefix##_store,				\
};

#define QUEUE_LIM_RO_ENTRY(_prefix, _name)			\
static struct queue_sysfs_entry _prefix##_entry = {		\
	.attr = { .name = _name, .mode = 0444 },		\
	.show_limit = _prefix##_show,				\
}

#define QUEUE_LIM_RW_ENTRY(_prefix, _name)			\
static struct queue_sysfs_entry _prefix##_entry = {		\
	.attr = { .name = _name, .mode = 0644 },		\
	.show_limit = _prefix##_show,				\
	.store_limit = _prefix##_store,				\
}

/* One entry per queue attribute; the string is the sysfs file name. */
QUEUE_RW_ENTRY(queue_requests, "nr_requests");
QUEUE_RW_ENTRY(queue_async_depth, "async_depth");
QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb");
QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments");
QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size");
QUEUE_LIM_RO_ENTRY(queue_max_write_streams, "max_write_streams");
QUEUE_LIM_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity");
QUEUE_RW_ENTRY(elv_iosched, "scheduler");

QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size");
QUEUE_LIM_RO_ENTRY(queue_physical_block_size, "physical_block_size");
QUEUE_LIM_RO_ENTRY(queue_chunk_sectors, "chunk_sectors");
QUEUE_LIM_RO_ENTRY(queue_io_min, "minimum_io_size");
QUEUE_LIM_RO_ENTRY(queue_io_opt, "optimal_io_size");

QUEUE_LIM_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
QUEUE_LIM_RO_ENTRY(queue_discard_granularity, "discard_granularity");
QUEUE_LIM_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes");
QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes");
QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");

QUEUE_LIM_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_atomic_write_boundary_sectors,
		"atomic_write_boundary_bytes");
QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");

QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_max_hw_wzeroes_unmap_sectors,
		"write_zeroes_unmap_max_hw_bytes");
QUEUE_LIM_RW_ENTRY(queue_max_wzeroes_unmap_sectors,
		"write_zeroes_unmap_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");

QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned");
QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones");
QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones");

QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough");
QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
QUEUE_RW_ENTRY(queue_poll, "io_poll");
QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay");
QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache");
QUEUE_LIM_RO_ENTRY(queue_fua, "fua");
QUEUE_LIM_RO_ENTRY(queue_dax, "dax");
QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment");

/* legacy alias for logical_block_size: */
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
	.attr = {.name = "hw_sector_size", .mode = 0444 },
	.show_limit = queue_logical_block_size_show,
};

QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational");
QUEUE_LIM_RW_ENTRY(queue_iostats, "iostats");
QUEUE_LIM_RW_ENTRY(queue_add_random, "add_random");
QUEUE_LIM_RW_ENTRY(queue_stable_writes, "stable_writes");

#ifdef CONFIG_BLK_WBT
/*
 * Parse @page as a base-10 signed 64-bit integer.
 *
 * On success the value is stored in *@var and 0 is returned; on failure
 * the negative kstrtos64() error is returned and *@var is left untouched.
 */
static ssize_t queue_var_store64(s64 *var, const char *page)
{
	int err;
	s64 v;

	err = kstrtos64(page, 10, &v);
	if (err < 0)
		return err;

	*var = v;
	return 0;
}

/*
 * Show handler for "wbt_lat_usec", evaluated under disk->rqos_state_mutex.
 *
 * Returns -EINVAL when wbt_rq_qos() finds nothing for the queue, emits "0"
 * when wbt_disabled() is true, and otherwise emits wbt_get_min_lat()
 * divided by 1000.
 * NOTE(review): the division presumably converts nanoseconds to the
 * microseconds implied by the attribute name - confirm against wbt.
 */
static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
{
	ssize_t ret;
	struct request_queue *q = disk->queue;

	mutex_lock(&disk->rqos_state_mutex);
	if (!wbt_rq_qos(q)) {
		ret = -EINVAL;
		goto out;
	}

	if (wbt_disabled(q)) {
		ret = sysfs_emit(page, "0\n");
		goto out;
	}

	ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
out:
	mutex_unlock(&disk->rqos_state_mutex);
	return ret;
}

/*
 * Store handler for "wbt_lat_usec".
 *
 * Values below -1 are rejected with -EINVAL; everything else is handed to
 * wbt_set_lat(). Returns @count on success, otherwise the error from
 * parsing or from wbt_set_lat().
 */
static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
				  size_t count)
{
	ssize_t ret;
	s64 val;

	ret = queue_var_store64(&val, page);
	if (ret < 0)
		return ret;
	if (val < -1)
		return -EINVAL;

	ret = wbt_set_lat(disk, val);
	return ret ? ret : count;
}

QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
#endif

/* Common attributes for bio-based and request-based queues. */
static struct attribute *queue_attrs[] = {
	/*
	 * Attributes which are protected with q->limits_lock.
	 */
	&queue_max_hw_sectors_entry.attr,
	&queue_max_sectors_entry.attr,
	&queue_max_segments_entry.attr,
	&queue_max_discard_segments_entry.attr,
	&queue_max_integrity_segments_entry.attr,
	&queue_max_segment_size_entry.attr,
	&queue_max_write_streams_entry.attr,
	&queue_write_stream_granularity_entry.attr,
	&queue_hw_sector_size_entry.attr,
	&queue_logical_block_size_entry.attr,
	&queue_physical_block_size_entry.attr,
	&queue_chunk_sectors_entry.attr,
	&queue_io_min_entry.attr,
	&queue_io_opt_entry.attr,
	&queue_discard_granularity_entry.attr,
	&queue_max_discard_sectors_entry.attr,
	&queue_max_hw_discard_sectors_entry.attr,
	&queue_atomic_write_max_sectors_entry.attr,
	&queue_atomic_write_boundary_sectors_entry.attr,
	&queue_atomic_write_unit_min_entry.attr,
	&queue_atomic_write_unit_max_entry.attr,
	&queue_max_write_zeroes_sectors_entry.attr,
	&queue_max_hw_wzeroes_unmap_sectors_entry.attr,
	&queue_max_wzeroes_unmap_sectors_entry.attr,
	&queue_max_zone_append_sectors_entry.attr,
	&queue_zone_write_granularity_entry.attr,
	&queue_rotational_entry.attr,
	&queue_zoned_entry.attr,
	&queue_max_open_zones_entry.attr,
	&queue_max_active_zones_entry.attr,
	&queue_iostats_passthrough_entry.attr,
	&queue_iostats_entry.attr,
	&queue_stable_writes_entry.attr,
	&queue_add_random_entry.attr,
	&queue_wc_entry.attr,
	&queue_fua_entry.attr,
	&queue_dax_entry.attr,
	&queue_virt_boundary_mask_entry.attr,
	&queue_dma_alignment_entry.attr,
	&queue_ra_entry.attr,

	/*
	 * Attributes which don't require locking.
	 */
	&queue_discard_zeroes_data_entry.attr,
	&queue_write_same_max_entry.attr,
	&queue_nr_zones_entry.attr,
	&queue_nomerges_entry.attr,
	&queue_poll_entry.attr,
	&queue_poll_delay_entry.attr,

	NULL,
};

/* Request-based queue attributes that are not relevant for bio-based queues. */
static struct attribute *blk_mq_queue_attrs[] = {
	/*
	 * Attributes which require some form of locking other than
	 * q->sysfs_lock.
	 */
	&elv_iosched_entry.attr,
	&queue_requests_entry.attr,
	&queue_async_depth_entry.attr,
#ifdef CONFIG_BLK_WBT
	&queue_wb_lat_entry.attr,
#endif
	/*
	 * Attributes which don't require locking.
	 */
	&queue_rq_affinity_entry.attr,
	&queue_io_timeout_entry.attr,

	NULL,
};

/*
 * Visibility callback for queue_attr_group: "max_open_zones" and
 * "max_active_zones" are hidden (mode 0) unless the queue is zoned; every
 * other attribute keeps its declared mode.
 */
static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
				int n)
{
	struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
	struct request_queue *q = disk->queue;

	if ((attr == &queue_max_open_zones_entry.attr ||
	     attr == &queue_max_active_zones_entry.attr) &&
	    !blk_queue_is_zoned(q))
		return 0;

	return attr->mode;
}

/*
 * Visibility callback for blk_mq_queue_attr_group: the whole group is
 * hidden when queue_is_mq() is false, and "io_timeout" is additionally
 * hidden when q->mq_ops has no ->timeout handler.
 */
static umode_t blk_mq_queue_attr_visible(struct kobject *kobj,
					 struct attribute *attr, int n)
{
	struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
	struct request_queue *q = disk->queue;

	if (!queue_is_mq(q))
		return 0;

	if (attr == &queue_io_timeout_entry.attr && !q->mq_ops->timeout)
		return 0;

	return attr->mode;
}

static struct attribute_group queue_attr_group = {
	.attrs = queue_attrs,
	.is_visible = queue_attr_visible,
};

static struct attribute_group blk_mq_queue_attr_group = {
	.attrs = blk_mq_queue_attrs,
	.is_visible = blk_mq_queue_attr_visible,
};

/* Map a struct attribute back to the queue_sysfs_entry that embeds it. */
#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)

/*
 * sysfs ->show dispatcher for queue attributes.
 *
 * Returns -EIO when the entry has neither ->show nor ->show_limit. A
 * ->show_limit callback takes precedence and is invoked with
 * q->limits_lock held; otherwise ->show is called without that lock.
 */
static ssize_t
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct queue_sysfs_entry *entry = to_queue(attr);
	struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);

	if (!entry->show && !entry->show_limit)
		return -EIO;

	if (entry->show_limit) {
		ssize_t res;

		mutex_lock(&disk->queue->limits_lock);
		res = entry->show_limit(disk, page);
		mutex_unlock(&disk->queue->limits_lock);
		return res;
	}

	return entry->show(disk, page);
}

/*
 * sysfs ->store dispatcher for queue attributes.
 *
 * Returns -EIO when the entry has neither ->store nor ->store_limit.
 *
 * For ->store_limit entries a limits update is opened with
 * queue_limits_start_update(), the callback edits the local copy, and the
 * copy is then committed with queue_limits_commit_update_frozen(). If the
 * callback fails the update is cancelled and its error returned; a commit
 * error is returned as is; otherwise @length is returned.
 *
 * All other entries are forwarded to ->store unchanged.
 */
static ssize_t
queue_attr_store(struct kobject *kobj, struct attribute *attr,
		    const char *page, size_t length)
{
	struct queue_sysfs_entry *entry = to_queue(attr);
	struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
	struct request_queue *q = disk->queue;

	if (!entry->store_limit && !entry->store)
		return -EIO;

	if (entry->store_limit) {
		ssize_t res;

		struct queue_limits lim = queue_limits_start_update(q);

		res = entry->store_limit(disk, page, length, &lim);
		if (res < 0) {
			queue_limits_cancel_update(q);
			return res;
		}

		res = queue_limits_commit_update_frozen(q, &lim);
		if (res)
			return res;
		return length;
	}

	return entry->store(disk, page, length);
}

static const struct sysfs_ops queue_sysfs_ops = {
	.show	= queue_attr_show,
	.store	= queue_attr_store,
};

static const struct attribute_group *blk_queue_attr_groups[] = {
	&queue_attr_group,
	&blk_mq_queue_attr_group,
	NULL
};

static void blk_queue_release(struct kobject *kobj)
{
	/* nothing to do here, all data is associated with the parent gendisk */
}

const struct kobj_type blk_queue_ktype = {
	.default_groups = blk_queue_attr_groups,
	.sysfs_ops	= &queue_sysfs_ops,
	.release	= blk_queue_release,
};

/*
 * Tear down the queue's debugfs state: shut down block tracing, remove the
 * queue's debugfs directory recursively and reset the three cached debugfs
 * directory pointers, all between blk_debugfs_lock_nomemsave() and
 * blk_debugfs_unlock_nomemrestore().
 */
static void blk_debugfs_remove(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;

	blk_debugfs_lock_nomemsave(q);
	blk_trace_shutdown(q);
	debugfs_remove_recursive(q->debugfs_dir);
	q->debugfs_dir = NULL;
	q->sched_debugfs_dir = NULL;
	q->rqos_debugfs_dir = NULL;
	blk_debugfs_unlock_nomemrestore(q);
}

/**
 * blk_register_queue - register a block layer queue with sysfs
 * @disk: Disk of which the request queue should be registered with sysfs.
 *
 * Adds the "queue" kobject below the disk device, registers the blk-mq
 * sysfs and debugfs entries for mq queues, the independent access ranges
 * and the crypto sysfs entries, then marks the queue registered and emits
 * the KOBJ_ADD uevents. On failure, the steps already taken are undone in
 * reverse order through the labels at the end of the function.
 *
 * Return: 0 on success, otherwise the error of the step that failed.
 */
int blk_register_queue(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	unsigned int memflags;
	int ret;

	ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue");
	if (ret < 0)
		return ret;

	if (queue_is_mq(q)) {
		ret = blk_mq_sysfs_register(disk);
		if (ret)
			goto out_del_queue_kobj;
	}
	mutex_lock(&q->sysfs_lock);

	memflags = blk_debugfs_lock(q);
	q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root);
	if (queue_is_mq(q))
		blk_mq_debugfs_register(q);
	blk_debugfs_unlock(q, memflags);

	ret = disk_register_independent_access_ranges(disk);
	if (ret)
		goto out_debugfs_remove;

	ret = blk_crypto_sysfs_register(disk);
	if (ret)
		goto out_unregister_ia_ranges;

	if (queue_is_mq(q))
		elevator_set_default(q);

	blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
	wbt_init_enable_default(disk);

	/* Now everything is ready and send out KOBJ_ADD uevent */
	kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
	if (q->elevator)
		kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
	mutex_unlock(&q->sysfs_lock);

	/*
	 * SCSI probing may synchronously create and destroy a lot of
	 * request_queues for non-existent devices.  Shutting down a fully
	 * functional queue takes measurable wallclock time as RCU grace
	 * periods are involved.  To avoid excessive latency in these
	 * cases, a request_queue starts out in a degraded mode which is
	 * faster to shut down and is made fully functional here as
	 * request_queues for non-existent devices never get registered.
	 */
	blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
	percpu_ref_switch_to_percpu(&q->q_usage_counter);

	return ret;

	/* Error unwinding; q->sysfs_lock is still held at the first two labels. */
out_unregister_ia_ranges:
	disk_unregister_independent_access_ranges(disk);
out_debugfs_remove:
	blk_debugfs_remove(disk);
	mutex_unlock(&q->sysfs_lock);
	if (queue_is_mq(q))
		blk_mq_sysfs_unregister(disk);
out_del_queue_kobj:
	kobject_del(&disk->queue_kobj);
	return ret;
}

/**
 * blk_unregister_queue - counterpart of blk_register_queue()
 * @disk: Disk of which the request queue should be unregistered from sysfs.
 *
 * Note: the caller is responsible for guaranteeing that this function is called
 * after blk_register_queue() has finished.
 */
void blk_unregister_queue(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;

	if (WARN_ON(!q))
		return;

	/* Return early if disk->queue was never registered. */
	if (!blk_queue_registered(q))
		return;

	/*
	 * Since sysfs_remove_dir() prevents adding new directory entries
	 * before removal of existing entries starts, protect against
	 * concurrent elv_iosched_store() calls.
	 */
	mutex_lock(&q->sysfs_lock);
	blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
	mutex_unlock(&q->sysfs_lock);

	/*
	 * Remove the sysfs attributes before unregistering the queue data
	 * structures that can be modified through sysfs.
	 */
	if (queue_is_mq(q))
		blk_mq_sysfs_unregister(disk);
	blk_crypto_sysfs_unregister(disk);

	mutex_lock(&q->sysfs_lock);
	disk_unregister_independent_access_ranges(disk);
	mutex_unlock(&q->sysfs_lock);

	/* Now that we've deleted all child objects, we can delete the queue. */
	kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE);
	kobject_del(&disk->queue_kobj);

	/* For mq queues the elevator is reset to none before debugfs goes away. */
	if (queue_is_mq(q))
		elevator_set_none(q);

	blk_debugfs_remove(disk);
}
3692 4127 4118 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 1994 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * General FPU state handling cleanups
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 * x86-64 work by Andi Kleen 2002
 */

#ifndef _ASM_X86_FPU_API_H
#define _ASM_X86_FPU_API_H
#include <linux/bottom_half.h>

#include <asm/fpu/types.h>

/*
 * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
 * disables preemption and softirq processing, so be careful if you intend to
 * use it for long periods of time. Kernel-mode FPU cannot be used in all
 * contexts -- see irq_fpu_usable() for details.
 */

/* Kernel FPU states to initialize in kernel_fpu_begin_mask() */
#define KFPU_387	_BITUL(0)	/* 387 state will be initialized */
#define KFPU_MXCSR	_BITUL(1)	/* MXCSR will be initialized */

extern void kernel_fpu_begin_mask(unsigned int kfpu_mask);
extern void kernel_fpu_end(void);
extern bool irq_fpu_usable(void);
extern void fpregs_mark_activate(void);

/*
 * Code that is unaware of kernel_fpu_begin_mask() can use this.
 *
 * It is a thin wrapper that picks the mask by architecture: KFPU_MXCSR only
 * on CONFIG_X86_64, KFPU_387 | KFPU_MXCSR otherwise.
 */
static inline void kernel_fpu_begin(void)
{
#ifdef CONFIG_X86_64
	/*
	 * Any 64-bit code that uses 387 instructions must explicitly request
	 * KFPU_387.
	 */
	kernel_fpu_begin_mask(KFPU_MXCSR);
#else
	/*
	 * 32-bit kernel code may use 387 operations as well as SSE2, etc,
	 * as long as it checks that the CPU has the required capability.
	 */
	kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
#endif
}

/*
 * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate, or while
 * using the FPU in kernel mode. A context switch will (and softirq might) save
 * CPU's FPU registers to fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving
 * CPU's FPU registers in a random state.
 *
 * local_bh_disable() protects against both preemption and soft interrupts
 * on !RT kernels.
 *
 * On RT kernels local_bh_disable() is not sufficient because it only
 * serializes soft interrupt related sections via a local lock, but stays
 * preemptible. Disabling preemption is the right choice here as bottom
 * half processing is always in thread context on RT kernels so it
 * implicitly prevents bottom half processing as well.
 */
static inline void fpregs_lock(void)
{
	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		local_bh_disable();
	else
		preempt_disable();
}

/*
 * Counterpart of fpregs_lock(): re-enables bottom halves on !RT kernels and
 * preemption on RT kernels, mirroring the choice made when locking.
 */
static inline void fpregs_unlock(void)
{
	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		local_bh_enable();
	else
		preempt_enable();
}

/*
 * FPU state gets lazily restored before returning to userspace. So when in the
 * kernel, the valid FPU state may be kept in the buffer. This function will force
 * restore all the fpu state to the registers early if needed, and lock them from
 * being automatically saved/restored. Then FPU state can be modified safely in the
 * registers, before unlocking with fpregs_unlock().
 */
void fpregs_lock_and_load(void);

/* Only a real function with CONFIG_X86_DEBUG_FPU; otherwise an empty stub. */
#ifdef CONFIG_X86_DEBUG_FPU
extern void fpregs_assert_state_consistent(void);
#else
static inline void fpregs_assert_state_consistent(void) { }
#endif

/*
 * Load the task FPU state before returning to userspace.
 */
extern void switch_fpu_return(void);

/*
 * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
 *
 * If 'feature_name' is set then put a human-readable description of
 * the feature there as well - this can be used to print error (or success)
 * messages.
 */
extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);

/* Trap handling */
extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);
extern void fpu_sync_fpstate(struct fpu *fpu);
extern void fpu_reset_from_exception_fixup(void);

/* Boot, hotplug and resume */
extern void fpu__init_cpu(void);
extern void fpu__init_system(void);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);

/* Only a real function with CONFIG_MATH_EMULATION; otherwise an empty stub. */
#ifdef CONFIG_MATH_EMULATION
extern void fpstate_init_soft(struct swregs_state *soft);
#else
static inline void fpstate_init_soft(struct swregs_state *soft) {}
#endif

/* State tracking */
DECLARE_PER_CPU(bool, kernel_fpu_allowed);
DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/* Process cleanup; an empty stub when CONFIG_X86_64 is not set. */
#ifdef CONFIG_X86_64
extern void fpstate_free(struct fpu *fpu);
#else
static inline void fpstate_free(struct fpu *fpu) { }
#endif

/* fpstate-related functions which are exported to KVM */
extern void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature);

extern u64 xstate_get_guest_group_perm(void);

extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);


/* KVM specific functions */
extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu);
extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu);
extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest);
extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures);

/* The guest XFD helpers are empty stubs when CONFIG_X86_64 is not set. */
#ifdef CONFIG_X86_64
extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd);
extern void fpu_sync_guest_vmexit_xfd_state(void);
#else
static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { }
static inline void fpu_sync_guest_vmexit_xfd_state(void) { }
#endif

extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
					   unsigned int size, u64 xfeatures, u32 pkru);
extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru);

/* Mark the guest's fpstate as confidential by setting ->is_confidential. */
static inline void fpstate_set_confidential(struct fpu_guest *gfpu)
{
	gfpu->fpstate->is_confidential = true;
}

/* Report the ->is_confidential flag of the guest's fpstate. */
static inline bool fpstate_is_confidential(struct fpu_guest *gfpu)
{
	return gfpu->fpstate->is_confidential;
}

/* prctl */
extern long fpu_xstate_prctl(int option, unsigned long arg2);

extern void fpu_idle_fpregs(void);

#endif /* _ASM_X86_FPU_API_H */
26 26 26 26 48 48 48 26 26 26 15 15 15 13 13 13 11 11 13 26 61 15 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/

#include "peer.h"
#include "device.h"
#include "queueing.h"
#include "timers.h"
#include "peerlookup.h"
#include "noise.h"

#include <linux/kref.h>
#include <linux/lockdep.h>
#include <linux/rcupdate.h>
#include <linux/list.h>

/* Slab cache for struct wg_peer, created in wg_peer_init(). */
static struct kmem_cache *peer_cache;
/* Source of the per-peer internal_id; incremented once per created peer. */
static atomic64_t peer_counter = ATOMIC64_INIT(0);

/*
 * Allocate and initialise a new peer on @wg and link it into the device's
 * peer list and public-key hashtable.
 *
 * The caller must hold wg->device_update_lock (asserted via lockdep).
 *
 * Returns the new peer, or ERR_PTR(-ENOMEM) when the device already has
 * MAX_PEERS_PER_DEVICE peers, when the allocation fails, or when
 * dst_cache_init() fails (the allocation is released in that case).
 */
struct wg_peer *wg_peer_create(struct wg_device *wg,
			       const u8 public_key[NOISE_PUBLIC_KEY_LEN],
			       const u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN])
{
	struct wg_peer *peer;
	int ret = -ENOMEM;

	lockdep_assert_held(&wg->device_update_lock);

	if (wg->num_peers >= MAX_PEERS_PER_DEVICE)
		return ERR_PTR(ret);

	peer = kmem_cache_zalloc(peer_cache, GFP_KERNEL);
	if (unlikely(!peer))
		return ERR_PTR(ret);
	if (unlikely(dst_cache_init(&peer->endpoint_cache, GFP_KERNEL)))
		goto err;

	peer->device = wg;
	wg_noise_handshake_init(&peer->handshake, &wg->static_identity,
				public_key, preshared_key, peer);
	peer->internal_id = atomic64_inc_return(&peer_counter);
	peer->serial_work_cpu = nr_cpumask_bits;
	wg_cookie_init(&peer->latest_cookie);
	wg_timers_init(peer);
	wg_cookie_checker_precompute_peer_keys(peer);
	spin_lock_init(&peer->keypairs.keypair_update_lock);
	INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker);
	INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker);
	wg_prev_queue_init(&peer->tx_queue);
	wg_prev_queue_init(&peer->rx_queue);
	rwlock_init(&peer->endpoint_lock);
	kref_init(&peer->refcount);
	skb_queue_head_init(&peer->staged_packet_queue);
	wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake);
	set_bit(NAPI_STATE_NO_BUSY_POLL, &peer->napi.state);
	netif_napi_add(wg->dev, &peer->napi, wg_packet_rx_poll);
	napi_enable(&peer->napi);
	list_add_tail(&peer->peer_list, &wg->peer_list);
	INIT_LIST_HEAD(&peer->allowedips_list);
	wg_pubkey_hashtable_add(wg->peer_hashtable, peer);
	++wg->num_peers;
	pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id);
	return peer;

err:
	kmem_cache_free(peer_cache, peer);
	return ERR_PTR(ret);
}

/*
 * Try to take a reference on @peer. Returns @peer with its refcount
 * raised, or NULL when @peer is NULL or its refcount has already dropped
 * to zero. Warns under lockdep if the RCU-bh read lock is not held.
 */
struct wg_peer *wg_peer_get_maybe_zero(struct wg_peer *peer)
{
	RCU_LOCKDEP_WARN(!rcu_read_lock_bh_held(),
			 "Taking peer reference without holding the RCU read lock");
	if (unlikely(!peer || !kref_get_unless_zero(&peer->refcount)))
		return NULL;
	return peer;
}

/*
 * First removal stage: unlink the peer from the peer list, allowedips and
 * the public-key hashtable, then flag it dead.
 */
static void peer_make_dead(struct wg_peer *peer)
{
	/* Remove from configuration-time lookup structures. */
	list_del_init(&peer->peer_list);
	wg_allowedips_remove_by_peer(&peer->device->peer_allowedips, peer,
				     &peer->device->device_update_lock);
	wg_pubkey_hashtable_remove(peer->device->peer_hashtable, peer);

	/* Mark as dead, so that we don't allow jumping contexts after. */
	WRITE_ONCE(peer->is_dead, true);

	/* The caller must now synchronize_net() for this to take effect. */
}

/*
 * Second removal stage, run after peer_make_dead() and synchronize_net():
 * clears keypairs, stops timers, flushes the work queues, tears down NAPI,
 * decrements the device's peer count and drops a reference on the peer.
 */
static void peer_remove_after_dead(struct wg_peer *peer)
{
	WARN_ON(!peer->is_dead);

	/* No more keypairs can be created for this peer, since is_dead protects
	 * add_new_keypair, so we can now destroy existing ones.
	 */
	wg_noise_keypairs_clear(&peer->keypairs);

	/* Destroy all ongoing timers that were in-flight at the beginning of
	 * this function.
	 */
	wg_timers_stop(peer);

	/* The transition between packet encryption/decryption queues isn't
	 * guarded by is_dead, but each reference's life is strictly bounded by
	 * two generations: once for parallel crypto and once for serial
	 * ingestion, so we can simply flush twice, and be sure that we no
	 * longer have references inside these queues.
	 */

	/* a) For encrypt/decrypt. */
	flush_workqueue(peer->device->packet_crypt_wq);
	/* b.1) For send (but not receive, since that's napi). */
	flush_workqueue(peer->device->packet_crypt_wq);
	/* b.2.1) For receive (but not send, since that's wq). */
	napi_disable(&peer->napi);
	/* b.2.2) It's now safe to remove the napi struct, which must be done
	 * here from process context.
	 */
	netif_napi_del(&peer->napi);

	/* Ensure any workstructs we own (like transmit_handshake_work or
	 * clear_peer_work) no longer are in use.
	 */
	flush_workqueue(peer->device->handshake_send_wq);

	/* After the above flushes, a peer might still be active in a few
	 * different contexts: 1) from xmit(), before hitting is_dead and
	 * returning, 2) from wg_packet_consume_data(), before hitting is_dead
	 * and returning, 3) from wg_receive_handshake_packet() after a point
	 * where it has processed an incoming handshake packet, but where
	 * all calls to pass it off to timers fails because of is_dead. We won't
	 * have new references in (1) eventually, because we're removed from
	 * allowedips; we won't have new references in (2) eventually, because
	 * wg_index_hashtable_lookup will always return NULL, since we removed
	 * all existing keypairs and no more can be created; we won't have new
	 * references in (3) eventually, because we're removed from the pubkey
	 * hash table, which allows for a maximum of one handshake response,
	 * via the still-uncleared index hashtable entry, but not more than one,
	 * and in wg_cookie_message_consume, the lookup eventually gets a peer
	 * with a refcount of zero, so no new reference is taken.
	 */

	--peer->device->num_peers;
	wg_peer_put(peer);
}

/* We have a separate "remove" function to make sure that all active places
 * where a peer is currently operating will eventually come to an end and not
 * pass their reference onto another context.
 *
 * A NULL @peer is ignored. The caller must hold the device's
 * device_update_lock (asserted via lockdep).
 */
void wg_peer_remove(struct wg_peer *peer)
{
	if (unlikely(!peer))
		return;
	lockdep_assert_held(&peer->device->device_update_lock);

	peer_make_dead(peer);
	synchronize_net();
	peer_remove_after_dead(peer);
}

/*
 * Remove every peer of @wg. All peers are first made dead and moved onto a
 * local list so that a single synchronize_net() covers the whole batch;
 * the second removal stage then runs for each of them. The caller must
 * hold wg->device_update_lock (asserted via lockdep).
 */
void wg_peer_remove_all(struct wg_device *wg)
{
	struct wg_peer *peer, *temp;
	LIST_HEAD(dead_peers);

	lockdep_assert_held(&wg->device_update_lock);

	/* Avoid having to traverse individually for each one. */
	wg_allowedips_free(&wg->peer_allowedips, &wg->device_update_lock);

	list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) {
		peer_make_dead(peer);
		list_add_tail(&peer->peer_list, &dead_peers);
	}
	synchronize_net();
	list_for_each_entry_safe(peer, temp, &dead_peers, peer_list)
		peer_remove_after_dead(peer);
}

/*
 * RCU callback queued by kref_release(): destroys the endpoint dst cache,
 * warns if either prev queue still has an entry, wipes the structure and
 * returns it to peer_cache.
 */
static void rcu_release(struct rcu_head *rcu)
{
	struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu);

	dst_cache_destroy(&peer->endpoint_cache);
	WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue));

	/* The final zeroing takes care of clearing any remaining handshake key
	 * material and other potentially sensitive information.
	 */
	memzero_explicit(peer, sizeof(*peer));
	kmem_cache_free(peer_cache, peer);
}

/*
 * kref release callback, invoked through wg_peer_put() when the last
 * reference is dropped.
 */
static void kref_release(struct kref *refcount)
{
	struct wg_peer *peer = container_of(refcount, struct wg_peer, refcount);

	pr_debug("%s: Peer %llu (%pISpfsc) destroyed\n",
		 peer->device->dev->name, peer->internal_id,
		 &peer->endpoint.addr);

	/* Remove ourself from dynamic runtime lookup structures, now that the
	 * last reference is gone.
	 */
	wg_index_hashtable_remove(peer->device->index_hashtable,
				  &peer->handshake.entry);

	/* Remove any lingering packets that didn't have a chance to be
	 * transmitted.
	 */
	wg_packet_purge_staged_packets(peer);

	/* Free the memory used. */
	call_rcu(&peer->rcu, rcu_release);
}

/* Drop one reference on @peer; a NULL @peer is ignored. */
void wg_peer_put(struct wg_peer *peer)
{
	if (unlikely(!peer))
		return;
	kref_put(&peer->refcount, kref_release);
}

/* Create the peer slab cache. Returns 0 on success, -ENOMEM on failure. */
int __init wg_peer_init(void)
{
	peer_cache = KMEM_CACHE(wg_peer, 0);
	return peer_cache ? 0 : -ENOMEM;
}

/* Destroy the peer slab cache created by wg_peer_init(). */
void wg_peer_uninit(void)
{
	kmem_cache_destroy(peer_cache);
}
12 12 12 12 22 10 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 // SPDX-License-Identifier: GPL-2.0-or-later /* System trusted keyring for trusted public keys * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com) */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/uidgid.h>
#include <linux/verification.h>
#include <keys/asymmetric-type.h>
#include <keys/system_keyring.h>
#include <crypto/pkcs7.h>

/* Keyring pointers; all but the built-in one exist only when configured. */
static struct key *builtin_trusted_keys;
#ifdef CONFIG_SECONDARY_TRUSTED_KEYRING
static struct key *secondary_trusted_keys;
#endif
#ifdef CONFIG_INTEGRITY_MACHINE_KEYRING
static struct key *machine_trusted_keys;
#endif
#ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING
static struct key *platform_trusted_keys;
#endif

extern __initconst const u8 system_certificate_list[];
extern __initconst const unsigned long system_certificate_list_size;
extern __initconst const unsigned long module_cert_size;

/**
 * restrict_link_by_builtin_trusted - Restrict keyring addition by built-in CA
 * @dest_keyring: Keyring being linked to.
 * @type: The type of key being added.
 * @payload: The payload of the new key.
 * @restriction_key: A ring of keys that can be used to vouch for the new cert.
 *
 * Restrict the addition of keys into a keyring based on the key-to-be-added
 * being vouched for by a key in the built in system keyring.
 *
 * Note that @restriction_key is not consulted: builtin_trusted_keys is
 * always the keyring passed on to restrict_link_by_signature().
 */
int restrict_link_by_builtin_trusted(struct key *dest_keyring,
				     const struct key_type *type,
				     const union key_payload *payload,
				     struct key *restriction_key)
{
	return restrict_link_by_signature(dest_keyring, type, payload,
					  builtin_trusted_keys);
}

/**
 * restrict_link_by_digsig_builtin - Restrict digitalSignature key additions by the built-in keyring
 * @dest_keyring: Keyring being linked to.
 * @type: The type of key being added.
 * @payload: The payload of the new key.
 * @restriction_key: A ring of keys that can be used to vouch for the new cert.
 *
 * Restrict the addition of keys into a keyring based on the key-to-be-added
 * being vouched for by a key in the built in system keyring. The new key
 * must have the digitalSignature usage field set.
 *
 * Note that @restriction_key is not consulted: builtin_trusted_keys is
 * always the keyring passed on to restrict_link_by_digsig().
 */
int restrict_link_by_digsig_builtin(struct key *dest_keyring,
				    const struct key_type *type,
				    const union key_payload *payload,
				    struct key *restriction_key)
{
	return restrict_link_by_digsig(dest_keyring, type, payload,
				       builtin_trusted_keys);
}

#ifdef CONFIG_SECONDARY_TRUSTED_KEYRING
/**
 * restrict_link_by_builtin_and_secondary_trusted - Restrict keyring
 *   addition by both built-in and secondary keyrings.
 * @dest_keyring: Keyring being linked to.
 * @type: The type of key being added.
 * @payload: The payload of the new key.
 * @restrict_key: A ring of keys that can be used to vouch for the new cert.
 *
 * Restrict the addition of keys into a keyring based on the key-to-be-added
 * being vouched for by a key in either the built-in or the secondary system
 * keyrings.
 *
 * Note that @restrict_key is not consulted: secondary_trusted_keys is
 * always the keyring passed on to restrict_link_by_signature().
 */
int restrict_link_by_builtin_and_secondary_trusted(
	struct key *dest_keyring,
	const struct key_type *type,
	const union key_payload *payload,
	struct key *restrict_key)
{
	/* If we have a secondary trusted keyring, then that contains a link
	 * through to the builtin keyring and the search will follow that link.
	 */
	if (type == &key_type_keyring &&
	    dest_keyring == secondary_trusted_keys &&
	    payload == &builtin_trusted_keys->payload)
		/* Allow the builtin keyring to be added to the secondary */
		return 0;

	return restrict_link_by_signature(dest_keyring, type, payload,
					  secondary_trusted_keys);
}

/**
 * restrict_link_by_digsig_builtin_and_secondary - Restrict by digitalSignature.
 * @dest_keyring: Keyring being linked to.
 * @type: The type of key being added.
 * @payload: The payload of the new key.
 * @restrict_key: A ring of keys that can be used to vouch for the new cert.
 *
 * Restrict the addition of keys into a keyring based on the key-to-be-added
 * being vouched for by a key in either the built-in or the secondary system
 * keyrings. The new key must have the digitalSignature usage field set.
 *
 * Note that @restrict_key is not consulted: secondary_trusted_keys is
 * always the keyring passed on to restrict_link_by_digsig().
 */
int restrict_link_by_digsig_builtin_and_secondary(struct key *dest_keyring,
						  const struct key_type *type,
						  const union key_payload *payload,
						  struct key *restrict_key)
{
	/* If we have a secondary trusted keyring, then that contains a link
	 * through to the builtin keyring and the search will follow that link.
	 */
	if (type == &key_type_keyring &&
	    dest_keyring == secondary_trusted_keys &&
	    payload == &builtin_trusted_keys->payload)
		/* Allow the builtin keyring to be added to the secondary */
		return 0;

	return restrict_link_by_digsig(dest_keyring, type, payload,
				       secondary_trusted_keys);
}

/*
 * Allocate a struct key_restriction for the "builtin and secondary trust"
 * keyring. Only for use in system_trusted_keyring_init().
 *
 * Panics if the allocation fails. The check callback is
 * restrict_link_by_builtin_secondary_and_machine when
 * CONFIG_INTEGRITY_MACHINE_KEYRING is enabled and
 * restrict_link_by_builtin_and_secondary_trusted otherwise.
 */
static __init struct key_restriction *get_builtin_and_secondary_restriction(void)
{
	struct key_restriction *restriction;

	restriction = kzalloc_obj(struct key_restriction);

	if (!restriction)
		panic("Can't allocate secondary trusted keyring restriction\n");

	if (IS_ENABLED(CONFIG_INTEGRITY_MACHINE_KEYRING))
		restriction->check = restrict_link_by_builtin_secondary_and_machine;
	else
		restriction->check = restrict_link_by_builtin_and_secondary_trusted;

	return restriction;
}

/**
 * add_to_secondary_keyring - Add to secondary keyring.
 * @source: Source of key
 * @data: The blob holding the key
 * @len: The length of the data blob
 *
 * Add a key to the secondary keyring. The key must be vouched for by a key in the builtin,
 * machine or secondary keyring itself.
 *
 * A failure is only logged with pr_err(); nothing is returned to the
 * caller. On success the key description is logged and the reference
 * obtained from key_create_or_update() is dropped again.
 */
void __init add_to_secondary_keyring(const char *source, const void *data, size_t len)
{
	key_ref_t key;
	key_perm_t perm;

	perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW;

	key = key_create_or_update(make_key_ref(secondary_trusted_keys, 1),
				   "asymmetric",
				   NULL, data, len, perm,
				   KEY_ALLOC_NOT_IN_QUOTA);
	if (IS_ERR(key)) {
		pr_err("Problem loading X.509 certificate from %s to secondary keyring %ld\n",
		       source, PTR_ERR(key));
		return;
	}

	pr_notice("Loaded X.509 cert '%s'\n", key_ref_to_ptr(key)->description);
	key_ref_put(key);
}
#endif
#ifdef CONFIG_INTEGRITY_MACHINE_KEYRING
/*
 * Record @keyring as the machine keyring and link it into
 * secondary_trusted_keys; panics if key_link() fails.
 */
void __init set_machine_trusted_keys(struct key *keyring)
{
	machine_trusted_keys = keyring;

	if (key_link(secondary_trusted_keys, machine_trusted_keys) < 0)
		panic("Can't link (machine) trusted keyrings\n");
}

/**
 * restrict_link_by_builtin_secondary_and_machine - Restrict keyring addition.
 * @dest_keyring: Keyring being linked to.
 * @type: The type of key being added.
 * @payload: The payload of the new key.
 * @restrict_key: A ring of keys that can be used to vouch for the new cert.
 *
 * Restrict the addition of keys into a keyring based on the key-to-be-added
 * being vouched for by a key in either the built-in, the secondary, or
 * the machine keyrings.
*/ int restrict_link_by_builtin_secondary_and_machine( struct key *dest_keyring, const struct key_type *type, const union key_payload *payload, struct key *restrict_key) { if (machine_trusted_keys && type == &key_type_keyring && dest_keyring == secondary_trusted_keys && payload == &machine_trusted_keys->payload) /* Allow the machine keyring to be added to the secondary */ return 0; return restrict_link_by_builtin_and_secondary_trusted(dest_keyring, type, payload, restrict_key); } #endif /* * Create the trusted keyrings */ static __init int system_trusted_keyring_init(void) { pr_notice("Initialise system trusted keyrings\n"); builtin_trusted_keys = keyring_alloc(".builtin_trusted_keys", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(builtin_trusted_keys)) panic("Can't allocate builtin trusted keyring\n"); #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING secondary_trusted_keys = keyring_alloc(".secondary_trusted_keys", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE), KEY_ALLOC_NOT_IN_QUOTA, get_builtin_and_secondary_restriction(), NULL); if (IS_ERR(secondary_trusted_keys)) panic("Can't allocate secondary trusted keyring\n"); if (key_link(secondary_trusted_keys, builtin_trusted_keys) < 0) panic("Can't link trusted keyrings\n"); #endif return 0; } /* * Must be initialised before we try and load the keys into the keyring. */ device_initcall(system_trusted_keyring_init); __init int load_module_cert(struct key *keyring) { if (!IS_ENABLED(CONFIG_IMA_APPRAISE_MODSIG)) return 0; pr_notice("Loading compiled-in module X.509 certificates\n"); return x509_load_certificate_list(system_certificate_list, module_cert_size, keyring); } /* * Load the compiled-in list of X.509 certificates. 
*/
static __init int load_system_certificate_list(void)
{
	/*
	 * With module signing enabled the whole list is loaded; otherwise the
	 * leading module_cert_size bytes are skipped.
	 */
#ifdef CONFIG_MODULE_SIG
	const unsigned long skip = 0;
#else
	const unsigned long skip = module_cert_size;
#endif

	pr_notice("Loading compiled-in X.509 certificates\n");

	return x509_load_certificate_list(system_certificate_list + skip,
					  system_certificate_list_size - skip,
					  builtin_trusted_keys);
}
late_initcall(load_system_certificate_list);

#ifdef CONFIG_SYSTEM_DATA_VERIFICATION

/**
 * verify_pkcs7_message_sig - Verify a PKCS#7-based signature on system data.
 * @data: The data to be verified (NULL if expecting internal data).
 * @len: Size of @data.
 * @pkcs7: The PKCS#7 message that is the signature.
 * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only,
 *		  VERIFY_USE_SECONDARY_KEYRING for the secondary keyring,
 *		  falling back to the builtin keyring when the secondary
 *		  keyring is not configured, VERIFY_USE_PLATFORM_KEYRING for
 *		  the platform keyring, or an explicit keyring).
 * @usage: The use to which the key is being put.
 * @view_content: Callback to gain access to content.
 * @ctx: Context for callback.
 *
 * Steps, in order: supply detached @data (if given), verify the message,
 * consult the revocation list, validate trust against the selected keyring
 * and finally, if @view_content is set, pass the message content to it.
 *
 * Return: a negative error code from the first step that fails; otherwise
 * the return value of @view_content when it is supplied, or the result of
 * pkcs7_validate_trust() when it is not.
 */
int verify_pkcs7_message_sig(const void *data, size_t len,
			     struct pkcs7_message *pkcs7,
			     struct key *trusted_keys,
			     enum key_being_used_for usage,
			     int (*view_content)(void *ctx,
						 const void *data, size_t len,
						 size_t asn1hdrlen),
			     void *ctx)
{
	size_t asn1hdrlen;
	int rc;

	/* The data should be detached - so we need to supply it. */
	if (data && pkcs7_supply_detached_data(pkcs7, data, len) < 0) {
		pr_err("PKCS#7 signature with non-detached data\n");
		rc = -EBADMSG;
		goto out;
	}

	rc = pkcs7_verify(pkcs7, usage);
	if (rc < 0)
		goto out;

	/* Only -ENOKEY lets verification continue past the revocation check. */
	rc = is_key_on_revocation_list(pkcs7);
	if (rc != -ENOKEY) {
		pr_devel("PKCS#7 key is on revocation list\n");
		goto out;
	}

	/* Translate the special @trusted_keys selectors into a real keyring. */
	if (!trusted_keys) {
		trusted_keys = builtin_trusted_keys;
	} else if (trusted_keys == VERIFY_USE_SECONDARY_KEYRING) {
#ifdef CONFIG_SECONDARY_TRUSTED_KEYRING
		trusted_keys = secondary_trusted_keys;
#else
		trusted_keys = builtin_trusted_keys;
#endif
	} else if (trusted_keys == VERIFY_USE_PLATFORM_KEYRING) {
#ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING
		trusted_keys = platform_trusted_keys;
#else
		trusted_keys = NULL;
#endif
		if (!trusted_keys) {
			rc = -ENOKEY;
			pr_devel("PKCS#7 platform keyring is not available\n");
			goto out;
		}
	}

	rc = pkcs7_validate_trust(pkcs7, trusted_keys);
	if (rc < 0) {
		if (rc == -ENOKEY)
			pr_devel("PKCS#7 signature not signed with a trusted key\n");
		goto out;
	}

	if (!view_content)
		goto out;

	rc = pkcs7_get_content_data(pkcs7, &data, &len, &asn1hdrlen);
	if (rc < 0) {
		if (rc == -ENODATA)
			pr_devel("PKCS#7 message does not contain data\n");
		goto out;
	}

	rc = view_content(ctx, data, len, asn1hdrlen);

out:
	pr_devel("<==%s() = %d\n", __func__, rc);
	return rc;
}

/**
 * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data.
 * @data: The data to be verified (NULL if expecting internal data).
 * @len: Size of @data.
 * @raw_pkcs7: The PKCS#7 message that is the signature.
 * @pkcs7_len: The size of @raw_pkcs7.
 * @trusted_keys: Trusted keys to use; interpreted exactly as by
 *		  verify_pkcs7_message_sig().
 * @usage: The use to which the key is being put.
 * @view_content: Callback to gain access to content.
 * @ctx: Context for callback.
 *
 * Parses @raw_pkcs7, hands the parsed message to verify_pkcs7_message_sig()
 * and frees the parsed message again before returning.
 *
 * Return: the parse error if @raw_pkcs7 cannot be parsed, otherwise the
 * result of verify_pkcs7_message_sig().
 */
int verify_pkcs7_signature(const void *data, size_t len,
			   const void *raw_pkcs7, size_t pkcs7_len,
			   struct key *trusted_keys,
			   enum key_being_used_for usage,
			   int (*view_content)(void *ctx,
					       const void *data, size_t len,
					       size_t asn1hdrlen),
			   void *ctx)
{
	struct pkcs7_message *msg = pkcs7_parse_message(raw_pkcs7, pkcs7_len);
	int rc;

	if (IS_ERR(msg))
		return PTR_ERR(msg);

	rc = verify_pkcs7_message_sig(data, len, msg, trusted_keys, usage,
				      view_content, ctx);
	pkcs7_free_message(msg);

	pr_devel("<==%s() = %d\n", __func__, rc);
	return rc;
}
EXPORT_SYMBOL_GPL(verify_pkcs7_signature);

#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */

#ifdef CONFIG_INTEGRITY_PLATFORM_KEYRING
/* Record @keyring as the platform keyring used for VERIFY_USE_PLATFORM_KEYRING. */
void __init set_platform_trusted_keys(struct key *keyring)
{
	platform_trusted_keys = keyring;
}
#endif
17 10 21 25 1 2 2 1 1 1 43 4 3 17 18 45 43 4 15 19 6 11 17 2 25 32 1 2 13 16 15 14 21 8 11 18 12 10 2 11 9 7 13 17 6 3 3 15 21 6 15 10 11 11 10 21 21 30 1 29 25 25 18 10 8 1 18 10 8 1 2 6 6 10 11 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 
478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_tbf.c Token Bucket Filter queue. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs - * original idea by Martin Devera */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/gso.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> #include <net/pkt_sched.h> /* Simple Token Bucket Filter. ======================================= SOURCE. ------- None. Description. ------------ A data flow obeys TBF with rate R and depth B, if for any time interval t_i...t_f the number of transmitted bits does not exceed B + R*(t_f-t_i). Packetized version of this definition: The sequence of packets of sizes s_i served at moments t_i obeys TBF, if for any i<=k: s_i+....+s_k <= B + R*(t_k - t_i) Algorithm. ---------- Let N(t_i) be B/R initially and N(t) grow continuously with time as: N(t+delta) = min{B/R, N(t) + delta} If the first packet in queue has length S, it may be transmitted only at the time t_* when S/R <= N(t_*), and in this case N(t) jumps: N(t_* + 0) = N(t_* - 0) - S/R. Actually, QoS requires two TBF to be applied to a data stream. 
One of them controls steady state burst size, another
	one with rate P (peak rate) and depth M (equal to link MTU)
	limits bursts at a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means, that with depth B, the maximal rate is

	R_crit = B*HZ

	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much more tough: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak rates, use
	alpha with HZ=1000 :-)

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/

/* Per-qdisc private state of a TBF instance (obtained via qdisc_priv()). */
struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		max_size;	/* Largest packet length tbf_enqueue()
					 * accepts as-is; longer packets are
					 * segmented (GSO) or dropped
					 */
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;		/* Peak-rate bucket depth; upper bound and
					 * reset value of ptokens
					 */
	struct psched_ratecfg rate;	/* Steady-state rate */
	struct psched_ratecfg peak;	/* Peak rate; rate_bytes_ps == 0 means
					 * no peak rate is configured
					 */

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};


/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in given time.
*/ static u64 psched_ns_t2l(const struct psched_ratecfg *r, u64 time_in_ns) { /* The formula is : * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC */ u64 len = time_in_ns * r->rate_bytes_ps; do_div(len, NSEC_PER_SEC); if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) { do_div(len, 53); len = len * 48; } if (len > r->overhead) len -= r->overhead; else len = 0; return len; } static void tbf_offload_change(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_tbf_qopt_offload qopt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; qopt.command = TC_TBF_REPLACE; qopt.handle = sch->handle; qopt.parent = sch->parent; qopt.replace_params.rate = q->rate; qopt.replace_params.max_size = q->max_size; qopt.replace_params.qstats = &sch->qstats; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); } static void tbf_offload_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct tc_tbf_qopt_offload qopt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; qopt.command = TC_TBF_DESTROY; qopt.handle = sch->handle; qopt.parent = sch->parent; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt); } static int tbf_offload_dump(struct Qdisc *sch) { struct tc_tbf_qopt_offload qopt; qopt.command = TC_TBF_STATS; qopt.handle = sch->handle; qopt.parent = sch->parent; qopt.stats.bstats = &sch->bstats; qopt.stats.qstats = &sch->qstats; return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt); } static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new, struct Qdisc *old, struct netlink_ext_ack *extack) { struct tc_tbf_qopt_offload graft_offload = { .handle = sch->handle, .parent = sch->parent, .child_handle = new->handle, .command = TC_TBF_GRAFT, }; qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old, TC_SETUP_QDISC_TBF, &graft_offload, extack); } /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ 
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); unsigned int len = 0, prev_len = qdisc_pkt_len(skb), seg_len; int ret, nb; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); nb = 0; skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); seg_len = segs->len; qdisc_skb_cb(segs)->pkt_len = seg_len; qdisc_skb_cb(segs)->pkt_segs = 1; ret = qdisc_enqueue(segs, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); } else { nb++; len += seg_len; } } sch->q.qlen += nb; sch->qstats.backlog += len; if (nb > 0) { qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); consume_skb(skb); return NET_XMIT_SUCCESS; } kfree_skb(skb); return NET_XMIT_DROP; } static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct tbf_sched_data *q = qdisc_priv(sch); unsigned int len = qdisc_pkt_len(skb); int ret; if (qdisc_pkt_len(skb) > q->max_size) { if (skb_is_gso(skb) && skb_gso_validate_mac_len(skb, q->max_size)) return tbf_segment(skb, sch, to_free); return qdisc_drop(skb, sch, to_free); } ret = qdisc_enqueue(skb, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); return ret; } sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; } static bool tbf_peak_present(const struct tbf_sched_data *q) { return q->peak.rate_bytes_ps; } static struct sk_buff *tbf_dequeue(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; skb = q->qdisc->ops->peek(q->qdisc); if (skb) { s64 now; s64 toks; s64 ptoks = 0; unsigned int len = qdisc_pkt_len(skb); now = ktime_get_ns(); toks = min_t(s64, now - q->t_c, q->buffer); if (tbf_peak_present(q)) { ptoks = toks + q->ptokens; if 
(ptoks > q->mtu) ptoks = q->mtu; ptoks -= (s64) psched_l2t_ns(&q->peak, len); } toks += q->tokens; if (toks > q->buffer) toks = q->buffer; toks -= (s64) psched_l2t_ns(&q->rate, len); if ((toks|ptoks) >= 0) { skb = qdisc_dequeue_peeked(q->qdisc); if (unlikely(!skb)) return NULL; q->t_c = now; q->tokens = toks; q->ptokens = ptoks; qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; qdisc_bstats_update(sch, skb); return skb; } qdisc_watchdog_schedule_ns(&q->watchdog, now + max_t(long, -toks, -ptoks)); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, but, however, this is wrong in principle. We MUST NOT reorder packets under these circumstances. Really, if we split the flow into independent subflows, it would be a very good solution. This is the main idea of all FQ algorithms (cf. CSZ, HPFQ, HFSC) */ qdisc_qstats_overlimit(sch); } return NULL; } static void tbf_reset(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); qdisc_reset(q->qdisc); q->t_c = ktime_get_ns(); q->tokens = q->buffer; q->ptokens = q->mtu; qdisc_watchdog_cancel(&q->watchdog); } static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = { [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) }, [TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_TBF_RATE64] = { .type = NLA_U64 }, [TCA_TBF_PRATE64] = { .type = NLA_U64 }, [TCA_TBF_BURST] = { .type = NLA_U32 }, [TCA_TBF_PBURST] = { .type = NLA_U32 }, }; static int tbf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { int err; struct tbf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_TBF_MAX + 1]; struct tc_tbf_qopt *qopt; struct Qdisc *child = NULL; struct Qdisc *old = NULL; struct psched_ratecfg rate; struct psched_ratecfg peak; u64 max_size; s64 buffer, mtu; u64 rate64 = 0, prate64 = 0; err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy, NULL); if (err < 0) return err; err 
= -EINVAL; if (tb[TCA_TBF_PARMS] == NULL) goto done; qopt = nla_data(tb[TCA_TBF_PARMS]); if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE) qdisc_put_rtab(qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB], NULL)); if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE) qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB], NULL)); buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U); mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U); if (tb[TCA_TBF_RATE64]) rate64 = nla_get_u64(tb[TCA_TBF_RATE64]); psched_ratecfg_precompute(&rate, &qopt->rate, rate64); if (tb[TCA_TBF_BURST]) { max_size = nla_get_u32(tb[TCA_TBF_BURST]); buffer = psched_l2t_ns(&rate, max_size); } else { max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U); } if (qopt->peakrate.rate) { if (tb[TCA_TBF_PRATE64]) prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]); psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64); if (peak.rate_bytes_ps <= rate.rate_bytes_ps) { pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n", peak.rate_bytes_ps, rate.rate_bytes_ps); err = -EINVAL; goto done; } if (tb[TCA_TBF_PBURST]) { u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]); max_size = min_t(u32, max_size, pburst); mtu = psched_l2t_ns(&peak, pburst); } else { max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu)); } } else { memset(&peak, 0, sizeof(peak)); } if (max_size < psched_mtu(qdisc_dev(sch))) pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n", max_size, qdisc_dev(sch)->name, psched_mtu(qdisc_dev(sch))); if (!max_size) { err = -EINVAL; goto done; } if (q->qdisc != &noop_qdisc) { err = fifo_set_limit(q->qdisc, qopt->limit); if (err) goto done; } else if (qopt->limit > 0) { child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit, extack); if (IS_ERR(child)) { err = PTR_ERR(child); goto done; } /* child is fifo, no need to check for noop_qdisc */ qdisc_hash_add(child, true); } sch_tree_lock(sch); if (child) { qdisc_purge_queue(q->qdisc); old = 
q->qdisc; q->qdisc = child; } q->limit = qopt->limit; if (tb[TCA_TBF_PBURST]) q->mtu = mtu; else q->mtu = PSCHED_TICKS2NS(qopt->mtu); q->max_size = max_size; if (tb[TCA_TBF_BURST]) q->buffer = buffer; else q->buffer = PSCHED_TICKS2NS(qopt->buffer); q->tokens = q->buffer; q->ptokens = q->mtu; memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg)); memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg)); sch_tree_unlock(sch); qdisc_put(old); err = 0; tbf_offload_change(sch); done: return err; } static int tbf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = &noop_qdisc; if (!opt) return -EINVAL; q->t_c = ktime_get_ns(); return tbf_change(sch, opt, extack); } static void tbf_destroy(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); tbf_offload_destroy(sch); qdisc_put(q->qdisc); } static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tbf_sched_data *q = qdisc_priv(sch); struct nlattr *nest; struct tc_tbf_qopt opt; int err; err = tbf_offload_dump(sch); if (err) return err; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; opt.limit = q->limit; psched_ratecfg_getrate(&opt.rate, &q->rate); if (tbf_peak_present(q)) psched_ratecfg_getrate(&opt.peakrate, &q->peak); else memset(&opt.peakrate, 0, sizeof(opt.peakrate)); opt.mtu = PSCHED_NS2TICKS(q->mtu); opt.buffer = PSCHED_NS2TICKS(q->buffer); if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (q->rate.rate_bytes_ps >= (1ULL << 32) && nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps, TCA_TBF_PAD)) goto nla_put_failure; if (tbf_peak_present(q) && q->peak.rate_bytes_ps >= (1ULL << 32) && nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps, TCA_TBF_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return 
-1; } static int tbf_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct tbf_sched_data *q = qdisc_priv(sch); tcm->tcm_handle |= TC_H_MIN(1); tcm->tcm_info = q->qdisc->handle; return 0; } static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct tbf_sched_data *q = qdisc_priv(sch); if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->qdisc); tbf_offload_graft(sch, new, *old, extack); return 0; } static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg) { struct tbf_sched_data *q = qdisc_priv(sch); return q->qdisc; } static unsigned long tbf_find(struct Qdisc *sch, u32 classid) { return 1; } static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { tc_qdisc_stats_dump(sch, 1, walker); } } static const struct Qdisc_class_ops tbf_class_ops = { .graft = tbf_graft, .leaf = tbf_leaf, .find = tbf_find, .walk = tbf_walk, .dump = tbf_dump_class, }; static struct Qdisc_ops tbf_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &tbf_class_ops, .id = "tbf", .priv_size = sizeof(struct tbf_sched_data), .enqueue = tbf_enqueue, .dequeue = tbf_dequeue, .peek = qdisc_peek_dequeued, .init = tbf_init, .reset = tbf_reset, .destroy = tbf_destroy, .change = tbf_change, .dump = tbf_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("tbf"); static int __init tbf_module_init(void) { return register_qdisc(&tbf_qdisc_ops); } static void __exit tbf_module_exit(void) { unregister_qdisc(&tbf_qdisc_ops); } module_init(tbf_module_init) module_exit(tbf_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Token Bucket Filter qdisc");
5 5 1 3 15 15 15 7 9 11 10 5 5 7 12 12 2 7 1 1 5 14 1 3 11 5 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 // SPDX-License-Identifier: GPL-2.0-or-later /* XTS: as defined in IEEE1619/D16 * http://grouper.ieee.org/groups/1619/email/pdf00086.pdf * 
*
 * Copyright (c) 2007 Rik Snel <rsnel@cube.dyndns.org>
 *
 * Based on ecb.c
 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
 */
#include <crypto/internal/cipher.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

#include <crypto/xts.h>
#include <crypto/b128ops.h>
#include <crypto/gf128mul.h>

/* Per-transform context: the two underlying cipher handles. */
struct xts_tfm_ctx {
	struct crypto_skcipher *child;	/* data cipher, keyed with Key1 */
	struct crypto_cipher *tweak;	/* tweak cipher, keyed with Key2 */
};

/* Per-instance context: spawns from which the two ciphers are created. */
struct xts_instance_ctx {
	struct crypto_skcipher_spawn spawn;
	struct crypto_cipher_spawn tweak_spawn;
};

/* Per-request context. */
struct xts_request_ctx {
	le128 t;			/* tweak value carried between passes */
	struct scatterlist *tail;	/* start of the last full block in dst,
					 * used by the ciphertext-stealing step
					 */
	struct scatterlist sg[2];	/* backing storage for scatterwalk_ffwd() */
	struct skcipher_request subreq;	/* request issued to the child cipher */
};

/*
 * Set the XTS key.  After xts_verify_key() accepts it, the key is split in
 * half: the second half keys the tweak cipher and the first half keys the
 * data cipher.  The request flags of @parent are propagated to both.
 *
 * Returns 0 on success or the first error encountered.
 */
static int xts_setkey(struct crypto_skcipher *parent, const u8 *key,
		      unsigned int keylen)
{
	struct xts_tfm_ctx *ctx = crypto_skcipher_ctx(parent);
	struct crypto_skcipher *child;
	struct crypto_cipher *tweak;
	int err;

	err = xts_verify_key(parent, key, keylen);
	if (err)
		return err;

	keylen /= 2;

	/* we need two cipher instances: one to compute the initial 'tweak'
	 * by encrypting the IV (usually the 'plain' iv) and the other
	 * one to encrypt and decrypt the data */

	/* tweak cipher, uses Key2 i.e. the second half of *key */
	tweak = ctx->tweak;
	crypto_cipher_clear_flags(tweak, CRYPTO_TFM_REQ_MASK);
	crypto_cipher_set_flags(tweak, crypto_skcipher_get_flags(parent) &
				       CRYPTO_TFM_REQ_MASK);
	err = crypto_cipher_setkey(tweak, key + keylen, keylen);
	if (err)
		return err;

	/* data cipher, uses Key1 i.e. the first half of *key */
	child = ctx->child;
	crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
	crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) &
					 CRYPTO_TFM_REQ_MASK);
	return crypto_skcipher_setkey(child, key, keylen);
}

/*
 * We compute the tweak masks twice (both before and after the ECB encryption or
 * decryption) to avoid having to allocate a temporary buffer and/or make
 * multiple calls to the 'ecb(..)' instance, which usually would be slower than
 * just doing the gf128mul_x_ble() calls again.
 *
 * Each block is XORed with the running tweak, which is then advanced with
 * gf128mul_x_ble().  On the second pass the walk runs over the subrequest
 * instead of the original request.
 *
 * When the request length is not a multiple of the block size (cts), the
 * last full block is handled separately and the function returns 0 from
 * inside the walk: on decryption the tweak is advanced once more before the
 * XOR (and, on the second pass, the un-advanced tweak is saved in rctx->t);
 * on the second encryption pass the next tweak is saved in rctx->t.
 */
static int xts_xor_tweak(struct skcipher_request *req, bool second_pass,
			 bool enc)
{
	struct xts_request_ctx *rctx = skcipher_request_ctx(req);
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	const bool cts = (req->cryptlen % XTS_BLOCK_SIZE);
	const int bs = XTS_BLOCK_SIZE;
	struct skcipher_walk w;
	le128 t = rctx->t;
	int err;

	if (second_pass) {
		req = &rctx->subreq;
		/* set to our TFM to enforce correct alignment: */
		skcipher_request_set_tfm(req, tfm);
	}
	err = skcipher_walk_virt(&w, req, false);

	while (w.nbytes) {
		unsigned int avail = w.nbytes;
		const le128 *wsrc;
		le128 *wdst;

		wsrc = w.src.virt.addr;
		wdst = w.dst.virt.addr;

		do {
			/* Fewer than two blocks left overall: last full block. */
			if (unlikely(cts) &&
			    w.total - w.nbytes + avail < 2 * XTS_BLOCK_SIZE) {
				if (!enc) {
					if (second_pass)
						rctx->t = t;
					gf128mul_x_ble(&t, &t);
				}
				le128_xor(wdst, &t, wsrc);
				if (enc && second_pass)
					gf128mul_x_ble(&rctx->t, &t);
				skcipher_walk_done(&w, avail - bs);
				return 0;
			}

			le128_xor(wdst++, &t, wsrc++);
			gf128mul_x_ble(&t, &t);
		} while ((avail -= bs) >= bs);

		err = skcipher_walk_done(&w, avail);
	}

	return err;
}

/* First pass: XOR the input with the tweak before the child cipher runs. */
static int xts_xor_tweak_pre(struct skcipher_request *req, bool enc)
{
	return xts_xor_tweak(req, false, enc);
}

/* Second pass: XOR the child cipher's output with the tweak. */
static int xts_xor_tweak_post(struct skcipher_request *req, bool enc)
{
	return xts_xor_tweak(req, true, enc);
}

/*
 * Completion callback for the ciphertext-stealing subrequest: on success,
 * XOR the block at rctx->tail with rctx->t in place, then complete the
 * original request with @err.
 */
static void xts_cts_done(void *data, int err)
{
	struct skcipher_request *req = data;
	le128 b;

	if (!err) {
		struct xts_request_ctx *rctx = skcipher_request_ctx(req);

		scatterwalk_map_and_copy(&b, rctx->tail, 0, XTS_BLOCK_SIZE, 0);
		le128_xor(&b, &rctx->t, &b);
		scatterwalk_map_and_copy(&b, rctx->tail, 0, XTS_BLOCK_SIZE, 1);
	}

	skcipher_request_complete(req, err);
}

/*
 * Ciphertext-stealing step for requests whose length is not a multiple of
 * the block size.
 *
 * The last full block already written to dst is read into b[0] and copied
 * to b[1]; the trailing partial bytes of src then overwrite the start of
 * b[0].  b[0] is XORed with rctx->t and both blocks' worth
 * (XTS_BLOCK_SIZE + tail bytes) are written back at rctx->tail.  One more
 * block is then processed through @crypt on the child cipher and XORed with
 * rctx->t again.  If @crypt returns non-zero (including an asynchronous
 * status), that value is returned and the final XOR is left to
 * xts_cts_done().
 */
static int xts_cts_final(struct skcipher_request *req,
			 int (*crypt)(struct skcipher_request *req))
{
	const struct xts_tfm_ctx *ctx =
		crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
	int offset = req->cryptlen & ~(XTS_BLOCK_SIZE - 1);
	struct xts_request_ctx *rctx = skcipher_request_ctx(req);
	struct skcipher_request *subreq = &rctx->subreq;
	int tail = req->cryptlen % XTS_BLOCK_SIZE;
	le128 b[2];
	int err;

	rctx->tail = scatterwalk_ffwd(rctx->sg, req->dst,
				      offset - XTS_BLOCK_SIZE);

	scatterwalk_map_and_copy(b, rctx->tail, 0, XTS_BLOCK_SIZE, 0);
	b[1] = b[0];
	scatterwalk_map_and_copy(b, req->src, offset, tail, 0);

	le128_xor(b, &rctx->t, b);

	scatterwalk_map_and_copy(b, rctx->tail, 0, XTS_BLOCK_SIZE + tail, 1);

	skcipher_request_set_tfm(subreq, ctx->child);
	skcipher_request_set_callback(subreq, req->base.flags, xts_cts_done,
				      req);
	skcipher_request_set_crypt(subreq, rctx->tail, rctx->tail,
				   XTS_BLOCK_SIZE, NULL);

	err = crypt(subreq);
	if (err)
		return err;

	scatterwalk_map_and_copy(b, rctx->tail, 0, XTS_BLOCK_SIZE, 0);
	le128_xor(b, &rctx->t, b);
	scatterwalk_map_and_copy(b, rctx->tail, 0, XTS_BLOCK_SIZE, 1);

	return 0;
}

/*
 * Completion callback for the child encryption: run the second tweak pass
 * and, if needed, the ciphertext-stealing step.  When that step goes
 * asynchronous (-EINPROGRESS/-EBUSY) completion is left to xts_cts_done().
 */
static void xts_encrypt_done(void *data, int err)
{
	struct skcipher_request *req = data;

	if (!err) {
		struct xts_request_ctx *rctx = skcipher_request_ctx(req);

		/* Keep only the MAY_BACKLOG flag on the subrequest. */
		rctx->subreq.base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG;
		err = xts_xor_tweak_post(req, true);

		if (!err && unlikely(req->cryptlen % XTS_BLOCK_SIZE)) {
			err = xts_cts_final(req, crypto_skcipher_encrypt);
			if (err == -EINPROGRESS || err == -EBUSY)
				return;
		}
	}

	skcipher_request_complete(req, err);
}

/* Decryption counterpart of xts_encrypt_done(). */
static void xts_decrypt_done(void *data, int err)
{
	struct skcipher_request *req = data;

	if (!err) {
		struct xts_request_ctx *rctx = skcipher_request_ctx(req);

		/* Keep only the MAY_BACKLOG flag on the subrequest. */
		rctx->subreq.base.flags &= CRYPTO_TFM_REQ_MAY_BACKLOG;
		err = xts_xor_tweak_post(req, false);

		if (!err && unlikely(req->cryptlen % XTS_BLOCK_SIZE)) {
			err = xts_cts_final(req, crypto_skcipher_decrypt);
			if (err == -EINPROGRESS || err == -EBUSY)
				return;
		}
	}

	skcipher_request_complete(req, err);
}

/*
 * Common request setup.  Rejects requests shorter than one block with
 * -EINVAL, prepares the subrequest to process the whole-block part of the
 * data in place in req->dst with @compl as completion callback, and
 * computes the initial tweak by encrypting the IV with the tweak cipher.
 */
static int xts_init_crypt(struct skcipher_request *req,
			  crypto_completion_t compl)
{
	const struct xts_tfm_ctx *ctx =
		crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
	struct xts_request_ctx *rctx = skcipher_request_ctx(req);
	struct skcipher_request *subreq = &rctx->subreq;

	if (req->cryptlen < XTS_BLOCK_SIZE)
		return -EINVAL;

	skcipher_request_set_tfm(subreq, ctx->child);
	skcipher_request_set_callback(subreq, req->base.flags, compl, req);
	skcipher_request_set_crypt(subreq, req->dst, req->dst,
				   req->cryptlen & ~(XTS_BLOCK_SIZE - 1), NULL);

	/* calculate first value of T */
	crypto_cipher_encrypt_one(ctx->tweak, (u8 *)&rctx->t, req->iv);

	return 0;
}

/*
 * Encrypt: setup, first tweak pass, child encryption, second tweak pass;
 * the chain stops at the first non-zero result.  A trailing partial block
 * is then handled by xts_cts_final().
 */
static int xts_encrypt(struct skcipher_request *req)
{
	struct xts_request_ctx *rctx = skcipher_request_ctx(req);
	struct skcipher_request *subreq = &rctx->subreq;
	int err;

	err = xts_init_crypt(req, xts_encrypt_done) ?:
	      xts_xor_tweak_pre(req, true) ?:
	      crypto_skcipher_encrypt(subreq) ?:
	      xts_xor_tweak_post(req, true);

	if (err || likely((req->cryptlen % XTS_BLOCK_SIZE) == 0))
		return err;

	return xts_cts_final(req, crypto_skcipher_encrypt);
}

/* Decryption counterpart of xts_encrypt(). */
static int xts_decrypt(struct skcipher_request *req)
{
	struct xts_request_ctx *rctx = skcipher_request_ctx(req);
	struct skcipher_request *subreq = &rctx->subreq;
	int err;

	err = xts_init_crypt(req, xts_decrypt_done) ?:
	      xts_xor_tweak_pre(req, false) ?:
	      crypto_skcipher_decrypt(subreq) ?:
	      xts_xor_tweak_post(req, false);

	if (err || likely((req->cryptlen % XTS_BLOCK_SIZE) == 0))
		return err;

	return xts_cts_final(req, crypto_skcipher_decrypt);
}

/*
 * Instantiate the child skcipher and the tweak cipher for @tfm and size the
 * request context to hold struct xts_request_ctx plus the child's own
 * request context.  The child is freed again if the tweak cipher cannot be
 * created.
 */
static int xts_init_tfm(struct crypto_skcipher *tfm)
{
	struct skcipher_instance *inst = skcipher_alg_instance(tfm);
	struct xts_instance_ctx *ictx = skcipher_instance_ctx(inst);
	struct xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct crypto_skcipher *child;
	struct crypto_cipher *tweak;

	child = crypto_spawn_skcipher(&ictx->spawn);
	if (IS_ERR(child))
		return PTR_ERR(child);

	ctx->child = child;

	tweak = crypto_spawn_cipher(&ictx->tweak_spawn);
	if (IS_ERR(tweak)) {
		crypto_free_skcipher(ctx->child);
		return PTR_ERR(tweak);
	}

	ctx->tweak = tweak;

	crypto_skcipher_set_reqsize(tfm, crypto_skcipher_reqsize(child) +
					 sizeof(struct xts_request_ctx));

	return 0;
}

/* Release both ciphers created by xts_init_tfm(). */
static void xts_exit_tfm(struct crypto_skcipher *tfm)
{
	struct xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);

	crypto_free_skcipher(ctx->child);
	crypto_free_cipher(ctx->tweak);
}

/* Drop both spawns and free the instance. */
static void xts_free_instance(struct skcipher_instance *inst)
{
	struct xts_instance_ctx *ictx = skcipher_instance_ctx(inst);

	crypto_drop_skcipher(&ictx->spawn);
	crypto_drop_cipher(&ictx->tweak_spawn);
	kfree(inst);
}

static int xts_create(struct crypto_template *tmpl, struct rtattr **tb)
{
	struct skcipher_alg_common *alg;
	char name[CRYPTO_MAX_ALG_NAME];
	struct skcipher_instance *inst;
	struct xts_instance_ctx *ctx;
	const char *cipher_name;
	u32 mask;
	int err;

	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask);
	if (err)
		return err;

	cipher_name = crypto_attr_alg_name(tb[1]);
	if (IS_ERR(cipher_name))
		return PTR_ERR(cipher_name);

	inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
	if (!inst)
		return -ENOMEM;

	ctx = skcipher_instance_ctx(inst);

	err = crypto_grab_skcipher(&ctx->spawn, skcipher_crypto_instance(inst),
				   cipher_name, 0, mask);
	if (err == -ENOENT && memcmp(cipher_name, "ecb(", 4)) {
		err = -ENAMETOOLONG;
		if (snprintf(name, CRYPTO_MAX_ALG_NAME, "ecb(%s)",
			     cipher_name) >= CRYPTO_MAX_ALG_NAME)
			goto err_free_inst;

		err = crypto_grab_skcipher(&ctx->spawn,
					   skcipher_crypto_instance(inst),
					   name, 0, mask);
	}

	if (err)
		goto err_free_inst;

	alg = crypto_spawn_skcipher_alg_common(&ctx->spawn);

	err = -EINVAL;
	if (alg->base.cra_blocksize != XTS_BLOCK_SIZE)
		goto err_free_inst;

	if (alg->ivsize)
		goto err_free_inst;

	err = crypto_inst_setname(skcipher_crypto_instance(inst), "xts",
				  &alg->base);
	if (err)
		goto
err_free_inst; err = -EINVAL; cipher_name = alg->base.cra_name; /* Alas we screwed up the naming so we have to mangle the * cipher name. */ if (!memcmp(cipher_name, "ecb(", 4)) { int len; len = strscpy(name, cipher_name + 4, sizeof(name)); if (len < 2) goto err_free_inst; if (name[len - 1] != ')') goto err_free_inst; name[len - 1] = 0; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "xts(%s)", name) >= CRYPTO_MAX_ALG_NAME) { err = -ENAMETOOLONG; goto err_free_inst; } } else goto err_free_inst; err = crypto_grab_cipher(&ctx->tweak_spawn, skcipher_crypto_instance(inst), name, 0, mask); if (err) goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = XTS_BLOCK_SIZE; inst->alg.base.cra_alignmask = alg->base.cra_alignmask | (__alignof__(u64) - 1); inst->alg.ivsize = XTS_BLOCK_SIZE; inst->alg.min_keysize = alg->min_keysize * 2; inst->alg.max_keysize = alg->max_keysize * 2; inst->alg.base.cra_ctxsize = sizeof(struct xts_tfm_ctx); inst->alg.init = xts_init_tfm; inst->alg.exit = xts_exit_tfm; inst->alg.setkey = xts_setkey; inst->alg.encrypt = xts_encrypt; inst->alg.decrypt = xts_decrypt; inst->free = xts_free_instance; err = skcipher_register_instance(tmpl, inst); if (err) { err_free_inst: xts_free_instance(inst); } return err; } static struct crypto_template xts_tmpl = { .name = "xts", .create = xts_create, .module = THIS_MODULE, }; static int __init xts_module_init(void) { return crypto_register_template(&xts_tmpl); } static void __exit xts_module_exit(void) { crypto_unregister_template(&xts_tmpl); } module_init(xts_module_init); module_exit(xts_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("XTS block cipher mode"); MODULE_ALIAS_CRYPTO("xts"); MODULE_IMPORT_NS("CRYPTO_INTERNAL"); MODULE_SOFTDEP("pre: ecb");
9 9 9 9 9 9 9 9 5 5 5 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 
521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 // SPDX-License-Identifier: GPL-2.0-or-later /* Request a key from userspace * * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com)
 *
 * See Documentation/security/keys/request-key.rst
 */

#include <linux/export.h>
#include <linux/sched.h>
#include <linux/kmod.h>
#include <linux/err.h>
#include <linux/keyctl.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include "internal.h"
#include <keys/request_key_auth-type.h>

#define key_negative_timeout	60	/* default timeout on a negative key's existence */

/*
 * Return a new reference to the current task's cached requested key if it
 * matches the search in @ctx (via ctx->match_data.cmp) and is neither
 * invalidated nor revoked.  Returns NULL otherwise, and always when
 * CONFIG_KEYS_REQUEST_CACHE is not set.
 */
static struct key *check_cached_key(struct keyring_search_context *ctx)
{
#ifdef CONFIG_KEYS_REQUEST_CACHE
	struct key *key = current->cached_requested_key;

	if (key &&
	    ctx->match_data.cmp(key, &ctx->match_data) &&
	    !(key->flags & ((1 << KEY_FLAG_INVALIDATED) |
			    (1 << KEY_FLAG_REVOKED))))
		return key_get(key);
#endif
	return NULL;
}

/*
 * Remember @key as the current task's cached requested key, replacing (and
 * releasing) any previously cached one.  Does nothing when
 * CONFIG_KEYS_REQUEST_CACHE is not set.
 */
static void cache_requested_key(struct key *key)
{
#ifdef CONFIG_KEYS_REQUEST_CACHE
	struct task_struct *t = current;

	/* Do not cache key if it is a kernel thread */
	if (!(t->flags & PF_KTHREAD)) {
		key_put(t->cached_requested_key);
		t->cached_requested_key = key_get(key);
		/*
		 * NOTE(review): presumably the cache is dropped from the
		 * notify-resume path - confirm where it is cleared.
		 */
		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
	}
#endif
}

/**
 * complete_request_key - Complete the construction of a key.
 * @authkey: The authorisation key.
 * @error: The success or failure of the construction.
 *
 * Complete the attempt to construct a key.  The key will be negated
 * if an error is indicated.  The authorisation key will be revoked
 * unconditionally.
 */
void complete_request_key(struct key *authkey, int error)
{
	struct request_key_auth *rka = get_request_key_auth(authkey);
	struct key *key = rka->target_key;

	kenter("%d{%d},%d", authkey->serial, key->serial, error);

	/*
	 * On error the target key is negated with the default timeout and
	 * authkey is handed to key_negate_and_link(); otherwise authkey is
	 * revoked directly.
	 */
	if (error < 0)
		key_negate_and_link(key, key_negative_timeout, NULL, authkey);
	else
		key_revoke(authkey);
}
EXPORT_SYMBOL(complete_request_key);

/*
 * Initialise a usermode helper that is going to have a specific session
 * keyring.
 *
 * This is called in context of freshly forked kthread before kernel_execve(),
 * so we can simply install the desired session_keyring at this point.
*/
static int umh_keys_init(struct subprocess_info *info, struct cred *cred)
{
	/* info->data carries the keyring given to call_usermodehelper_setup() */
	struct key *keyring = info->data;

	return install_session_keyring_to_cred(cred, keyring);
}

/*
 * Clean up a usermode helper with session keyring.
 */
static void umh_keys_cleanup(struct subprocess_info *info)
{
	struct key *keyring = info->data;
	/* drops the reference taken in call_usermodehelper_keys() */
	key_put(keyring);
}

/*
 * Call a usermode helper with a specific session keyring.
 *
 * Returns -ENOMEM if the helper could not be set up, otherwise the result of
 * call_usermodehelper_exec().  A reference on @session_keyring is taken for
 * the helper once setup has succeeded; umh_keys_cleanup() releases it.
 */
static int call_usermodehelper_keys(const char *path, char **argv, char **envp,
					struct key *session_keyring, int wait)
{
	struct subprocess_info *info;

	info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL,
					  umh_keys_init, umh_keys_cleanup,
					  session_keyring);
	if (!info)
		return -ENOMEM;

	key_get(session_keyring);
	return call_usermodehelper_exec(info, wait);
}

/*
 * Request userspace finish the construction of a key
 * - execute "/sbin/request-key <op> <key> <uid> <gid> <keyring> <keyring> <keyring>"
 *
 * The three keyring arguments are the serials of the caller's thread, process
 * and session keyrings (0 for a missing thread or process keyring; the user
 * session keyring stands in for a missing session keyring).
 *
 * complete_request_key() is called on @authkey on every exit path.  Returns 0
 * if the key ended up instantiated and valid, -ENOKEY if the helper ran but
 * the key is still under construction or fails validation, or the negative
 * error of whichever step failed.
 */
static int call_sbin_request_key(struct key *authkey, void *aux)
{
	static char const request_key[] = "/sbin/request-key";
	struct request_key_auth *rka = get_request_key_auth(authkey);
	const struct cred *cred = current_cred();
	key_serial_t prkey, sskey;
	struct key *key = rka->target_key, *keyring, *session, *user_session;
	char *argv[9], *envp[3], uid_str[12], gid_str[12];
	char key_str[12], keyring_str[3][12];
	char desc[20];
	int ret, i;

	kenter("{%d},{%d},%s", key->serial, authkey->serial, rka->op);

	ret = look_up_user_keyrings(NULL, &user_session);
	if (ret < 0)
		goto error_us;

	/* allocate a new session keyring */
	sprintf(desc, "_req.%u", key->serial);

	/*
	 * NOTE(review): the reference from get_current_cred() is dropped right
	 * after keyring_alloc(), yet 'cred' is still dereferenced below; this
	 * looks like it relies on the current task keeping its own credentials
	 * alive - confirm.
	 */
	cred = get_current_cred();
	keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred,
				KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ,
				KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL);
	put_cred(cred);
	if (IS_ERR(keyring)) {
		ret = PTR_ERR(keyring);
		goto error_alloc;
	}

	/* attach the auth key to the session keyring */
	ret = key_link(keyring, authkey);
	if (ret < 0)
		goto error_link;

	/* record the UID and GID */
	sprintf(uid_str, "%d", from_kuid(&init_user_ns, cred->fsuid));
	sprintf(gid_str, "%d", from_kgid(&init_user_ns, cred->fsgid));

	/* we say which key is under construction */
	sprintf(key_str, "%d", key->serial);

	/* we specify the process's default keyrings */
	sprintf(keyring_str[0], "%d",
		cred->thread_keyring ? cred->thread_keyring->serial : 0);

	prkey = 0;
	if (cred->process_keyring)
		prkey = cred->process_keyring->serial;
	sprintf(keyring_str[1], "%d", prkey);

	/* fall back to the user session keyring looked up above */
	session = cred->session_keyring;
	if (!session)
		session = user_session;
	sskey = session->serial;

	sprintf(keyring_str[2], "%d", sskey);

	/* set up a minimal environment */
	i = 0;
	envp[i++] = "HOME=/";
	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
	envp[i] = NULL;

	/* set up the argument list */
	i = 0;
	argv[i++] = (char *)request_key;
	argv[i++] = (char *)rka->op;
	argv[i++] = key_str;
	argv[i++] = uid_str;
	argv[i++] = gid_str;
	argv[i++] = keyring_str[0];
	argv[i++] = keyring_str[1];
	argv[i++] = keyring_str[2];
	argv[i] = NULL;

	/* do it */
	ret = call_usermodehelper_keys(request_key, argv, envp, keyring,
				       UMH_WAIT_PROC);
	kdebug("usermode -> 0x%x", ret);
	if (ret >= 0) {
		/* ret is the exit/wait code */
		if (test_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags) ||
		    key_validate(key) < 0)
			ret = -ENOKEY;
		else
			/* ignore any errors from userspace if the key was
			 * instantiated */
			ret = 0;
	}

	/* unwind in reverse order of acquisition */
error_link:
	key_put(keyring);

error_alloc:
	key_put(user_session);
error_us:
	complete_request_key(authkey, ret);
	kleave(" = %d", ret);
	return ret;
}

/*
 * Call out to userspace for key construction.
 *
 * Program failure is ignored in favour of key status.
*/
static int construct_key(struct key *key, const void *callout_info,
			 size_t callout_len, void *aux,
			 struct key *dest_keyring)
{
	request_key_actor_t actor;
	struct key *authkey;
	int ret;

	kenter("%d,%p,%zu,%p", key->serial, callout_info, callout_len, aux);

	/* allocate an authorisation key */
	authkey = request_key_auth_new(key, "create", callout_info, callout_len,
				       dest_keyring);
	if (IS_ERR(authkey))
		return PTR_ERR(authkey);

	/* Make the call */
	/* the key type's own request_key op, if any, replaces the default */
	actor = call_sbin_request_key;
	if (key->type->request_key)
		actor = key->type->request_key;

	ret = actor(authkey, aux);

	/* check that the actor called complete_request_key() prior to
	 * returning an error */
	WARN_ON(ret < 0 &&
		!test_bit(KEY_FLAG_INVALIDATED, &authkey->flags));

	key_put(authkey);
	kleave(" = %d", ret);
	return ret;
}

/*
 * Get the appropriate destination keyring for the request.
 *
 * The keyring selected is returned with an extra reference upon it which the
 * caller must release.
 *
 * @_dest_keyring: in: the caller's keyring or NULL; out: the keyring chosen.
 *
 * Returns 0 on success, or a negative error from look_up_user_keyrings() or
 * key_permission(); *_dest_keyring is only written on success.
 */
static int construct_get_dest_keyring(struct key **_dest_keyring)
{
	struct request_key_auth *rka;
	const struct cred *cred = current_cred();
	struct key *dest_keyring = *_dest_keyring, *authkey;
	int ret;

	kenter("%p", dest_keyring);

	/* find the appropriate keyring */
	if (dest_keyring) {
		/* the caller supplied one */
		key_get(dest_keyring);
	} else {
		bool do_perm_check = true;

		/* use a default keyring; falling through the cases until we
		 * find one that we actually have */
		switch (cred->jit_keyring) {
		case KEY_REQKEY_DEFL_DEFAULT:
		case KEY_REQKEY_DEFL_REQUESTOR_KEYRING:
			if (cred->request_key_auth) {
				authkey = cred->request_key_auth;
				/* read the auth key's payload under its sem */
				down_read(&authkey->sem);
				rka = get_request_key_auth(authkey);
				if (!test_bit(KEY_FLAG_REVOKED,
					      &authkey->flags))
					dest_keyring =
						key_get(rka->dest_keyring);
				up_read(&authkey->sem);
				if (dest_keyring) {
					do_perm_check = false;
					break;
				}
			}

			fallthrough;
		case KEY_REQKEY_DEFL_THREAD_KEYRING:
			dest_keyring = key_get(cred->thread_keyring);
			if (dest_keyring)
				break;

			fallthrough;
		case KEY_REQKEY_DEFL_PROCESS_KEYRING:
			dest_keyring = key_get(cred->process_keyring);
			if (dest_keyring)
				break;

			fallthrough;
		case KEY_REQKEY_DEFL_SESSION_KEYRING:
			dest_keyring = key_get(cred->session_keyring);

			if (dest_keyring)
				break;

			fallthrough;
		case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
			ret = look_up_user_keyrings(NULL, &dest_keyring);
			if (ret < 0)
				return ret;
			break;

		case KEY_REQKEY_DEFL_USER_KEYRING:
			ret = look_up_user_keyrings(&dest_keyring, NULL);
			if (ret < 0)
				return ret;
			break;

		/* the group keyring and any unknown value hit BUG() */
		case KEY_REQKEY_DEFL_GROUP_KEYRING:
		default:
			BUG();
		}

		/*
		 * Require Write permission on the keyring.  This is essential
		 * because the default keyring may be the session keyring, and
		 * joining a keyring only requires Search permission.
		 *
		 * However, this check is skipped for the "requestor keyring" so
		 * that /sbin/request-key can itself use request_key() to add
		 * keys to the original requestor's destination keyring.
		 */
		if (dest_keyring && do_perm_check) {
			ret = key_permission(make_key_ref(dest_keyring, 1),
					     KEY_NEED_WRITE);
			if (ret) {
				key_put(dest_keyring);
				return ret;
			}
		}
	}

	*_dest_keyring = dest_keyring;
	kleave(" [dk %d]", key_serial(dest_keyring));
	return 0;
}

/*
 * Allocate a new key in under-construction state and attempt to link it in to
 * the requested keyring.
 *
 * May return a key that's already under construction instead if there was a
 * race between two thread calling request_key().
*/
static int construct_alloc_key(struct keyring_search_context *ctx,
			       struct key *dest_keyring,
			       unsigned long flags,
			       struct key_user *user,
			       struct key **_key)
{
	struct assoc_array_edit *edit = NULL;
	struct key *key;
	key_perm_t perm;
	key_ref_t key_ref;
	int ret;

	kenter("%s,%s,,,",
	       ctx->index_key.type->name, ctx->index_key.description);

	*_key = NULL;
	/* user->cons_lock is held until every exit path below releases it */
	mutex_lock(&user->cons_lock);

	/* READ and WRITE are only granted if the key type supports them */
	perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR;
	perm |= KEY_USR_VIEW;
	if (ctx->index_key.type->read)
		perm |= KEY_POS_READ;
	if (ctx->index_key.type == &key_type_keyring ||
	    ctx->index_key.type->update)
		perm |= KEY_POS_WRITE;

	key = key_alloc(ctx->index_key.type, ctx->index_key.description,
			ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred,
			perm, flags, NULL);
	if (IS_ERR(key))
		goto alloc_failed;

	/* mark the new key as still being constructed */
	set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags);

	if (dest_keyring) {
		ret = __key_link_lock(dest_keyring, &key->index_key);
		if (ret < 0)
			goto link_lock_failed;
	}

	/*
	 * Attach the key to the destination keyring under lock, but we do need
	 * to do another check just in case someone beat us to it whilst we
	 * waited for locks.
	 *
	 * The caller might specify a comparison function which looks for keys
	 * that do not exactly match but are still equivalent from the caller's
	 * perspective. The __key_link_begin() operation must be done only after
	 * an actual key is determined.
	 */
	mutex_lock(&key_construction_mutex);

	rcu_read_lock();
	key_ref = search_process_keyrings_rcu(ctx);
	rcu_read_unlock();
	if (!IS_ERR(key_ref))
		goto key_already_present;

	if (dest_keyring) {
		ret = __key_link_begin(dest_keyring, &key->index_key, &edit);
		if (ret < 0)
			goto link_alloc_failed;
		__key_link(dest_keyring, key, &edit);
	}

	mutex_unlock(&key_construction_mutex);
	if (dest_keyring)
		__key_link_end(dest_keyring, &key->index_key, edit);
	mutex_unlock(&user->cons_lock);
	*_key = key;
	kleave(" = 0 [%d]", key_serial(key));
	return 0;

	/* the key is now present - we tell the caller that we found it by
	 * returning -EINPROGRESS  */
key_already_present:
	/* discard the key allocated above; from here 'key' is the found key */
	key_put(key);
	mutex_unlock(&key_construction_mutex);
	key = key_ref_to_ptr(key_ref);
	if (dest_keyring) {
		ret = __key_link_begin(dest_keyring, &key->index_key, &edit);
		if (ret < 0)
			goto link_alloc_failed_unlocked;
		ret = __key_link_check_live_key(dest_keyring, key);
		if (ret == 0)
			__key_link(dest_keyring, key, &edit);
		__key_link_end(dest_keyring, &key->index_key, edit);
		if (ret < 0)
			goto link_check_failed;
	}
	mutex_unlock(&user->cons_lock);
	*_key = key;
	kleave(" = -EINPROGRESS [%d]", key_serial(key));
	return -EINPROGRESS;

link_check_failed:
	mutex_unlock(&user->cons_lock);
	key_put(key);
	kleave(" = %d [linkcheck]", ret);
	return ret;

	/*
	 * Error unwinding: each label releases one more thing than the label
	 * after it, so control deliberately falls through.
	 */
link_alloc_failed:
	mutex_unlock(&key_construction_mutex);
link_alloc_failed_unlocked:
	__key_link_end(dest_keyring, &key->index_key, edit);
link_lock_failed:
	mutex_unlock(&user->cons_lock);
	key_put(key);
	kleave(" = %d [prelink]", ret);
	return ret;

alloc_failed:
	mutex_unlock(&user->cons_lock);
	kleave(" = %ld", PTR_ERR(key));
	return PTR_ERR(key);
}

/*
 * Commence key construction.
*/
static struct key *construct_key_and_link(struct keyring_search_context *ctx,
					  const char *callout_info,
					  size_t callout_len,
					  void *aux,
					  struct key *dest_keyring,
					  unsigned long flags)
{
	struct key_user *user;
	struct key *key;
	int ret;

	kenter("");

	/* keyrings cannot be constructed through this path */
	if (ctx->index_key.type == &key_type_keyring)
		return ERR_PTR(-EPERM);

	/* on success this holds a reference on dest_keyring (if non-NULL) */
	ret = construct_get_dest_keyring(&dest_keyring);
	if (ret)
		goto error;

	user = key_user_lookup(current_fsuid());
	if (!user) {
		ret = -ENOMEM;
		goto error_put_dest_keyring;
	}

	ret = construct_alloc_key(ctx, dest_keyring, flags, user, &key);
	key_user_put(user);

	if (ret == 0) {
		/* a fresh key was allocated: have it instantiated */
		ret = construct_key(key, callout_info, callout_len, aux,
				    dest_keyring);
		if (ret < 0) {
			kdebug("cons failed");
			goto construction_failed;
		}
	} else if (ret == -EINPROGRESS) {
		/* an existing matching key was returned instead */
		ret = 0;
	} else {
		goto error_put_dest_keyring;
	}

	key_put(dest_keyring);
	kleave(" = key %d", key_serial(key));
	return key;

construction_failed:
	key_negate_and_link(key, key_negative_timeout, NULL, NULL);
	key_put(key);
error_put_dest_keyring:
	key_put(dest_keyring);
error:
	kleave(" = %d", ret);
	return ERR_PTR(ret);
}

/**
 * request_key_and_link - Request a key and cache it in a keyring.
 * @type: The type of key we want.
 * @description: The searchable description of the key.
 * @domain_tag: The domain in which the key operates.
 * @callout_info: The data to pass to the instantiation upcall (or NULL).
 * @callout_len: The length of callout_info.
 * @aux: Auxiliary data for the upcall.
 * @dest_keyring: Where to cache the key.
 * @flags: Flags to key_alloc().
 *
 * A key matching the specified criteria (type, description, domain_tag) is
 * searched for in the process's keyrings and returned with its usage count
 * incremented if found.  Otherwise, if callout_info is not NULL, a key will be
 * allocated and some service (probably in userspace) will be asked to
 * instantiate it.
 *
 * If successfully found or created, the key will be linked to the destination
 * keyring if one is provided.
 *
 * Returns a pointer to the key if successful; -EACCES, -ENOKEY, -EKEYREVOKED
 * or -EKEYEXPIRED if an inaccessible, negative, revoked or expired key was
 * found; -ENOKEY if no key was found and no @callout_info was given; -EDQUOT
 * if insufficient key quota was available to create a new key; or -ENOMEM if
 * insufficient memory was available.
 *
 * If the returned key was created, then it may still be under construction,
 * and wait_for_key_construction() should be used to wait for that to complete.
 */
struct key *request_key_and_link(struct key_type *type,
				 const char *description,
				 struct key_tag *domain_tag,
				 const void *callout_info,
				 size_t callout_len,
				 void *aux,
				 struct key *dest_keyring,
				 unsigned long flags)
{
	struct keyring_search_context ctx = {
		.index_key.type		= type,
		.index_key.domain_tag	= domain_tag,
		.index_key.description	= description,
		.index_key.desc_len	= strlen(description),
		.cred			= current_cred(),
		.match_data.cmp		= key_default_cmp,
		.match_data.raw_data	= description,
		.match_data.lookup_type	= KEYRING_SEARCH_LOOKUP_DIRECT,
		.flags			= (KEYRING_SEARCH_DO_STATE_CHECK |
					   KEYRING_SEARCH_SKIP_EXPIRED |
					   KEYRING_SEARCH_RECURSE),
	};
	struct key *key;
	key_ref_t key_ref;
	int ret;

	kenter("%s,%s,%p,%zu,%p,%p,%lx",
	       ctx.index_key.type->name, ctx.index_key.description,
	       callout_info, callout_len, aux, dest_keyring, flags);

	/* let the key type prepare the match data; undone at error_free */
	if (type->match_preparse) {
		ret = type->match_preparse(&ctx.match_data);
		if (ret < 0) {
			key = ERR_PTR(ret);
			goto error;
		}
	}

	/* a cache hit is returned as-is; error_free is also the success exit */
	key = check_cached_key(&ctx);
	if (key)
		goto error_free;

	/* search all the process keyrings for a key */
	rcu_read_lock();
	key_ref = search_process_keyrings_rcu(&ctx);
	rcu_read_unlock();

	if (!IS_ERR(key_ref)) {
		/* linking a found key requires Link permission on it */
		if (dest_keyring) {
			ret = key_task_permission(key_ref, current_cred(),
						  KEY_NEED_LINK);
			if (ret < 0) {
				key_ref_put(key_ref);
				key = ERR_PTR(ret);
				goto error_free;
			}
		}

		key = key_ref_to_ptr(key_ref);
		if (dest_keyring) {
			ret = key_link(dest_keyring, key);
			if (ret < 0) {
				key_put(key);
				key = ERR_PTR(ret);
				goto error_free;
			}
		}

		/* Only cache the key on immediate success */
		cache_requested_key(key);
	} else if (PTR_ERR(key_ref) != -EAGAIN) {
		/* any error other than -EAGAIN is passed straight back */
		key = ERR_CAST(key_ref);
	} else  {
		/* the search failed, but the keyrings were searchable, so we
		 * should consult userspace if we can */
		key = ERR_PTR(-ENOKEY);
		if (!callout_info)
			goto error_free;

		key = construct_key_and_link(&ctx, callout_info, callout_len,
					     aux, dest_keyring, flags);
	}

error_free:
	if (type->match_free)
		type->match_free(&ctx.match_data);
error:
	kleave(" = %p", key);
	return key;
}

/**
 * wait_for_key_construction - Wait for construction of a key to complete
 * @key: The key being waited for.
 * @intr: Whether to wait interruptibly.
 *
 * Wait for a key to finish being constructed.
 *
 * Returns 0 if successful; -ERESTARTSYS if the wait was interrupted; -ENOKEY
 * if the key was negated; or -EKEYREVOKED or -EKEYEXPIRED if the key was
 * revoked or expired.
 */
int wait_for_key_construction(struct key *key, bool intr)
{
	int ret;

	/* sleep until KEY_FLAG_USER_CONSTRUCT is cleared on the key */
	ret = wait_on_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT,
			  intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
	if (ret)
		return -ERESTARTSYS;
	/* a negative key state is returned as the error */
	ret = key_read_state(key);
	if (ret < 0)
		return ret;
	return key_validate(key);
}
EXPORT_SYMBOL(wait_for_key_construction);

/**
 * request_key_tag - Request a key and wait for construction
 * @type: Type of key.
 * @description: The searchable description of the key.
 * @domain_tag: The domain in which the key operates.
 * @callout_info: The data to pass to the instantiation upcall (or NULL).
 *
 * As for request_key_and_link() except that it does not add the returned key
 * to a keyring if found, new keys are always allocated in the user's quota,
 * the callout_info must be a NUL-terminated string and no auxiliary data can
 * be passed.
 *
 * Furthermore, it then works as wait_for_key_construction() to wait for the
 * completion of keys undergoing construction with a non-interruptible wait.
*/
struct key *request_key_tag(struct key_type *type,
			    const char *description,
			    struct key_tag *domain_tag,
			    const char *callout_info)
{
	struct key *key;
	size_t callout_len = 0;
	int ret;

	/* callout_info is a C string here; its length excludes the NUL */
	if (callout_info)
		callout_len = strlen(callout_info);
	key = request_key_and_link(type, description, domain_tag,
				   callout_info, callout_len,
				   NULL, NULL, KEY_ALLOC_IN_QUOTA);
	if (!IS_ERR(key)) {
		/* uninterruptible wait; drop our reference if it failed */
		ret = wait_for_key_construction(key, false);
		if (ret < 0) {
			key_put(key);
			return ERR_PTR(ret);
		}
	}
	return key;
}
EXPORT_SYMBOL(request_key_tag);

/**
 * request_key_with_auxdata - Request a key with auxiliary data for the upcaller
 * @type: The type of key we want.
 * @description: The searchable description of the key.
 * @domain_tag: The domain in which the key operates.
 * @callout_info: The data to pass to the instantiation upcall (or NULL).
 * @callout_len: The length of callout_info.
 * @aux: Auxiliary data for the upcall.
 *
 * As for request_key_and_link() except that it does not add the returned key
 * to a keyring if found and new keys are always allocated in the user's quota.
 *
 * Furthermore, it then works as wait_for_key_construction() to wait for the
 * completion of keys undergoing construction with a non-interruptible wait.
 */
struct key *request_key_with_auxdata(struct key_type *type,
				     const char *description,
				     struct key_tag *domain_tag,
				     const void *callout_info,
				     size_t callout_len,
				     void *aux)
{
	struct key *key;
	int ret;

	key = request_key_and_link(type, description, domain_tag,
				   callout_info, callout_len,
				   aux, NULL, KEY_ALLOC_IN_QUOTA);
	if (!IS_ERR(key)) {
		/* uninterruptible wait; drop our reference if it failed */
		ret = wait_for_key_construction(key, false);
		if (ret < 0) {
			key_put(key);
			return ERR_PTR(ret);
		}
	}
	return key;
}
EXPORT_SYMBOL(request_key_with_auxdata);

/**
 * request_key_rcu - Request key from RCU-read-locked context
 * @type: The type of key we want.
 * @description: The name of the key we want.
 * @domain_tag: The domain in which the key operates.
 *
 * Request a key from a context that we may not sleep in (such as RCU-mode
 * pathwalk).  Keys under construction are ignored.
 *
 * Return a pointer to the found key if successful, -ENOKEY if we couldn't find
 * a key or some other error if the key found was unsuitable or inaccessible.
 */
struct key *request_key_rcu(struct key_type *type,
			    const char *description,
			    struct key_tag *domain_tag)
{
	/* same search setup as request_key_and_link(), minus SEARCH_RECURSE */
	struct keyring_search_context ctx = {
		.index_key.type		= type,
		.index_key.domain_tag	= domain_tag,
		.index_key.description	= description,
		.index_key.desc_len	= strlen(description),
		.cred			= current_cred(),
		.match_data.cmp		= key_default_cmp,
		.match_data.raw_data	= description,
		.match_data.lookup_type	= KEYRING_SEARCH_LOOKUP_DIRECT,
		.flags			= (KEYRING_SEARCH_DO_STATE_CHECK |
					   KEYRING_SEARCH_SKIP_EXPIRED),
	};
	struct key *key;
	key_ref_t key_ref;

	kenter("%s,%s", type->name, description);

	key = check_cached_key(&ctx);
	if (key)
		return key;

	/* search all the process keyrings for a key */
	key_ref = search_process_keyrings_rcu(&ctx);
	if (IS_ERR(key_ref)) {
		key = ERR_CAST(key_ref);
		/* -EAGAIN from the search is reported as -ENOKEY */
		if (PTR_ERR(key_ref) == -EAGAIN)
			key = ERR_PTR(-ENOKEY);
	} else {
		key = key_ref_to_ptr(key_ref);
		cache_requested_key(key);
	}

	kleave(" = %p", key);
	return key;
}
EXPORT_SYMBOL(request_key_rcu);
21 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
/*
 * NOTE(review): the run of integers above is not C; it looks like
 * line-number / hit-count residue from a coverage listing - confirm and strip
 * it before this text is treated as a header.
 */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * AEAD: Authenticated Encryption with Associated Data
 * 
 * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#ifndef _CRYPTO_INTERNAL_AEAD_H
#define _CRYPTO_INTERNAL_AEAD_H

#include <crypto/aead.h>
#include <crypto/algapi.h>
#include <linux/stddef.h>
#include <linux/types.h>

struct rtattr;

/*
 * An AEAD template instance.  The union overlays struct aead_alg with a
 * struct whose 'head' padding is sized so that s.base sits at the offset of
 * alg.base.
 */
struct aead_instance {
	/* destructor for the instance */
	void (*free)(struct aead_instance *inst);
	union {
		struct {
			char head[offsetof(struct aead_alg, base)];
			struct crypto_instance base;
		} s;
		struct aead_alg alg;
	};
};

/* Typed wrapper around struct crypto_spawn for AEAD algorithms. */
struct crypto_aead_spawn {
	struct crypto_spawn base;
};

/* Typed wrapper around struct crypto_queue for AEAD requests. */
struct aead_queue {
	struct crypto_queue base;
};

/* Return the context of @tfm, i.e. crypto_tfm_ctx() of its base tfm. */
static inline void *crypto_aead_ctx(struct crypto_aead *tfm)
{
	return crypto_tfm_ctx(&tfm->base);
}

/* As crypto_aead_ctx(), but through crypto_tfm_ctx_dma(). */
static inline void *crypto_aead_ctx_dma(struct crypto_aead *tfm)
{
	return crypto_tfm_ctx_dma(&tfm->base);
}

/* Convert an aead_instance to the crypto_instance overlaid on its alg.base. */
static inline struct crypto_instance *aead_crypto_instance(
	struct aead_instance *inst)
{
	return container_of(&inst->alg.base, struct crypto_instance, alg);
}

/* Inverse of aead_crypto_instance(). */
static inline struct aead_instance *aead_instance(struct crypto_instance *inst)
{
	return container_of(&inst->alg, struct aead_instance, alg.base);
}

/* Return the aead_instance that the algorithm of @aead belongs to. */
static inline struct aead_instance *aead_alg_instance(struct crypto_aead *aead)
{
	return aead_instance(crypto_tfm_alg_instance(&aead->base));
}

/* Return the instance context, via crypto_instance_ctx(). */
static inline void *aead_instance_ctx(struct aead_instance *inst)
{
	return crypto_instance_ctx(aead_crypto_instance(inst));
}

/* Return the request's context area (req->__ctx). */
static inline void *aead_request_ctx(struct aead_request *req)
{
	return req->__ctx;
}

/*
 * Return the request context aligned up to crypto_dma_align().  When that
 * alignment does not exceed crypto_tfm_ctx_alignment() the pointer is
 * returned unchanged (align == 1).
 */
static inline void *aead_request_ctx_dma(struct aead_request *req)
{
	unsigned int align = crypto_dma_align();

	if (align <= crypto_tfm_ctx_alignment())
		align = 1;

	return PTR_ALIGN(aead_request_ctx(req), align);
}

/* Complete @req with status @err through crypto_request_complete(). */
static inline void aead_request_complete(struct aead_request *req, int err)
{
	crypto_request_complete(&req->base, err);
}

/* Return the flags of the request's base. */
static inline u32 aead_request_flags(struct aead_request *req)
{
	return req->base.flags;
}

/* Convert a crypto_async_request to the aead_request that embeds it. */
static inline struct aead_request *aead_request_cast(
	struct crypto_async_request *req)
{
	return container_of(req, struct aead_request, base);
}

int crypto_grab_aead(struct crypto_aead_spawn *spawn,
		     struct crypto_instance *inst,
		     const char *name, u32 type, u32 mask);

/* Release a spawn obtained with crypto_grab_aead(). */
static inline void crypto_drop_aead(struct crypto_aead_spawn *spawn)
{
	crypto_drop_spawn(&spawn->base);
}

/* Return the aead_alg behind the spawn's algorithm. */
static inline struct aead_alg *crypto_spawn_aead_alg(
	struct crypto_aead_spawn *spawn)
{
	return container_of(spawn->base.alg, struct aead_alg, base);
}

/* Instantiate a transform from the spawn, via crypto_spawn_tfm2(). */
static inline struct crypto_aead *crypto_spawn_aead(
	struct crypto_aead_spawn *spawn)
{
	return crypto_spawn_tfm2(&spawn->base);
}

/* Set the request context size of @aead. */
static inline void crypto_aead_set_reqsize(struct crypto_aead *aead,
					   unsigned int reqsize)
{
	aead->reqsize = reqsize;
}

/*
 * Set the request context size of @aead, enlarged by
 * crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1).
 * NOTE(review): presumably headroom for the PTR_ALIGN() done in
 * aead_request_ctx_dma() - confirm.
 */
static inline void crypto_aead_set_reqsize_dma(struct crypto_aead *aead,
					       unsigned int reqsize)
{
	reqsize += crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1);
	aead->reqsize = reqsize;
}

/* Initialise @queue with a maximum length of @max_qlen. */
static inline void aead_init_queue(struct aead_queue *queue,
				   unsigned int max_qlen)
{
	crypto_init_queue(&queue->base, max_qlen);
}

/* Return the chunksize field of @alg. */
static inline unsigned int crypto_aead_alg_chunksize(struct aead_alg *alg)
{
	return alg->chunksize;
}

/**
 * crypto_aead_chunksize() - obtain chunk size
 * @tfm: cipher handle
 *
 * The block size is set to one for ciphers such as CCM.  However,
 * you still need to provide incremental updates in multiples of
 * the underlying block size as the IV does not have sub-block
 * granularity.  This is known in this API as the chunk size.
 *
 * Return: chunk size in bytes
 */
static inline unsigned int crypto_aead_chunksize(struct crypto_aead *tfm)
{
	return crypto_aead_alg_chunksize(crypto_aead_alg(tfm));
}

int crypto_register_aead(struct aead_alg *alg);
void crypto_unregister_aead(struct aead_alg *alg);
int crypto_register_aeads(struct aead_alg *algs, int count);
void crypto_unregister_aeads(struct aead_alg *algs, int count);
int aead_register_instance(struct crypto_template *tmpl,
			   struct aead_instance *inst);

#endif	/* _CRYPTO_INTERNAL_AEAD_H */
5 5 4 1 2 4 4 3 1 125 125 125 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 // SPDX-License-Identifier: GPL-2.0 /* * xfrm6_policy.c: based on xfrm4_policy.c * * Authors: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * YOSHIFUJI Hideaki * Split up af-specific portion * */ #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/addrconf.h> #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/l3mdev.h> static struct dst_entry *xfrm6_dst_lookup(const struct xfrm_dst_lookup_params *params) { struct flowi6 fl6; struct dst_entry *dst; int err; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl6.flowi6_mark = params->mark; 
memcpy(&fl6.daddr, params->daddr, sizeof(fl6.daddr)); if (params->saddr) memcpy(&fl6.saddr, params->saddr, sizeof(fl6.saddr)); fl6.flowi4_proto = params->ipproto; fl6.uli = params->uli; dst = ip6_route_output(params->net, NULL, &fl6); err = dst->error; if (dst->error) { dst_release(dst); dst = ERR_PTR(err); } return dst; } static int xfrm6_get_saddr(xfrm_address_t *saddr, const struct xfrm_dst_lookup_params *params) { struct dst_entry *dst; struct net_device *dev; struct inet6_dev *idev; int err; dst = xfrm6_dst_lookup(params); if (IS_ERR(dst)) return -EHOSTUNREACH; idev = ip6_dst_idev(dst); if (!idev) { dst_release(dst); return -EHOSTUNREACH; } dev = idev->dev; err = ipv6_dev_get_saddr(dev_net(dev), dev, &params->daddr->in6, 0, &saddr->in6); dst_release(dst); if (err) return -EHOSTUNREACH; return 0; } static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { struct rt6_info *rt = dst_rt6_info(xdst->route); xdst->u.dst.dev = dev; netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC); xdst->u.rt6.rt6i_idev = in6_dev_get(dev); if (!xdst->u.rt6.rt6i_idev) { netdev_put(dev, &xdst->u.dst.dev_tracker); return -ENODEV; } /* Sheit... I remember I did this right. 
Apparently, * it was magically lost, so this code needs audit */ xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL); xdst->route_cookie = rt6_get_cookie(rt); xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; xdst->u.rt6.rt6i_src = rt->rt6i_src; rt6_uncached_list_add(&xdst->u.rt6); return 0; } static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh); } static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct dst_entry *path = xdst->route; path->ops->redirect(path, sk, skb); } static void xfrm6_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; dst_destroy_metrics_generic(dst); rt6_uncached_list_del(&xdst->u.rt6); if (likely(xdst->u.rt6.rt6i_idev)) in6_dev_put(xdst->u.rt6.rt6i_idev); xfrm_dst_destroy(xdst); } static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct xfrm_dst *xdst; xdst = (struct xfrm_dst *)dst; if (xdst->u.rt6.rt6i_idev->dev == dev) { struct inet6_dev *loopback_idev = in6_dev_get(dev_net(dev)->loopback_dev); do { in6_dev_put(xdst->u.rt6.rt6i_idev); xdst->u.rt6.rt6i_idev = loopback_idev; in6_dev_hold(loopback_idev); xdst = (struct xfrm_dst *)xfrm_dst_child(&xdst->u.dst); } while (xdst->u.dst.xfrm); __in6_dev_put(loopback_idev); } xfrm_dst_ifdown(dst, dev); } static struct dst_ops xfrm6_dst_ops_template = { .family = AF_INET6, .update_pmtu = xfrm6_update_pmtu, .redirect = xfrm6_redirect, .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm6_dst_destroy, .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, .gc_thresh = 32768, }; static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .dst_ops = &xfrm6_dst_ops_template, .dst_lookup = 
xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, }; static int __init xfrm6_policy_init(void) { return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo, AF_INET6); } static void xfrm6_policy_fini(void) { xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo); } #ifdef CONFIG_SYSCTL static struct ctl_table xfrm6_policy_table[] = { { .procname = "xfrm6_gc_thresh", .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static int __net_init xfrm6_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = xfrm6_policy_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(xfrm6_policy_table), GFP_KERNEL); if (!table) goto err_alloc; table[0].data = &net->xfrm.xfrm6_dst_ops.gc_thresh; } hdr = register_net_sysctl_sz(net, "net/ipv6", table, ARRAY_SIZE(xfrm6_policy_table)); if (!hdr) goto err_reg; net->ipv6.sysctl.xfrm6_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { const struct ctl_table *table; if (!net->ipv6.sysctl.xfrm6_hdr) return; table = net->ipv6.sysctl.xfrm6_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.xfrm6_hdr); if (!net_eq(net, &init_net)) kfree(table); } #else /* CONFIG_SYSCTL */ static inline int xfrm6_net_sysctl_init(struct net *net) { return 0; } static inline void xfrm6_net_sysctl_exit(struct net *net) { } #endif static int __net_init xfrm6_net_init(struct net *net) { int ret; memcpy(&net->xfrm.xfrm6_dst_ops, &xfrm6_dst_ops_template, sizeof(xfrm6_dst_ops_template)); ret = dst_entries_init(&net->xfrm.xfrm6_dst_ops); if (ret) return ret; ret = xfrm6_net_sysctl_init(net); if (ret) dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); return ret; } static void __net_exit xfrm6_net_exit(struct net *net) { xfrm6_net_sysctl_exit(net); 
dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); } static struct pernet_operations xfrm6_net_ops = { .init = xfrm6_net_init, .exit = xfrm6_net_exit, }; int __init xfrm6_init(void) { int ret; ret = xfrm6_policy_init(); if (ret) goto out; ret = xfrm6_state_init(); if (ret) goto out_policy; ret = xfrm6_protocol_init(); if (ret) goto out_state; ret = register_pernet_subsys(&xfrm6_net_ops); if (ret) goto out_protocol; ret = xfrm_nat_keepalive_init(AF_INET6); if (ret) goto out_nat_keepalive; out: return ret; out_nat_keepalive: unregister_pernet_subsys(&xfrm6_net_ops); out_protocol: xfrm6_protocol_fini(); out_state: xfrm6_state_fini(); out_policy: xfrm6_policy_fini(); goto out; } void xfrm6_fini(void) { xfrm_nat_keepalive_fini(AF_INET6); unregister_pernet_subsys(&xfrm6_net_ops); xfrm6_protocol_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); }
34 13 23 19 10 7 12 12 12 14 13 3 13 52 20 20 8 12 12 9 12 20 35 22 14 5 14 8 6 2 24 10 14 14 14 39 35 4 39 73 81 5 1 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 
497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3: Garbage Collector For AF_UNIX sockets * * Garbage Collector: * Copyright (C) Barak A. Pearlmutter. * * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem. * If it doesn't work blame me, it worked when Barak sent it. * * Assumptions: * * - object w/ a bit * - free list * * Current optimizations: * * - explicit stack instead of recursion * - tail recurse on first born instead of immediate push/pop * - we gather the stuff that should not be killed into tree * and stack is just a path from root to the current pointer. * * Future optimizations: * * - don't just push entire root set; process in place * * Fixes: * Alan Cox 07 Sept 1997 Vmalloc internal stack as needed. * Cope with changing max_files. * Al Viro 11 Oct 1998 * Graph may have cycles. That is, we can send the descriptor * of foo to bar and vice versa. Current code chokes on that. * Fix: move SCM_RIGHTS ones into the separate list and then * skb_free() them all instead of doing explicit fput's. * Another problem: since fput() may block somebody may * create a new unix_socket when we are in the middle of sweep * phase. Fix: revert the logic wrt MARKED. Mark everything * upon the beginning and unmark non-junk ones. * * [12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS * sent to connect()'ed but still not accept()'ed sockets. * Fixed. 
Old code had slightly different problem here: * extra fput() in situation when we passed the descriptor via * such socket and closed it (descriptor). That would happen on * each unix_gc() until the accept(). Since the struct file in * question would go to the free list and might be reused... * That might be the reason of random oopses on filp_close() * in unrelated processes. * * AV 28 Feb 1999 * Kill the explicit allocation of stack. Now we keep the tree * with root in dummy + pointer (gc_current) to one of the nodes. * Stack is represented as path from gc_current to dummy. Unmark * now means "add to tree". Push == "make it a son of gc_current". * Pop == "move gc_current to parent". We keep only pointers to * parents (->gc_tree). * AV 1 Mar 1999 * Damn. Added missing check for ->dead in listen queues scanning. * * Miklos Szeredi 25 Jun 2007 * Reimplement with a cycle collecting algorithm. This should * solve several problems with the previous code, like being racy * wrt receive and holding up unrelated socket operations. */ #include <linux/fs.h> #include <linux/list.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/workqueue.h> #include <net/af_unix.h> #include <net/scm.h> #include <net/tcp_states.h> #include "af_unix.h" struct unix_vertex { struct list_head edges; struct list_head entry; struct list_head scc_entry; unsigned long out_degree; unsigned long index; unsigned long scc_index; }; struct unix_edge { struct unix_sock *predecessor; struct unix_sock *successor; struct list_head vertex_entry; struct list_head stack_entry; }; struct unix_sock *unix_get_socket(struct file *filp) { struct inode *inode = file_inode(filp); /* Socket ? */ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { struct socket *sock = SOCKET_I(inode); const struct proto_ops *ops; struct sock *sk = sock->sk; ops = READ_ONCE(sock->ops); /* PF_UNIX ? 
*/ if (sk && ops && ops->family == PF_UNIX) return unix_sk(sk); } return NULL; } static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) { /* If an embryo socket has a fd, * the listener indirectly holds the fd's refcnt. */ if (edge->successor->listener) return unix_sk(edge->successor->listener)->vertex; return edge->successor->vertex; } enum { UNIX_GRAPH_NOT_CYCLIC, UNIX_GRAPH_MAYBE_CYCLIC, UNIX_GRAPH_CYCLIC, }; static unsigned char unix_graph_state; static void unix_update_graph(struct unix_vertex *vertex) { /* If the receiver socket is not inflight, no cyclic * reference could be formed. */ if (!vertex) return; WRITE_ONCE(unix_graph_state, UNIX_GRAPH_MAYBE_CYCLIC); } static LIST_HEAD(unix_unvisited_vertices); enum unix_vertex_index { UNIX_VERTEX_INDEX_MARK1, UNIX_VERTEX_INDEX_MARK2, UNIX_VERTEX_INDEX_START, }; static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; static unsigned long unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START; static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; if (!vertex) { vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); vertex->index = unix_vertex_unvisited_index; vertex->scc_index = ++unix_vertex_max_scc_index; vertex->out_degree = 0; INIT_LIST_HEAD(&vertex->edges); INIT_LIST_HEAD(&vertex->scc_entry); list_move_tail(&vertex->entry, &unix_unvisited_vertices); edge->predecessor->vertex = vertex; } vertex->out_degree++; list_add_tail(&edge->vertex_entry, &vertex->edges); unix_update_graph(unix_edge_successor(edge)); } static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; if (!fpl->dead) unix_update_graph(unix_edge_successor(edge)); list_del(&edge->vertex_entry); vertex->out_degree--; if (!vertex->out_degree) { edge->predecessor->vertex = NULL; list_move_tail(&vertex->entry, &fpl->vertices); } } static void 
unix_free_vertices(struct scm_fp_list *fpl) { struct unix_vertex *vertex, *next_vertex; list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) { list_del(&vertex->entry); kfree(vertex); } } static __cacheline_aligned_in_smp DEFINE_SPINLOCK(unix_gc_lock); void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) { int i = 0, j = 0; spin_lock(&unix_gc_lock); if (!fpl->count_unix) goto out; do { struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); struct unix_edge *edge; if (!inflight) continue; edge = fpl->edges + i++; edge->predecessor = inflight; edge->successor = receiver; unix_add_edge(fpl, edge); } while (i < fpl->count_unix); receiver->scm_stat.nr_unix_fds += fpl->count_unix; out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); spin_unlock(&unix_gc_lock); fpl->inflight = true; unix_free_vertices(fpl); } void unix_del_edges(struct scm_fp_list *fpl) { struct unix_sock *receiver; int i = 0; spin_lock(&unix_gc_lock); if (!fpl->count_unix) goto out; do { struct unix_edge *edge = fpl->edges + i++; unix_del_edge(fpl, edge); } while (i < fpl->count_unix); if (!fpl->dead) { receiver = fpl->edges[0].successor; receiver->scm_stat.nr_unix_fds -= fpl->count_unix; } out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); spin_unlock(&unix_gc_lock); fpl->inflight = false; } void unix_update_edges(struct unix_sock *receiver) { /* nr_unix_fds is only updated under unix_state_lock(). * If it's 0 here, the embryo socket is not part of the * inflight graph, and GC will not see it, so no lock needed. 
*/ if (!receiver->scm_stat.nr_unix_fds) { receiver->listener = NULL; } else { spin_lock(&unix_gc_lock); unix_update_graph(unix_sk(receiver->listener)->vertex); receiver->listener = NULL; spin_unlock(&unix_gc_lock); } } int unix_prepare_fpl(struct scm_fp_list *fpl) { struct unix_vertex *vertex; int i; if (!fpl->count_unix) return 0; for (i = 0; i < fpl->count_unix; i++) { vertex = kmalloc_obj(*vertex); if (!vertex) goto err; list_add(&vertex->entry, &fpl->vertices); } fpl->edges = kvmalloc_objs(*fpl->edges, fpl->count_unix, GFP_KERNEL_ACCOUNT); if (!fpl->edges) goto err; unix_schedule_gc(fpl->user); return 0; err: unix_free_vertices(fpl); return -ENOMEM; } void unix_destroy_fpl(struct scm_fp_list *fpl) { if (fpl->inflight) unix_del_edges(fpl); kvfree(fpl->edges); unix_free_vertices(fpl); } static bool unix_vertex_dead(struct unix_vertex *vertex) { struct unix_edge *edge; struct unix_sock *u; long total_ref; list_for_each_entry(edge, &vertex->edges, vertex_entry) { struct unix_vertex *next_vertex = unix_edge_successor(edge); /* The vertex's fd can be received by a non-inflight socket. */ if (!next_vertex) return false; /* The vertex's fd can be received by an inflight socket in * another SCC. */ if (next_vertex->scc_index != vertex->scc_index) return false; } /* No receiver exists out of the same SCC. */ edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); u = edge->predecessor; total_ref = file_count(u->sk.sk_socket->file); /* If not close()d, total_ref > out_degree. 
*/ if (total_ref != vertex->out_degree) return false; return true; } static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) { struct unix_vertex *vertex; list_for_each_entry_reverse(vertex, scc, scc_entry) { struct sk_buff_head *queue; struct unix_edge *edge; struct unix_sock *u; edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); u = edge->predecessor; queue = &u->sk.sk_receive_queue; spin_lock(&queue->lock); if (u->sk.sk_state == TCP_LISTEN) { struct sk_buff *skb; skb_queue_walk(queue, skb) { struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; spin_lock(&embryo_queue->lock); skb_queue_splice_init(embryo_queue, hitlist); spin_unlock(&embryo_queue->lock); } } else { skb_queue_splice_init(queue, hitlist); } spin_unlock(&queue->lock); } } static bool unix_scc_cyclic(struct list_head *scc) { struct unix_vertex *vertex; struct unix_edge *edge; /* SCC containing multiple vertices ? */ if (!list_is_singular(scc)) return true; vertex = list_first_entry(scc, typeof(*vertex), scc_entry); /* Self-reference or a embryo-listener circle ? */ list_for_each_entry(edge, &vertex->edges, vertex_entry) { if (unix_edge_successor(edge) == vertex) return true; } return false; } static LIST_HEAD(unix_visited_vertices); static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; static unsigned long __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, struct sk_buff_head *hitlist) { unsigned long cyclic_sccs = 0; LIST_HEAD(vertex_stack); struct unix_edge *edge; LIST_HEAD(edge_stack); next_vertex: /* Push vertex to vertex_stack and mark it as on-stack * (index >= UNIX_VERTEX_INDEX_START). * The vertex will be popped when finalising SCC later. */ list_add(&vertex->scc_entry, &vertex_stack); vertex->index = *last_index; vertex->scc_index = *last_index; (*last_index)++; /* Explore neighbour vertices (receivers of the current vertex's fd). 
*/ list_for_each_entry(edge, &vertex->edges, vertex_entry) { struct unix_vertex *next_vertex = unix_edge_successor(edge); if (!next_vertex) continue; if (next_vertex->index == unix_vertex_unvisited_index) { /* Iterative deepening depth first search * * 1. Push a forward edge to edge_stack and set * the successor to vertex for the next iteration. */ list_add(&edge->stack_entry, &edge_stack); vertex = next_vertex; goto next_vertex; /* 2. Pop the edge directed to the current vertex * and restore the ancestor for backtracking. */ prev_vertex: edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); list_del_init(&edge->stack_entry); next_vertex = vertex; vertex = edge->predecessor->vertex; /* If the successor has a smaller scc_index, two vertices * are in the same SCC, so propagate the smaller scc_index * to skip SCC finalisation. */ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); } else if (next_vertex->index != unix_vertex_grouped_index) { /* Loop detected by a back/cross edge. * * The successor is on vertex_stack, so two vertices are in * the same SCC. If the successor has a smaller *scc_index*, * propagate it to skip SCC finalisation. */ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); } else { /* The successor was already grouped as another SCC */ } } if (vertex->index == vertex->scc_index) { struct unix_vertex *v; struct list_head scc; bool scc_dead = true; /* SCC finalised. * * If the scc_index was not updated, all the vertices above on * vertex_stack are in the same SCC. Group them using scc_entry. */ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); list_for_each_entry_reverse(v, &scc, scc_entry) { /* Don't restart DFS from this vertex in unix_walk_scc(). */ list_move_tail(&v->entry, &unix_visited_vertices); /* Mark vertex as off-stack. 
*/ v->index = unix_vertex_grouped_index; if (scc_dead) scc_dead = unix_vertex_dead(v); } if (scc_dead) { unix_collect_skb(&scc, hitlist); } else { if (unix_vertex_max_scc_index < vertex->scc_index) unix_vertex_max_scc_index = vertex->scc_index; if (unix_scc_cyclic(&scc)) cyclic_sccs++; } list_del(&scc); } /* Need backtracking ? */ if (!list_empty(&edge_stack)) goto prev_vertex; return cyclic_sccs; } static unsigned long unix_graph_cyclic_sccs; static void unix_walk_scc(struct sk_buff_head *hitlist) { unsigned long last_index = UNIX_VERTEX_INDEX_START; unsigned long cyclic_sccs = 0; unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START; /* Visit every vertex exactly once. * __unix_walk_scc() moves visited vertices to unix_visited_vertices. */ while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); cyclic_sccs += __unix_walk_scc(vertex, &last_index, hitlist); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); WRITE_ONCE(unix_graph_cyclic_sccs, cyclic_sccs); WRITE_ONCE(unix_graph_state, cyclic_sccs ? UNIX_GRAPH_CYCLIC : UNIX_GRAPH_NOT_CYCLIC); } static void unix_walk_scc_fast(struct sk_buff_head *hitlist) { unsigned long cyclic_sccs = unix_graph_cyclic_sccs; while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; struct list_head scc; bool scc_dead = true; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); list_add(&scc, &vertex->scc_entry); list_for_each_entry_reverse(vertex, &scc, scc_entry) { list_move_tail(&vertex->entry, &unix_visited_vertices); if (scc_dead) scc_dead = unix_vertex_dead(vertex); } if (scc_dead) { cyclic_sccs--; unix_collect_skb(&scc, hitlist); } list_del(&scc); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); WRITE_ONCE(unix_graph_cyclic_sccs, cyclic_sccs); WRITE_ONCE(unix_graph_state, cyclic_sccs ? 
UNIX_GRAPH_CYCLIC : UNIX_GRAPH_NOT_CYCLIC); } static bool gc_in_progress; static void unix_gc(struct work_struct *work) { struct sk_buff_head hitlist; struct sk_buff *skb; spin_lock(&unix_gc_lock); if (unix_graph_state == UNIX_GRAPH_NOT_CYCLIC) { spin_unlock(&unix_gc_lock); goto skip_gc; } __skb_queue_head_init(&hitlist); if (unix_graph_state == UNIX_GRAPH_CYCLIC) unix_walk_scc_fast(&hitlist); else unix_walk_scc(&hitlist); spin_unlock(&unix_gc_lock); skb_queue_walk(&hitlist, skb) { if (UNIXCB(skb).fp) UNIXCB(skb).fp->dead = true; } __skb_queue_purge_reason(&hitlist, SKB_DROP_REASON_SOCKET_CLOSE); skip_gc: WRITE_ONCE(gc_in_progress, false); } static DECLARE_WORK(unix_gc_work, unix_gc); #define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) void unix_schedule_gc(struct user_struct *user) { if (READ_ONCE(unix_graph_state) == UNIX_GRAPH_NOT_CYCLIC) return; /* Penalise users who want to send AF_UNIX sockets * but whose sockets have not been received yet. */ if (user && READ_ONCE(user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) return; if (!READ_ONCE(gc_in_progress)) { WRITE_ONCE(gc_in_progress, true); queue_work(system_dfl_wq, &unix_gc_work); } if (user && READ_ONCE(unix_graph_cyclic_sccs)) flush_work(&unix_gc_work); }
182 314 71 238 350 350 73 324 344 308 309 17 115 115 121 122 120 121 122 122 97 81 18 97 97 347 305 1 8 119 271 4 119 266 5 122 1 120 121 348 351 270 121 92 37 63 44 42 43 42 43 42 1 1 273 1 272 1 65 305 280 280 9 271 1 9 1 64 1 62 17 63 62 45 9 40 115 114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 
463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 // SPDX-License-Identifier: GPL-2.0-or-later /* * Scatterlist Cryptographic API. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2002 David S. Miller (davem@redhat.com) * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> * * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no> * and Nettle, by Niels Möller. 
*/ #include <linux/err.h> #include <linux/errno.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/module.h> #include <linux/param.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/completion.h> #include "internal.h" LIST_HEAD(crypto_alg_list); EXPORT_SYMBOL_GPL(crypto_alg_list); DECLARE_RWSEM(crypto_alg_sem); EXPORT_SYMBOL_GPL(crypto_alg_sem); BLOCKING_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); #if IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) && IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) DEFINE_STATIC_KEY_FALSE(__crypto_boot_test_finished); #endif static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg, u32 type, u32 mask); static struct crypto_alg *crypto_alg_lookup(const char *name, u32 type, u32 mask); struct crypto_alg *crypto_mod_get(struct crypto_alg *alg) { return try_module_get(alg->cra_module) ? crypto_alg_get(alg) : NULL; } EXPORT_SYMBOL_GPL(crypto_mod_get); void crypto_mod_put(struct crypto_alg *alg) { struct module *module = alg->cra_module; crypto_alg_put(alg); module_put(module); } EXPORT_SYMBOL_GPL(crypto_mod_put); static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type, u32 mask) __must_hold_shared(&crypto_alg_sem) { struct crypto_alg *q, *alg = NULL; int best = -2; list_for_each_entry(q, &crypto_alg_list, cra_list) { int exact, fuzzy; if (crypto_is_moribund(q)) continue; if ((q->cra_flags ^ type) & mask) continue; exact = !strcmp(q->cra_driver_name, name); fuzzy = !strcmp(q->cra_name, name); if (!exact && !(fuzzy && q->cra_priority > best)) continue; if (unlikely(!crypto_mod_get(q))) continue; best = q->cra_priority; if (alg) crypto_mod_put(alg); alg = q; if (exact) break; } return alg; } static void crypto_larval_destroy(struct crypto_alg *alg) { struct crypto_larval *larval = (void *)alg; BUG_ON(!crypto_is_larval(alg)); if (!IS_ERR_OR_NULL(larval->adult)) crypto_mod_put(larval->adult); kfree(larval); } struct 
crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask)
{
	/*
	 * Allocate and initialise a larval entry for @name.  The larval is
	 * not added to crypto_alg_list here.  Returns the new larval or
	 * ERR_PTR(-ENOMEM).
	 */
	struct crypto_larval *larval;

	larval = kzalloc_obj(*larval);
	if (!larval)
		return ERR_PTR(-ENOMEM);

	/*
	 * With a non-zero mask, clear the bits of the type field that the
	 * mask does not select; with a zero mask, type is left unchanged.
	 */
	type &= ~CRYPTO_ALG_TYPE_MASK | (mask ?: CRYPTO_ALG_TYPE_MASK);

	larval->mask = mask;
	larval->alg.cra_flags = CRYPTO_ALG_LARVAL | type;
	larval->alg.cra_priority = -1;
	larval->alg.cra_destroy = crypto_larval_destroy;

	strscpy(larval->alg.cra_name, name, CRYPTO_MAX_ALG_NAME);
	init_completion(&larval->completion);

	return larval;
}
EXPORT_SYMBOL_GPL(crypto_larval_alloc);

/*
 * Create a larval for @name and insert it into crypto_alg_list, unless a
 * matching entry already exists.
 *
 * Returns the new larval's alg, or the existing entry found by the lookup
 * (after waiting on it if it is itself a larval), or an ERR_PTR.
 */
static struct crypto_alg *crypto_larval_add(const char *name, u32 type,
					    u32 mask)
{
	struct crypto_alg *alg;
	struct crypto_larval *larval;

	larval = crypto_larval_alloc(name, type, mask);
	if (IS_ERR(larval))
		return ERR_CAST(larval);

	/* NOTE(review): presumably one reference for the list, one for the caller - confirm. */
	refcount_set(&larval->alg.cra_refcnt, 2);

	/* Look up and insert under the write lock so the two steps are atomic. */
	down_write(&crypto_alg_sem);
	alg = __crypto_alg_lookup(name, type, mask);
	if (!alg) {
		alg = &larval->alg;
		list_add(&alg->cra_list, &crypto_alg_list);
	}
	up_write(&crypto_alg_sem);

	if (alg != &larval->alg) {
		/* Someone else's entry was found; our larval was never listed. */
		kfree(larval);
		if (crypto_is_larval(alg))
			alg = crypto_larval_wait(alg, type, mask);
	}

	return alg;
}

/*
 * Unlink @larval from the list, wake all waiters on its completion and
 * drop one reference.  Does nothing further if the larval was already
 * unlinked.
 */
static void crypto_larval_kill(struct crypto_larval *larval)
{
	bool unlinked;

	down_write(&crypto_alg_sem);
	unlinked = list_empty(&larval->alg.cra_list);
	if (!unlinked)
		list_del_init(&larval->alg.cra_list);
	up_write(&crypto_alg_sem);

	if (unlinked)
		return;

	complete_all(&larval->completion);
	crypto_alg_put(&larval->alg);
}

/*
 * Send CRYPTO_MSG_ALG_REGISTER for the larval's adult algorithm through
 * crypto_probing_notify() and warn once if the result is not NOTIFY_STOP.
 */
void crypto_schedule_test(struct crypto_larval *larval)
{
	int err;

	err = crypto_probing_notify(CRYPTO_MSG_ALG_REGISTER, larval->adult);
	WARN_ON_ONCE(err != NOTIFY_STOP);
}
EXPORT_SYMBOL_GPL(crypto_schedule_test);

/*
 * Schedule the test for a test larval at most once: test_started is
 * checked without the lock first, then re-checked and set under the
 * write lock before crypto_schedule_test() is called.
 */
static void crypto_start_test(struct crypto_larval *larval)
{
	if (!crypto_is_test_larval(larval))
		return;

	if (larval->test_started)
		return;

	down_write(&crypto_alg_sem);
	if (larval->test_started) {
		up_write(&crypto_alg_sem);
		return;
	}

	larval->test_started = true;
	up_write(&crypto_alg_sem);

	crypto_schedule_test(larval);
}

static struct
crypto_alg *crypto_larval_wait(struct crypto_alg *alg, u32 type, u32 mask)
{
	/*
	 * Wait for the larval @alg to complete and translate the outcome into
	 * either a referenced algorithm or an ERR_PTR.  The reference held on
	 * the larval is dropped before returning.  If the result is itself a
	 * larval, the wait is repeated on it.
	 */
	struct crypto_larval *larval;
	long time_left;

again:
	larval = container_of(alg, struct crypto_larval, alg);

	if (!crypto_boot_test_finished())
		crypto_start_test(larval);

	/* Killable wait bounded by 60 * HZ jiffies. */
	time_left = wait_for_completion_killable_timeout(
		&larval->completion, 60 * HZ);

	alg = larval->adult;
	if (time_left < 0)
		/* Negative return from the wait is reported as -EINTR. */
		alg = ERR_PTR(-EINTR);
	else if (!time_left) {
		/* Zero return from the wait is reported as -ETIMEDOUT. */
		if (crypto_is_test_larval(larval))
			crypto_larval_kill(larval);
		alg = ERR_PTR(-ETIMEDOUT);
	} else if (!alg || PTR_ERR(alg) == -EEXIST) {
		int err = alg ? -EEXIST : -EAGAIN;

		/*
		 * EEXIST is expected because two probes can be scheduled
		 * at the same time with one using alg_name and the other
		 * using driver_name.  Do a re-lookup but do not retry in
		 * case we hit a quirk like gcm_base(ctr(aes),...) which
		 * will never match.
		 */
		alg = &larval->alg;
		alg = crypto_alg_lookup(alg->cra_name, type, mask) ?:
		      ERR_PTR(err);
	} else if (IS_ERR(alg))
		/* Any other error stored in ->adult is passed through as is. */
		;
	else if (crypto_is_test_larval(larval) &&
		 !(alg->cra_flags & CRYPTO_ALG_TESTED))
		alg = ERR_PTR(-EAGAIN);
	else if (alg->cra_flags & CRYPTO_ALG_FIPS_INTERNAL)
		alg = ERR_PTR(-EAGAIN);
	else if (!crypto_mod_get(alg))
		alg = ERR_PTR(-EAGAIN);
	crypto_mod_put(&larval->alg);

	if (!IS_ERR(alg) && crypto_is_larval(alg))
		goto again;

	return alg;
}

/*
 * Look up @name under the read lock.  Unless the caller mentions
 * CRYPTO_ALG_TESTED in @type or @mask, only tested entries are accepted
 * by the first lookup.  The CRYPTO_ALG_FIPS_INTERNAL bit is stripped from
 * the lookup itself and checked separately on the result.
 */
static struct crypto_alg *crypto_alg_lookup(const char *name, u32 type,
					    u32 mask)
{
	const u32 fips = CRYPTO_ALG_FIPS_INTERNAL;
	struct crypto_alg *alg;
	u32 test = 0;

	if (!((type | mask) & CRYPTO_ALG_TESTED))
		test |= CRYPTO_ALG_TESTED;

	down_read(&crypto_alg_sem);
	alg = __crypto_alg_lookup(name, (type | test) & ~fips,
				  (mask | test) & ~fips);
	if (alg) {
		/*
		 * If the caller set the FIPS-internal bit in neither type nor
		 * mask, add it to the mask so that entries carrying the bit
		 * are rejected below.  After this, mask holds at most that bit.
		 */
		if (((type | mask) ^ fips) & fips)
			mask |= fips;
		mask &= fips;

		if (!crypto_is_larval(alg) &&
		    ((type ^ alg->cra_flags) & mask)) {
			/* Algorithm is disallowed in FIPS mode.
*/
			crypto_mod_put(alg);
			alg = ERR_PTR(-ENOENT);
		}
	} else if (test) {
		/*
		 * Nothing matched with the TESTED requirement added by this
		 * function; retry with the caller's original type and mask.
		 */
		alg = __crypto_alg_lookup(name, type, mask);
		if (alg && !crypto_is_larval(alg)) {
			/* Test failed */
			crypto_mod_put(alg);
			alg = ERR_PTR(-ELIBBAD);
		}
	}
	up_read(&crypto_alg_sem);

	return alg;
}

/*
 * Find @name, loading modules if permitted, and fall back to creating a
 * larval for it.
 *
 * Returns a referenced algorithm or larval, or an ERR_PTR; a NULL @name
 * yields ERR_PTR(-ENOENT).
 */
static struct crypto_alg *crypto_larval_lookup(const char *name, u32 type,
					       u32 mask)
{
	struct crypto_alg *alg;

	if (!name)
		return ERR_PTR(-ENOENT);

	/* Callers may not select on the LARVAL or DEAD bits. */
	type &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD);
	mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD);

	alg = crypto_alg_lookup(name, type, mask);
	if (!alg && !(mask & CRYPTO_NOLOAD)) {
		request_module("crypto-%s", name);

		/*
		 * The "-all" alias is skipped only when the mask selects
		 * CRYPTO_ALG_NEED_FALLBACK and the type has that bit clear.
		 */
		if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask &
		      CRYPTO_ALG_NEED_FALLBACK))
			request_module("crypto-%s-all", name);

		alg = crypto_alg_lookup(name, type, mask);
	}

	if (!IS_ERR_OR_NULL(alg) && crypto_is_larval(alg))
		alg = crypto_larval_wait(alg, type, mask);
	else if (alg)
		/* A real algorithm or an error pointer: return it unchanged. */
		;
	else if (!(mask & CRYPTO_ALG_TESTED))
		alg = crypto_larval_add(name, type, mask);
	else
		alg = ERR_PTR(-ENOENT);

	return alg;
}

/*
 * Run the crypto notifier chain with (@val, @v).  If the first call
 * returns NOTIFY_DONE, request the "cryptomgr" module and call the chain
 * once more.  Returns the result of the last chain call.
 */
int crypto_probing_notify(unsigned long val, void *v)
{
	int ok;

	ok = blocking_notifier_call_chain(&crypto_chain, val, v);
	if (ok == NOTIFY_DONE) {
		request_module("cryptomgr");
		ok = blocking_notifier_call_chain(&crypto_chain, val, v);
	}

	return ok;
}
EXPORT_SYMBOL_GPL(crypto_probing_notify);

/*
 * crypto_alg_mod_lookup - look up an algorithm, probing for it if needed
 * @name: algorithm or driver name
 * @type: requested flag bits
 * @mask: which bits of @type are significant
 *
 * Returns a referenced algorithm or an ERR_PTR.
 */
struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask)
{
	struct crypto_alg *alg;
	struct crypto_alg *larval;
	int ok;

	/*
	 * If the internal flag is set for a cipher, require a caller to
	 * invoke the cipher with the internal flag to use that cipher.
	 * Also, if a caller wants to allocate a cipher that may or may
	 * not be an internal cipher, use type | CRYPTO_ALG_INTERNAL and
	 * !(mask & CRYPTO_ALG_INTERNAL).
*/
	if (!((type | mask) & CRYPTO_ALG_INTERNAL))
		mask |= CRYPTO_ALG_INTERNAL;

	larval = crypto_larval_lookup(name, type, mask);
	/* Errors and real (non-larval) algorithms are returned directly. */
	if (IS_ERR(larval) || !crypto_is_larval(larval))
		return larval;

	/* Ask the notifier chain to construct the algorithm for this larval. */
	ok = crypto_probing_notify(CRYPTO_MSG_ALG_REQUEST, larval);

	if (ok == NOTIFY_STOP)
		alg = crypto_larval_wait(larval, type, mask);
	else {
		crypto_mod_put(larval);
		alg = ERR_PTR(-ENOENT);
	}
	crypto_larval_kill(container_of(larval, struct crypto_larval, alg));
	return alg;
}
EXPORT_SYMBOL_GPL(crypto_alg_mod_lookup);

/* Call tfm->exit if the algorithm has a cra_type and the hook is set. */
static void crypto_exit_ops(struct crypto_tfm *tfm)
{
	const struct crypto_type *type = tfm->__crt_alg->cra_type;

	if (type && tfm->exit)
		tfm->exit(tfm);
}

/*
 * Compute the context size for @alg: the part of cra_alignmask above the
 * default tfm context alignment, plus either the frontend's ctxsize() or,
 * for algorithms without a cra_type, the cipher context size.
 */
static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask)
{
	const struct crypto_type *type_obj = alg->cra_type;
	unsigned int len;

	len = alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1);
	if (type_obj)
		return len + type_obj->ctxsize(alg, type, mask);

	switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) {
	default:
		/* Only CRYPTO_ALG_TYPE_CIPHER is handled without a cra_type. */
		BUG();

	case CRYPTO_ALG_TYPE_CIPHER:
		len += crypto_cipher_ctxsize(alg);
		break;
	}

	return len;
}

/* Set CRYPTO_ALG_DYING on @alg under the write lock. */
void crypto_shoot_alg(struct crypto_alg *alg)
{
	down_write(&crypto_alg_sem);
	alg->cra_flags |= CRYPTO_ALG_DYING;
	up_write(&crypto_alg_sem);
}
EXPORT_SYMBOL_GPL(crypto_shoot_alg);

/*
 * __crypto_alloc_tfmgfp - allocate and initialise a transform for @alg
 * @alg: algorithm the transform is bound to
 * @type: passed to crypto_ctxsize()
 * @mask: passed to crypto_ctxsize()
 * @gfp: allocation flags
 *
 * Returns the new transform (refcount 1) or an ERR_PTR.  This function
 * does not take or drop a reference on @alg.
 */
struct crypto_tfm *__crypto_alloc_tfmgfp(struct crypto_alg *alg, u32 type,
					 u32 mask, gfp_t gfp)
{
	struct crypto_tfm *tfm;
	unsigned int tfm_size;
	int err = -ENOMEM;

	tfm_size = sizeof(*tfm) + crypto_ctxsize(alg, type, mask);
	tfm = kzalloc(tfm_size, gfp);
	if (tfm == NULL)
		goto out_err;

	tfm->__crt_alg = alg;
	refcount_set(&tfm->refcnt, 1);

	/* cra_init() runs only when no exit hook is set on the tfm. */
	if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm)))
		goto cra_init_failed;

	goto out;

cra_init_failed:
	crypto_exit_ops(tfm);
	/* An -EAGAIN from cra_init() marks the algorithm as dying. */
	if (err == -EAGAIN)
		crypto_shoot_alg(alg);
	kfree(tfm);
out_err:
	tfm = ERR_PTR(err);
out:
	return tfm;
}
EXPORT_SYMBOL_GPL(__crypto_alloc_tfmgfp);

/* GFP_KERNEL wrapper around __crypto_alloc_tfmgfp(). */
struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
				      u32 mask)
{
	return __crypto_alloc_tfmgfp(alg,
type, mask, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(__crypto_alloc_tfm);

/*
 *	crypto_alloc_base - Locate algorithm and allocate transform
 *	@alg_name: Name of algorithm
 *	@type: Type of algorithm
 *	@mask: Mask for type comparison
 *
 *	This function should not be used by new algorithm types.
 *	Please use crypto_alloc_tfm instead.
 *
 *	crypto_alloc_base() will first attempt to locate an already loaded
 *	algorithm.  If that fails and the kernel supports dynamically loadable
 *	modules, it will then attempt to load a module of the same name or
 *	alias.  If that fails it will send a query to any loaded crypto manager
 *	to construct an algorithm on the fly.  A refcount is grabbed on the
 *	algorithm which is then associated with the new transform.
 *
 *	The returned transform is of a non-determinate type.  Most people
 *	should use one of the more specific allocation functions such as
 *	crypto_alloc_skcipher().
 *
 *	In case of error the return value is an error pointer.
 */
struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask)
{
	struct crypto_tfm *tfm;
	int err;

	/* Retry for as long as the failure is -EAGAIN. */
	for (;;) {
		struct crypto_alg *alg;

		alg = crypto_alg_mod_lookup(alg_name, type, mask);
		if (IS_ERR(alg)) {
			err = PTR_ERR(alg);
			goto err;
		}

		tfm = __crypto_alloc_tfm(alg, type, mask);
		if (!IS_ERR(tfm))
			return tfm;

		/* Allocation failed: give back the algorithm reference. */
		crypto_mod_put(alg);
		err = PTR_ERR(tfm);

err:
		if (err != -EAGAIN)
			break;
		/* A pending fatal signal ends the retry loop with -EINTR. */
		if (fatal_signal_pending(current)) {
			err = -EINTR;
			break;
		}
	}

	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(crypto_alloc_base);

/*
 * Allocate zeroed memory for a frontend transform on NUMA node @node.
 *
 * The allocation holds frontend->tfmsize bytes, then a struct crypto_tfm,
 * then frontend->extsize(alg) bytes.  The embedded crypto_tfm is bound to
 * @alg with refcount 1.
 *
 * Returns the start of the allocation (not the crypto_tfm) or
 * ERR_PTR(-ENOMEM).
 */
static void *crypto_alloc_tfmmem(struct crypto_alg *alg,
				 const struct crypto_type *frontend, int node,
				 gfp_t gfp)
{
	struct crypto_tfm *tfm;
	unsigned int tfmsize;
	unsigned int total;
	char *mem;

	tfmsize = frontend->tfmsize;
	total = tfmsize + sizeof(*tfm) + frontend->extsize(alg);

	mem = kzalloc_node(total, gfp, node);
	if (mem == NULL)
		return ERR_PTR(-ENOMEM);

	tfm = (struct crypto_tfm *)(mem + tfmsize);
	tfm->__crt_alg = alg;
	tfm->node = node;
	refcount_set(&tfm->refcnt, 1);

	return mem;
}

void
*crypto_create_tfm_node(struct crypto_alg *alg,
			     const struct crypto_type *frontend, int node)
{
	/*
	 * Allocate a frontend transform for @alg on @node and run the
	 * frontend's init_tfm() followed by the algorithm's cra_init().
	 * Returns the start of the transform memory or an ERR_PTR.
	 */
	struct crypto_tfm *tfm;
	char *mem;
	int err;

	mem = crypto_alloc_tfmmem(alg, frontend, node, GFP_KERNEL);
	if (IS_ERR(mem))
		goto out;

	tfm = (struct crypto_tfm *)(mem + frontend->tfmsize);
	/* The fb pointer initially refers to the transform itself. */
	tfm->fb = tfm;

	err = frontend->init_tfm(tfm);
	if (err)
		goto out_free_tfm;

	/* cra_init() runs only if init_tfm() left no exit hook on the tfm. */
	if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm)))
		goto cra_init_failed;

	goto out;

cra_init_failed:
	crypto_exit_ops(tfm);
out_free_tfm:
	/* An -EAGAIN from either init step marks the algorithm as dying. */
	if (err == -EAGAIN)
		crypto_shoot_alg(alg);
	kfree(mem);
	mem = ERR_PTR(err);
out:
	return mem;
}
EXPORT_SYMBOL_GPL(crypto_create_tfm_node);

/*
 * crypto_clone_tfm - allocate a new transform for the algorithm of @otfm
 * @frontend: frontend type describing the memory layout
 * @otfm: transform whose algorithm, node and crt_flags are reused
 *
 * Takes a new reference on the algorithm and allocates with GFP_ATOMIC.
 * Neither init_tfm() nor cra_init() is called here.
 *
 * Returns the start of the new transform memory, ERR_PTR(-ESTALE) if the
 * algorithm reference could not be taken, or the allocation error.
 */
void *crypto_clone_tfm(const struct crypto_type *frontend,
		       struct crypto_tfm *otfm)
{
	struct crypto_alg *alg = otfm->__crt_alg;
	struct crypto_tfm *tfm;
	char *mem;

	mem = ERR_PTR(-ESTALE);
	if (unlikely(!crypto_mod_get(alg)))
		goto out;

	mem = crypto_alloc_tfmmem(alg, frontend, otfm->node, GFP_ATOMIC);
	if (IS_ERR(mem)) {
		crypto_mod_put(alg);
		goto out;
	}

	tfm = (struct crypto_tfm *)(mem + frontend->tfmsize);
	tfm->crt_flags = otfm->crt_flags;
	tfm->fb = tfm;

out:
	return mem;
}
EXPORT_SYMBOL_GPL(crypto_clone_tfm);

/*
 * crypto_find_alg - look up an algorithm on behalf of a frontend
 *
 * When @frontend is given, @type and @mask are first restricted to
 * frontend->maskclear and then extended with frontend->type and
 * frontend->maskset respectively.  The result is passed to
 * crypto_alg_mod_lookup().
 */
struct crypto_alg *crypto_find_alg(const char *alg_name,
				   const struct crypto_type *frontend,
				   u32 type, u32 mask)
{
	if (frontend) {
		type &= frontend->maskclear;
		mask &= frontend->maskclear;
		type |= frontend->type;
		mask |= frontend->maskset;
	}

	return crypto_alg_mod_lookup(alg_name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_find_alg);

/*
 *	crypto_alloc_tfm_node - Locate algorithm and allocate transform
 *	@alg_name: Name of algorithm
 *	@frontend: Frontend algorithm type
 *	@type: Type of algorithm
 *	@mask: Mask for type comparison
 *	@node: NUMA node in which users desire to put requests, if node is
 *		NUMA_NO_NODE, it means users have no special requirement.
 *
 *	crypto_alloc_tfm() will first attempt to locate an already loaded
 *	algorithm.
If that fails and the kernel supports dynamically loadable
 *	modules, it will then attempt to load a module of the same name or
 *	alias.  If that fails it will send a query to any loaded crypto manager
 *	to construct an algorithm on the fly.  A refcount is grabbed on the
 *	algorithm which is then associated with the new transform.
 *
 *	The returned transform is of a non-determinate type.  Most people
 *	should use one of the more specific allocation functions such as
 *	crypto_alloc_skcipher().
 *
 *	In case of error the return value is an error pointer.
 */

void *crypto_alloc_tfm_node(const char *alg_name,
		       const struct crypto_type *frontend, u32 type, u32 mask,
		       int node)
{
	void *tfm;
	int err;

	/* Retry for as long as the failure is -EAGAIN. */
	for (;;) {
		struct crypto_alg *alg;

		alg = crypto_find_alg(alg_name, frontend, type, mask);
		if (IS_ERR(alg)) {
			err = PTR_ERR(alg);
			goto err;
		}

		tfm = crypto_create_tfm_node(alg, frontend, node);
		if (!IS_ERR(tfm))
			return tfm;

		/* Creation failed: give back the algorithm reference. */
		crypto_mod_put(alg);
		err = PTR_ERR(tfm);

err:
		if (err != -EAGAIN)
			break;
		/* A pending fatal signal ends the retry loop with -EINTR. */
		if (fatal_signal_pending(current)) {
			err = -EINTR;
			break;
		}
	}

	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(crypto_alloc_tfm_node);

/*
 *	crypto_destroy_tfm - Free crypto transform
 *	@mem: Start of tfm slab
 *	@tfm: Transform to free
 *
 *	This function frees up the transform and any associated resources,
 *	then drops the refcount on the associated algorithm.
*/ void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm) { struct crypto_alg *alg; if (IS_ERR_OR_NULL(mem)) return; if (!refcount_dec_and_test(&tfm->refcnt)) return; alg = tfm->__crt_alg; if (!tfm->exit && alg->cra_exit) alg->cra_exit(tfm); crypto_exit_ops(tfm); crypto_mod_put(alg); kfree_sensitive(mem); } EXPORT_SYMBOL_GPL(crypto_destroy_tfm); int crypto_has_alg(const char *name, u32 type, u32 mask) { int ret = 0; struct crypto_alg *alg = crypto_alg_mod_lookup(name, type, mask); if (!IS_ERR(alg)) { crypto_mod_put(alg); ret = 1; } return ret; } EXPORT_SYMBOL_GPL(crypto_has_alg); void crypto_req_done(void *data, int err) { struct crypto_wait *wait = data; if (err == -EINPROGRESS) return; wait->err = err; complete(&wait->completion); } EXPORT_SYMBOL_GPL(crypto_req_done); void crypto_destroy_alg(struct crypto_alg *alg) { if (alg->cra_type && alg->cra_type->destroy) alg->cra_type->destroy(alg); if (alg->cra_destroy) alg->cra_destroy(alg); } EXPORT_SYMBOL_GPL(crypto_destroy_alg); struct crypto_async_request *crypto_request_clone( struct crypto_async_request *req, size_t total, gfp_t gfp) { struct crypto_tfm *tfm = req->tfm; struct crypto_async_request *nreq; nreq = kmemdup(req, total, gfp); if (!nreq) { req->tfm = tfm->fb; return req; } nreq->flags &= ~CRYPTO_TFM_REQ_ON_STACK; return nreq; } EXPORT_SYMBOL_GPL(crypto_request_clone); MODULE_DESCRIPTION("Cryptographic core API"); MODULE_LICENSE("GPL");
1 1 1 1 1 24 23 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 22 23 23 23 1251 1249 1 1 93 93 2402 2405 139 137 4 1 1 1 1 1 125 125 125 125 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 
491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 
991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle firewalling * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> * Bart De Schuymer <bdschuym@pandora.be> * * Lennert dedicates this file to Kerstin Wurdinger. 
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <linux/netfilter_bridge.h>
#include <uapi/linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_arp.h>
#include <linux/in_route.h>
#include <linux/rculist.h>
#include <linux/inetdevice.h>

#include <net/ip.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/dst_metadata.h>
#include <net/route.h>
#include <net/netfilter/br_netfilter.h>
#include <net/netns/generic.h>
#include <net/inet_dscp.h>

#include <linux/uaccess.h>
#include "br_private.h"
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#endif

/* Index passed to net_generic() to obtain the per-netns struct brnf_net. */
static unsigned int brnf_net_id __read_mostly;

/* Per-network-namespace br_netfilter settings. */
struct brnf_net {
	bool enabled;

#ifdef CONFIG_SYSCTL
	struct ctl_table_header *ctl_hdr;
#endif

	/* default value is 1 */
	int call_iptables;
	int call_ip6tables;
	int call_arptables;

	/* default value is 0 */
	int filter_vlan_tagged;
	int filter_pppoe_tagged;
	int pass_vlan_indev;
};

/*
 * True when the skb carries no VLAN tag in its metadata and its protocol
 * is IPv4 / IPv6 / ARP respectively.
 */
#define IS_IP(skb) \
	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))

#define IS_IPV6(skb) \
	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6))

#define IS_ARP(skb) \
	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP))

/*
 * Return the protocol carried inside a VLAN frame: skb->protocol when the
 * tag is present in the skb metadata, the encapsulated protocol from the
 * VLAN Ethernet header when skb->protocol is ETH_P_8021Q, and 0 otherwise.
 */
static inline __be16 vlan_proto(const struct sk_buff *skb)
{
	if (skb_vlan_tag_present(skb))
		return skb->protocol;
	else if (skb->protocol == htons(ETH_P_8021Q))
		return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
	else
		return 0;
}

/* True for VLAN-carried IPv4 when filter_vlan_tagged is set for @net. */
static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net)
{
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged;
}
static inline bool is_vlan_ipv6(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return vlan_proto(skb) == htons(ETH_P_IPV6) && brnet->filter_vlan_tagged; } static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged; } static inline __be16 pppoe_proto(const struct sk_buff *skb) { return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + sizeof(struct pppoe_hdr))); } static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return skb->protocol == htons(ETH_P_PPP_SES) && pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged; } static inline bool is_pppoe_ipv6(const struct sk_buff *skb, const struct net *net) { struct brnf_net *brnet = net_generic(net, brnf_net_id); return skb->protocol == htons(ETH_P_PPP_SES) && pppoe_proto(skb) == htons(PPP_IPV6) && brnet->filter_pppoe_tagged; } /* largest possible L2 header, see br_nf_dev_queue_xmit() */ #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) struct brnf_frag_data { local_lock_t bh_lock; char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH]; u8 encap_size; u8 size; u16 vlan_tci; __be16 vlan_proto; }; static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; static void nf_bridge_info_free(struct sk_buff *skb) { skb_ext_del(skb, SKB_EXT_BRIDGE_NF); } static inline struct net_device *bridge_parent(const struct net_device *dev) { struct net_bridge_port *port; port = br_port_get_rcu(dev); return port ? 
port->br->dev : NULL;
}

/* Add (or make writable) the bridge-netfilter skb extension via skb_ext_add(). */
static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
{
	return skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
}

/*
 * Length of the encapsulation header in front of the network header:
 * VLAN_HLEN for 802.1Q, PPPOE_SES_HLEN for PPPoE session frames, else 0.
 */
unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
{
	switch (skb->protocol) {
	case __cpu_to_be16(ETH_P_8021Q):
		return VLAN_HLEN;
	case __cpu_to_be16(ETH_P_PPP_SES):
		return PPPOE_SES_HLEN;
	default:
		return 0;
	}
}

/* Pull the encapsulation header and advance network_header past it. */
static inline void nf_bridge_pull_encap_header(struct sk_buff *skb)
{
	unsigned int len = nf_bridge_encap_header_len(skb);

	skb_pull(skb, len);
	skb->network_header += len;
}

/* Same as nf_bridge_pull_encap_header() but uses skb_pull_rcsum(). */
static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb)
{
	unsigned int len = nf_bridge_encap_header_len(skb);

	skb_pull_rcsum(skb, len);
	skb->network_header += len;
}

/* When handing a packet over to the IP layer
 * check whether we have a skb that is in the
 * expected format
 *
 * Returns 0 if the IPv4 header passes the checks below, -1 otherwise.
 */

static int br_validate_ipv4(struct net *net, struct sk_buff *skb)
{
	const struct iphdr *iph;
	u32 len;

	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto inhdr_error;

	iph = ip_hdr(skb);

	/* Basic sanity checks */
	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;

	/* Make the full header (including options) available. */
	if (!pskb_may_pull(skb, iph->ihl*4))
		goto inhdr_error;

	/* Re-read the header pointer after the second pskb_may_pull(). */
	iph = ip_hdr(skb);
	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto csum_error;

	len = skb_ip_totlen(skb);
	if (skb->len < len) {
		__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
		goto drop;
	} else if (len < (iph->ihl*4))
		goto inhdr_error;

	/* Trim the skb down to the IP total length. */
	if (pskb_trim_rcsum(skb, len)) {
		__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
	/* We should really parse IP options here but until
	 * somebody who actually uses IP options complains to
	 * us we'll just silently ignore the options because
	 * we're lazy!
*/
	return 0;

	/* A checksum error falls through and also counts as a header error. */
csum_error:
	__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
	__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
	return -1;
}

/*
 * Restore skb->protocol from the original protocol recorded in the
 * bridge-netfilter info; BRNF_PROTO_UNCHANGED leaves it as is.
 */
void nf_bridge_update_protocol(struct sk_buff *skb)
{
	const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	switch (nf_bridge->orig_proto) {
	case BRNF_PROTO_8021Q:
		skb->protocol = htons(ETH_P_8021Q);
		break;
	case BRNF_PROTO_PPPOE:
		skb->protocol = htons(ETH_P_PPP_SES);
		break;
	case BRNF_PROTO_UNCHANGED:
		break;
	}
}

/* Obtain the correct destination MAC address, while preserving the original
 * source MAC address. If we already know this address, we just copy it. If we
 * don't, we use the neighbour framework to find out. In both cases, we make
 * sure that br_handle_frame_finish() is called afterwards.
 *
 * The skb is freed and 0 is returned when no bridge parent, no neighbour
 * or no physical input device is found.
 */
int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct neighbour *neigh;
	struct dst_entry *dst;

	skb->dev = bridge_parent(skb->dev);
	if (!skb->dev)
		goto free_skb;
	dst = skb_dst(skb);
	neigh = dst_neigh_lookup_skb(dst, skb);
	if (neigh) {
		struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
		int ret;

		/* Connected neighbour with a cached hardware header. */
		if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) &&
		    READ_ONCE(neigh->hh.hh_len)) {
			struct net_device *br_indev;

			br_indev = nf_bridge_get_physindev(skb, net);
			if (!br_indev) {
				neigh_release(neigh);
				goto free_skb;
			}

			neigh_hh_bridge(&neigh->hh, skb);
			skb->dev = br_indev;

			ret = br_handle_frame_finish(net, sk, skb);
		} else {
			/* the neighbour function below overwrites the complete
			 * MAC header, so we save the Ethernet source address and
			 * protocol number.
*/
			/* Copies the ETH_HLEN - ETH_ALEN bytes just before skb->data. */
			skb_copy_from_linear_data_offset(skb,
							 -(ETH_HLEN-ETH_ALEN),
							 nf_bridge->neigh_header,
							 ETH_HLEN-ETH_ALEN);
			/* tell br_dev_xmit to continue with forwarding */
			nf_bridge->bridged_dnat = 1;
			/* FIXME Need to refragment */
			ret = READ_ONCE(neigh->output)(neigh, skb);
		}
		neigh_release(neigh);
		return ret;
	}
free_skb:
	kfree_skb(skb);
	return 0;
}

/*
 * True when the current IPv4 destination address differs from the one
 * saved in nf_bridge->ipv4_daddr.
 */
static inline bool
br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb,
			     const struct nf_bridge_info *nf_bridge)
{
	return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr;
}

/* This requires some explaining. If DNAT has taken place,
 * we will need to fix up the destination Ethernet address.
 * This is also true when SNAT takes place (for the reply direction).
 *
 * There are two cases to consider:
 * 1. The packet was DNAT'ed to a device in the same bridge
 *    port group as it was received on. We can still bridge
 *    the packet.
 * 2. The packet was DNAT'ed to a different device, either
 *    a non-bridged device or another bridge port group.
 *    The packet will need to be routed.
 *
 * The correct way of distinguishing between these two cases is to
 * call ip_route_input() and to look at skb->dst->dev, which is
 * changed to the destination device if ip_route_input() succeeds.
 *
 * Let's first consider the case that ip_route_input() succeeds:
 *
 * If the output device equals the logical bridge device the packet
 * came in on, we can consider this bridging. The corresponding MAC
 * address will be obtained in br_nf_pre_routing_finish_bridge.
 * Otherwise, the packet is considered to be routed and we just
 * change the destination MAC address so that the packet will
 * later be passed up to the IP stack to be routed. For a redirected
 * packet, ip_route_input() will give back the localhost as output device,
 * which differs from the bridge device.
 *
 * Let's now consider the case that ip_route_input() fails:
 *
 * This can be because the destination address is martian, in which case
 * the packet will be dropped.
* If IP forwarding is disabled, ip_route_input() will fail, while
 * ip_route_output_key() can return success. The source
 * address for ip_route_output_key() is set to zero, so ip_route_output_key()
 * thinks we're handling a locally generated packet and won't care
 * if IP forwarding is enabled. If the output device equals the logical bridge
 * device, we proceed as if ip_route_input() succeeded. If it differs from the
 * logical bridge port or if ip_route_output_key() fails we drop the packet.
 */
static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
	struct net_device *dev = skb->dev, *br_indev;
	const struct iphdr *iph = ip_hdr(skb);
	enum skb_drop_reason reason;
	struct rtable *rt;

	br_indev = nf_bridge_get_physindev(skb, net);
	if (!br_indev) {
		kfree_skb(skb);
		return 0;
	}

	nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;

	/* Restore the PACKET_OTHERHOST type that setup_pre_routing() hid. */
	if (nf_bridge->pkt_otherhost) {
		skb->pkt_type = PACKET_OTHERHOST;
		nf_bridge->pkt_otherhost = false;
	}
	nf_bridge->in_prerouting = 0;
	if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) {
		/*
		 * NOTE(review): when ip_route_input() fails, the packet is
		 * dropped right here.  The ip_route_output_key() fallback
		 * described in the comment above this function has no
		 * counterpart in this code - confirm and refresh that comment.
		 */
		reason = ip_route_input(skb, iph->daddr, iph->saddr,
					ip4h_dscp(iph), dev);
		if (reason) {
			kfree_skb_reason(skb, reason);
			return 0;
		} else {
			/* Routed back to the same device: keep bridging. */
			if (skb_dst(skb)->dev == dev) {
				skb->dev = br_indev;
				nf_bridge_update_protocol(skb);
				nf_bridge_push_encap_header(skb);
				br_nf_hook_thresh(NF_BR_PRE_ROUTING,
						  net, sk, skb, skb->dev,
						  NULL,
						  br_nf_pre_routing_finish_bridge);
				return 0;
			}
			/* Otherwise address the frame to this device. */
			ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
			skb->pkt_type = PACKET_HOST;
		}
	} else {
		/* Destination unchanged: attach the bridge's rtable without a reference. */
		rt = bridge_parent_rtable(br_indev);
		if (!rt) {
			kfree_skb(skb);
			return 0;
		}
		skb_dst_drop(skb);
		skb_dst_set_noref(skb, &rt->dst);
	}

	skb->dev = br_indev;
	nf_bridge_update_protocol(skb);
	nf_bridge_push_encap_header(skb);
	br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL,
			  br_handle_frame_finish);
	return 0;
}

static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
					       const struct net_device *dev,
const struct net *net)
{
	struct net_device *vlan, *br;
	struct brnf_net *brnet = net_generic(net, brnf_net_id);

	br = bridge_parent(dev);

	/* Use the bridge itself unless pass_vlan_indev is set and the skb
	 * carries a VLAN tag.
	 */
	if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
		return br;

	/* Look for a VLAN device matching the tag; fall back to the bridge
	 * when none is found.
	 */
	vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
				    skb_vlan_tag_get(skb) & VLAN_VID_MASK);

	return vlan ? vlan : br;
}

/* Some common code for IPv4/IPv6
 *
 * Prepares @skb for the inet PRE_ROUTING hooks: a PACKET_OTHERHOST frame is
 * temporarily marked PACKET_HOST (remembered in nf_bridge->pkt_otherhost),
 * the receiving ifindex is saved in nf_bridge->physinif, skb->dev is
 * replaced by the logical device from brnf_get_logical_dev(), and the
 * original 802.1Q / PPPoE encapsulation is recorded in orig_proto.
 *
 * Returns the new skb->dev.
 */
struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	if (skb->pkt_type == PACKET_OTHERHOST) {
		skb->pkt_type = PACKET_HOST;
		nf_bridge->pkt_otherhost = true;
	}

	nf_bridge->in_prerouting = 1;
	nf_bridge->physinif = skb->dev->ifindex;
	skb->dev = brnf_get_logical_dev(skb, skb->dev, net);

	if (skb->protocol == htons(ETH_P_8021Q))
		nf_bridge->orig_proto = BRNF_PROTO_8021Q;
	else if (skb->protocol == htons(ETH_P_PPP_SES))
		nf_bridge->orig_proto = BRNF_PROTO_PPPOE;

	/* Must drop socket now because of tproxy. */
	skb_orphan(skb);
	return skb->dev;
}

/* Direct IPv6 traffic to br_nf_pre_routing_ipv6.
 * Replicate the checks that IPv4 does on packet reception.
 * Set skb->dev to the bridge device (i.e. parent of the
 * receiving device) to make netfilter happy, the REDIRECT
 * target in particular. Save the original destination IP
 * address to be able to detect DNAT afterwards.
*/ static unsigned int br_nf_pre_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge; struct net_bridge_port *p; struct net_bridge *br; __u32 len = nf_bridge_encap_header_len(skb); struct brnf_net *brnet; if (unlikely(!pskb_may_pull(skb, len))) return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0); p = br_port_get_rcu(state->in); if (p == NULL) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); br = p->br; brnet = net_generic(state->net, brnf_net_id); if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) { if (!brnet->call_ip6tables && !br_opt_get(br, BROPT_NF_CALL_IP6TABLES)) return NF_ACCEPT; if (!ipv6_mod_enabled()) { pr_warn_once("Module ipv6 is disabled, so call_ip6tables is not supported."); return NF_DROP_REASON(skb, SKB_DROP_REASON_IPV6DISABLED, 0); } nf_bridge_pull_encap_header_rcsum(skb); return br_nf_pre_routing_ipv6(priv, skb, state); } if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES)) return NF_ACCEPT; if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) && !is_pppoe_ip(skb, state->net)) return NF_ACCEPT; nf_bridge_pull_encap_header_rcsum(skb); if (br_validate_ipv4(state->net, skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0); if (!nf_bridge_alloc(skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0); if (!setup_pre_routing(skb, state->net)) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); nf_bridge = nf_bridge_info_get(skb); nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr; skb->protocol = htons(ETH_P_IP); skb->transport_header = skb->network_header + ip_hdr(skb)->ihl * 4; NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb, skb->dev, NULL, br_nf_pre_routing_finish); return NF_STOLEN; } #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* conntracks' nf_confirm logic cannot handle cloned skbs referencing * the same nf_conn entry, which will happen for multicast (broadcast) * Frames on bridges. 
*
 * Example:
 *    macvlan0
 *    br0
 *  ethX  ethY
 *
 * ethX (or Y) receives multicast or broadcast packet containing
 * an IP packet, not yet in conntrack table.
 *
 * 1. skb passes through bridge and fake-ip (br_netfilter) Prerouting.
 *    -> skb->_nfct now references an unconfirmed entry
 * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge
 *    interface.
 * 3. skb gets passed up the stack.
 * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb
 *    and schedules a work queue to send them out on the lower devices.
 *
 *    The clone skb->_nfct is not a copy, it is the same entry as the
 *    original skb.  The macvlan rx handler then returns RX_HANDLER_PASS.
 * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb.
 *
 * The Macvlan broadcast worker and normal confirm path will race.
 *
 * This race will not happen if step 2 already confirmed a clone. In that
 * case later steps perform skb_clone() with skb->_nfct already confirmed (in
 * hash table).  This works fine.
 *
 * But such confirmation won't happen when eb/ip/nftables rules dropped the
 * packets before they reached the nf_confirm step in postrouting.
 *
 * Work around this problem by explicit confirmation of the entry at
 * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed
 * entry.
 *
 */
static unsigned int br_nf_local_in(void *priv,
				   struct sk_buff *skb,
				   const struct nf_hook_state *state)
{
	bool promisc = BR_INPUT_SKB_CB(skb)->promisc;
	struct nf_conntrack *nfct = skb_nfct(skb);
	const struct nf_ct_hook *ct_hook;
	struct nf_conn *ct;
	int ret;

	/* Frames flagged promisc in the bridge control block only get their
	 * conntrack reference reset.
	 */
	if (promisc) {
		nf_reset_ct(skb);
		return NF_ACCEPT;
	}

	/* Nothing to confirm without a conntrack entry; PACKET_HOST frames
	 * are left alone as well.
	 */
	if (!nfct || skb->pkt_type == PACKET_HOST)
		return NF_ACCEPT;

	ct = container_of(nfct, struct nf_conn, ct_general);
	if (likely(nf_ct_is_confirmed(ct)))
		return NF_ACCEPT;

	/* An unconfirmed entry is expected to have exactly one reference;
	 * otherwise warn once and detach it from this skb.
	 */
	if (WARN_ON_ONCE(refcount_read(&nfct->use) != 1)) {
		nf_reset_ct(skb);
		return NF_ACCEPT;
	}

	WARN_ON_ONCE(skb_shared(skb));

	/* We can't call nf_confirm here, it would create a dependency
	 * on nf_conntrack module.
*/
	ct_hook = rcu_dereference(nf_ct_hook);
	if (!ct_hook) {
		/* No hook registered: drop our reference to the entry. */
		skb->_nfct = 0ul;
		nf_conntrack_put(nfct);
		return NF_ACCEPT;
	}

	/* confirm() runs with the encapsulation header pulled; it is pushed
	 * back unless the skb was stolen.
	 */
	nf_bridge_pull_encap_header(skb);
	ret = ct_hook->confirm(skb);
	switch (ret & NF_VERDICT_MASK) {
	case NF_STOLEN:
		return NF_STOLEN;
	default:
		nf_bridge_push_encap_header(skb);
		break;
	}

	return ret;
}
#endif

/* PF_BRIDGE/FORWARD *************************************************/
/* Continuation used after the inet/arp FORWARD hooks accepted the skb.
 *
 * For non-ARP traffic the fragment size from the IP(v6) control block is
 * saved in nf_bridge, the physical input device is looked up (the skb is
 * freed if it is gone) and the original packet type and protocol are
 * restored. For ARP the input device is read back from skb->cb, where
 * br_nf_forward_arp() stored it. Always returns 0.
 */
static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
	struct net_device *in;

	if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) {

		if (skb->protocol == htons(ETH_P_IP))
			nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;

		if (skb->protocol == htons(ETH_P_IPV6))
			nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;

		in = nf_bridge_get_physindev(skb, net);
		if (!in) {
			kfree_skb(skb);
			return 0;
		}
		if (nf_bridge->pkt_otherhost) {
			skb->pkt_type = PACKET_OTHERHOST;
			nf_bridge->pkt_otherhost = false;
		}
		nf_bridge_update_protocol(skb);
	} else {
		in = *((struct net_device **)(skb->cb));
	}
	nf_bridge_push_encap_header(skb);

	br_nf_hook_thresh(NF_BR_FORWARD, net, sk, skb, in, skb->dev,
			  br_forward_finish);
	return 0;
}


/* Run the NF_INET_FORWARD hooks of family @pf for a bridged IP packet.
 * Packets without nf_bridge info are accepted unchanged.
 */
static unsigned int br_nf_forward_ip(struct sk_buff *skb,
				     const struct nf_hook_state *state,
				     u8 pf)
{
	struct nf_bridge_info *nf_bridge;
	struct net_device *parent;

	nf_bridge = nf_bridge_info_get(skb);
	if (!nf_bridge)
		return NF_ACCEPT;

	/* Need exclusive nf_bridge_info since we might have multiple
	 * different physoutdevs.
*/
	if (!nf_bridge_unshare(skb))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);

	/* Fetch the info again after the unshare call above. */
	nf_bridge = nf_bridge_info_get(skb);
	if (!nf_bridge)
		return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);

	parent = bridge_parent(state->out);
	if (!parent)
		return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);

	nf_bridge_pull_encap_header(skb);

	/* Mark as PACKET_HOST for the inet hooks; br_nf_forward_finish()
	 * restores PACKET_OTHERHOST.
	 */
	if (skb->pkt_type == PACKET_OTHERHOST) {
		skb->pkt_type = PACKET_HOST;
		nf_bridge->pkt_otherhost = true;
	}

	if (pf == NFPROTO_IPV4) {
		if (br_validate_ipv4(state->net, skb))
			return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
		IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
		skb->protocol = htons(ETH_P_IP);
	} else if (pf == NFPROTO_IPV6) {
		if (br_validate_ipv6(state->net, skb))
			return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
		IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
		skb->protocol = htons(ETH_P_IPV6);
	} else {
		/* Only NFPROTO_IPV4 and NFPROTO_IPV6 are handled here. */
		WARN_ON_ONCE(1);
		return NF_DROP;
	}

	nf_bridge->physoutdev = skb->dev;

	NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
		brnf_get_logical_dev(skb, state->in, state->net),
		parent,	br_nf_forward_finish);

	return NF_STOLEN;
}

/* Run the NF_ARP_FORWARD hooks for a bridged ARP frame.
 *
 * Accepts the frame unchanged when state->out is not a bridge port, when
 * call_arptables is off both per-netns and per-bridge, or when the ARP
 * protocol address length is not 4. The input device is stashed in skb->cb
 * for br_nf_forward_finish().
 */
static unsigned int br_nf_forward_arp(struct sk_buff *skb,
				      const struct nf_hook_state *state)
{
	struct net_bridge_port *p;
	struct net_bridge *br;
	struct net_device **d = (struct net_device **)(skb->cb);
	struct brnf_net *brnet;

	p = br_port_get_rcu(state->out);
	if (p == NULL)
		return NF_ACCEPT;
	br = p->br;

	brnet = net_generic(state->net, brnf_net_id);
	if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
		return NF_ACCEPT;

	if (is_vlan_arp(skb, state->net))
		nf_bridge_pull_encap_header(skb);

	if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr))))
		return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0);

	if (arp_hdr(skb)->ar_pln != 4) {
		/* Put the encapsulation header back before letting it go. */
		if (is_vlan_arp(skb, state->net))
			nf_bridge_push_encap_header(skb);
		return NF_ACCEPT;
	}
	*d = state->in;
	NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->net, state->sk, skb,
		state->in, state->out, br_nf_forward_finish);

	return NF_STOLEN;
}

/*
This is the 'purely bridged' case. For IP, we pass the packet to * netfilter with indev and outdev set to the bridge device, * but we are still able to filter on the 'real' indev/outdev * because of the physdev module. For ARP, indev and outdev are the * bridge ports. */ static unsigned int br_nf_forward(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (IS_IP(skb) || is_vlan_ip(skb, state->net) || is_pppoe_ip(skb, state->net)) return br_nf_forward_ip(skb, state, NFPROTO_IPV4); if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) || is_pppoe_ipv6(skb, state->net)) return br_nf_forward_ip(skb, state, NFPROTO_IPV6); if (IS_ARP(skb) || is_vlan_arp(skb, state->net)) return br_nf_forward_arp(skb, state); return NF_ACCEPT; } static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct brnf_frag_data *data; int err; data = this_cpu_ptr(&brnf_frag_data_storage); err = skb_cow_head(skb, data->size); if (err) { kfree_skb(skb); return 0; } if (data->vlan_proto) __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci); skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size); __skb_push(skb, data->encap_size); nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } static int br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { unsigned int mtu = ip_skb_dst_mtu(sk, skb); struct iphdr *iph = ip_hdr(skb); if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } return ip_do_fragment(net, sk, skb, output); } static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); if (nf_bridge->orig_proto == BRNF_PROTO_PPPOE) return PPPOE_SES_HLEN; return 0; } static int 
br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); unsigned int mtu, mtu_reserved; int ret; mtu_reserved = nf_bridge_mtu_reduction(skb); mtu = skb->dev->mtu; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu) mtu = nf_bridge->frag_max_size; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) { nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } /* Fragmentation on metadata/template dst is not supported */ if (unlikely(!skb_valid_dst(skb))) goto drop; /* This is wrong! We should preserve the original fragment * boundaries by preserving frag_list rather than refragmenting. */ if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) && skb->protocol == htons(ETH_P_IP)) { struct brnf_frag_data *data; if (br_validate_ipv4(net, skb)) goto drop; IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); if (skb_vlan_tag_present(skb)) { data->vlan_tci = skb->vlan_tci; data->vlan_proto = skb->vlan_proto; } else { data->vlan_proto = 0; } data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); return ret; } if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); struct brnf_frag_data *data; if (br_validate_ipv6(net, skb)) goto drop; IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); data->encap_size = 
nf_bridge_encap_header_len(skb);
		data->size = ETH_HLEN + data->encap_size;

		skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
						 data->size);

		if (v6ops) {
			ret = v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit);
			local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
			return ret;
		}
		local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);

		/* No IPv6 ops available, so the packet cannot be fragmented. */
		kfree_skb(skb);
		return -EMSGSIZE;
	}
	/* Neither IPv4 nor IPv6 refragmentation applies: send as is. */
	nf_bridge_info_free(skb);
	return br_dev_queue_push_xmit(net, sk, skb);
 drop:
	/* Note: this path reports 0, not an error code. */
	kfree_skb(skb);
	return 0;
}

/* PF_BRIDGE/POST_ROUTING ********************************************/
/* Bridge NF_BR_POST_ROUTING hook: for bridged IPv4/IPv6 packets that carry
 * nf_bridge info with a physoutdev, run the inet POST_ROUTING hooks with
 * the bridge parent as output device and br_nf_dev_queue_xmit as
 * continuation. Everything else is accepted untouched.
 */
static unsigned int br_nf_post_routing(void *priv,
				       struct sk_buff *skb,
				       const struct nf_hook_state *state)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
	struct net_device *realoutdev = bridge_parent(skb->dev);
	u_int8_t pf;

	/* if nf_bridge is set, but ->physoutdev is NULL, this packet came in
	 * on a bridge, but was delivered locally and is now being routed:
	 *
	 * POST_ROUTING was already invoked from the ip stack.
	 */
	if (!nf_bridge || !nf_bridge->physoutdev)
		return NF_ACCEPT;

	if (!realoutdev)
		return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);

	if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
	    is_pppoe_ip(skb, state->net))
		pf = NFPROTO_IPV4;
	else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
		 is_pppoe_ipv6(skb, state->net))
		pf = NFPROTO_IPV6;
	else
		return NF_ACCEPT;

	/* Temporarily PACKET_HOST; br_nf_dev_queue_xmit() restores it. */
	if (skb->pkt_type == PACKET_OTHERHOST) {
		skb->pkt_type = PACKET_HOST;
		nf_bridge->pkt_otherhost = true;
	}

	nf_bridge_pull_encap_header(skb);
	if (pf == NFPROTO_IPV4)
		skb->protocol = htons(ETH_P_IP);
	else
		skb->protocol = htons(ETH_P_IPV6);

	NF_HOOK(pf, NF_INET_POST_ROUTING, state->net, state->sk, skb,
		NULL, realoutdev,
		br_nf_dev_queue_xmit);

	return NF_STOLEN;
}

/* IP/SABOTAGE *****************************************************/

/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING
 * for the second time.
*/
/* Registered at first priority on the IPv4 and IPv6 PRE_ROUTING hooks (see
 * br_nf_ops). When the skb carries nf_bridge info, is not inside the
 * br_netfilter pre-routing pass and its device is neither an L3 master nor
 * an L3 slave, the remaining hooks are skipped by calling okfn directly;
 * sabotage_in_done makes this happen only once per skb.
 */
static unsigned int ip_sabotage_in(void *priv,
				   struct sk_buff *skb,
				   const struct nf_hook_state *state)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	if (nf_bridge) {
		if (nf_bridge->sabotage_in_done)
			return NF_ACCEPT;

		if (!nf_bridge->in_prerouting &&
		    !netif_is_l3_master(skb->dev) &&
		    !netif_is_l3_slave(skb->dev)) {
			nf_bridge->sabotage_in_done = 1;
			state->okfn(state->net, state->sk, skb);
			return NF_STOLEN;
		}
	}

	return NF_ACCEPT;
}

/* This is called when br_netfilter has called into iptables/netfilter,
 * and DNAT has taken place on a bridge-forwarded packet.
 *
 * neigh->output has created a new MAC header, with local br0 MAC
 * as saddr.
 *
 * This restores the original MAC saddr of the bridged packet
 * before invoking bridge forward logic to transmit the packet.
 */
static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
{
	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
	struct net_device *br_indev;

	br_indev = nf_bridge_get_physindev(skb, dev_net(skb->dev));
	if (!br_indev) {
		kfree_skb(skb);
		return;
	}

	skb_pull(skb, ETH_HLEN);
	nf_bridge->bridged_dnat = 0;

	/* neigh_header must hold exactly the bytes copied back below. */
	BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN));

	skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN),
				       nf_bridge->neigh_header,
				       ETH_HLEN - ETH_ALEN);
	skb->dev = br_indev;

	nf_bridge->physoutdev = NULL;
	br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
}

/* br_dev_xmit hook: returns 1 when the skb was a bridged-DNAT packet and
 * has been taken over by br_nf_pre_routing_finish_bridge_slow(), 0 when
 * the caller should continue with it.
 */
static int br_nf_dev_xmit(struct sk_buff *skb)
{
	const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

	if (nf_bridge && nf_bridge->bridged_dnat) {
		br_nf_pre_routing_finish_bridge_slow(skb);
		return 1;
	}
	return 0;
}

static const struct nf_br_ops br_ops = {
	.br_dev_xmit_hook =	br_nf_dev_xmit,
};

/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
 * br_dev_queue_push_xmit is called afterwards */
static const struct nf_hook_ops br_nf_ops[] = {
	{
		.hook = br_nf_pre_routing,
		.pf = NFPROTO_BRIDGE,
		.hooknum = NF_BR_PRE_ROUTING,
		.priority = NF_BR_PRI_BRNF,
	},
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	{
		.hook = br_nf_local_in,
		.pf = NFPROTO_BRIDGE,
		.hooknum = NF_BR_LOCAL_IN,
		.priority = NF_BR_PRI_LAST,
	},
#endif
	{
		.hook = br_nf_forward,
		.pf = NFPROTO_BRIDGE,
		.hooknum = NF_BR_FORWARD,
		.priority = NF_BR_PRI_BRNF,
	},
	{
		.hook = br_nf_post_routing,
		.pf = NFPROTO_BRIDGE,
		.hooknum = NF_BR_POST_ROUTING,
		.priority = NF_BR_PRI_LAST,
	},
	{
		.hook = ip_sabotage_in,
		.pf = NFPROTO_IPV4,
		.hooknum = NF_INET_PRE_ROUTING,
		.priority = NF_IP_PRI_FIRST,
	},
	{
		.hook = ip_sabotage_in,
		.pf = NFPROTO_IPV6,
		.hooknum = NF_INET_PRE_ROUTING,
		.priority = NF_IP6_PRI_FIRST,
	},
};

/* Netdevice notifier: the first time a bridge master device is registered
 * in a network namespace, register br_nf_ops there and mark the namespace
 * enabled. Other events and devices are ignored.
 */
static int brnf_device_event(struct notifier_block *unused, unsigned long event,
			     void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct brnf_net *brnet;
	struct net *net;
	int ret;

	if (event != NETDEV_REGISTER || !netif_is_bridge_master(dev))
		return NOTIFY_DONE;

	ASSERT_RTNL();

	net = dev_net(dev);
	brnet = net_generic(net, brnf_net_id);
	if (brnet->enabled)
		return NOTIFY_OK;

	ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
	if (ret)
		return NOTIFY_BAD;

	brnet->enabled = true;
	return NOTIFY_OK;
}

static struct notifier_block brnf_notifier __read_mostly = {
	.notifier_call = brnf_device_event,
};

/* recursively invokes nf_hook_slow (again), skipping already-called
 * hooks (< NF_BR_PRI_BRNF).
 *
 * Called with rcu read lock held.
 */
int br_nf_hook_thresh(unsigned int hook, struct net *net,
		      struct sock *sk, struct sk_buff *skb,
		      struct net_device *indev,
		      struct net_device *outdev,
		      int (*okfn)(struct net *, struct sock *,
				  struct sk_buff *))
{
	const struct nf_hook_entries *e;
	struct nf_hook_state state;
	struct nf_hook_ops **ops;
	unsigned int i;
	int ret;

	/* No bridge hooks registered for this hook number: continue. */
	e = rcu_dereference(net->nf.hooks_bridge[hook]);
	if (!e)
		return okfn(net, sk, skb);

	ops = nf_hook_entries_get_hook_ops(e);
	for (i = 0; i < e->num_hook_entries; i++) {
		/* These hooks have already been called */
		if (ops[i]->priority < NF_BR_PRI_BRNF)
			continue;

		/* These hooks have not been called yet, run them.
*/
		if (ops[i]->priority > NF_BR_PRI_BRNF)
			break;

		/* take a closer look at NF_BR_PRI_BRNF. */
		if (ops[i]->hook == br_nf_pre_routing) {
			/* This hook diverted the skb to this function,
			 * hooks after this have not been run yet.
			 */
			i++;
			break;
		}
	}

	nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev,
			   sk, net, okfn);

	/* Resume hook traversal at index i; a verdict of 1 means continue. */
	ret = nf_hook_slow(skb, &state, e, i);
	if (ret == 1)
		ret = okfn(net, sk, skb);

	return ret;
}

#ifdef CONFIG_SYSCTL
/* proc handler shared by all br_netfilter sysctls: after a write, any
 * non-zero stored value is normalised to 1. Returns proc_dointvec()'s
 * result.
 */
static
int brnf_sysctl_call_tables(const struct ctl_table *ctl, int write,
			    void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);

	if (write && *(int *)(ctl->data))
		*(int *)(ctl->data) = 1;
	return ret;
}

/* Template sysctl table; the .data pointers are filled in per namespace by
 * br_netfilter_sysctl_init_net(), which relies on this entry order.
 */
static struct ctl_table brnf_table[] = {
	{
		.procname	= "bridge-nf-call-arptables",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-call-iptables",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-call-ip6tables",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-filter-vlan-tagged",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-filter-pppoe-tagged",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
	{
		.procname	= "bridge-nf-pass-vlan-input-dev",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= brnf_sysctl_call_tables,
	},
};

/* Defaults: the three call_*tables switches on, the rest off. */
static inline void br_netfilter_sysctl_default(struct brnf_net *brnf)
{
	brnf->call_iptables = 1;
	brnf->call_ip6tables = 1;
	brnf->call_arptables = 1;
	brnf->filter_vlan_tagged = 0;
	brnf->filter_pppoe_tagged = 0;
	brnf->pass_vlan_indev = 0;
}

/* Register the "net/bridge" sysctls for @net. init_net uses brnf_table
 * directly; every other namespace gets its own kmemdup()'d copy.
 */
static int br_netfilter_sysctl_init_net(struct net *net)
{
	struct ctl_table *table = brnf_table;
	struct brnf_net *brnet;

	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL);
		if (!table)
			return -ENOMEM;
	}

	brnet = net_generic(net,
brnf_net_id);
	/* Indices follow the entry order of brnf_table. */
	table[0].data = &brnet->call_arptables;
	table[1].data = &brnet->call_iptables;
	table[2].data = &brnet->call_ip6tables;
	table[3].data = &brnet->filter_vlan_tagged;
	table[4].data = &brnet->filter_pppoe_tagged;
	table[5].data = &brnet->pass_vlan_indev;

	br_netfilter_sysctl_default(brnet);

	brnet->ctl_hdr = register_net_sysctl_sz(net, "net/bridge", table,
						ARRAY_SIZE(brnf_table));
	if (!brnet->ctl_hdr) {
		/* Only the duplicated table is ours to free. */
		if (!net_eq(net, &init_net))
			kfree(table);

		return -ENOMEM;
	}

	return 0;
}

/* Unregister the namespace's sysctls and free the table copy made for
 * non-init namespaces.
 */
static void br_netfilter_sysctl_exit_net(struct net *net,
					 struct brnf_net *brnet)
{
	/* Grab the table pointer before the header is unregistered. */
	const struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg;

	unregister_net_sysctl_table(brnet->ctl_hdr);
	if (!net_eq(net, &init_net))
		kfree(table);
}

static int __net_init brnf_init_net(struct net *net)
{
	return br_netfilter_sysctl_init_net(net);
}
#endif

/* Per-namespace teardown: unregister the netfilter hooks if
 * brnf_device_event() registered them, then remove the sysctls.
 */
static void __net_exit brnf_exit_net(struct net *net)
{
	struct brnf_net *brnet;

	brnet = net_generic(net, brnf_net_id);
	if (brnet->enabled) {
		nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
		brnet->enabled = false;
	}

#ifdef CONFIG_SYSCTL
	br_netfilter_sysctl_exit_net(net, brnet);
#endif
}

static struct pernet_operations brnf_net_ops __read_mostly = {
#ifdef CONFIG_SYSCTL
	.init = brnf_init_net,
#endif
	.exit = brnf_exit_net,
	.id   = &brnf_net_id,
	.size = sizeof(struct brnf_net),
};

/* Module init: register the pernet ops and the netdevice notifier (undoing
 * the former if the latter fails), then publish br_ops through nf_br_ops.
 */
static int __init br_netfilter_init(void)
{
	int ret;

	ret = register_pernet_subsys(&brnf_net_ops);
	if (ret < 0)
		return ret;

	ret = register_netdevice_notifier(&brnf_notifier);
	if (ret < 0) {
		unregister_pernet_subsys(&brnf_net_ops);
		return ret;
	}

	RCU_INIT_POINTER(nf_br_ops, &br_ops);
	printk(KERN_NOTICE "Bridge firewalling registered\n");
	return 0;
}

/* Module exit: tear down in the reverse order of br_netfilter_init(). */
static void __exit br_netfilter_fini(void)
{
	RCU_INIT_POINTER(nf_br_ops, NULL);
	unregister_netdevice_notifier(&brnf_notifier);
	unregister_pernet_subsys(&brnf_net_ops);
}

module_init(br_netfilter_init);
module_exit(br_netfilter_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Lennert Buytenhek <buytenh@gnu.org>");
MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge");
1 14 2 2 14 13 1 2 2 2 2 4 4 1 1 1 4 4 4 11 2 1 14 14 1 13 2 125 125 125 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Locality-Based Least-Connection scheduling module * * Authors: Wensong Zhang <wensong@gnuchina.org> * * Changes: * Martin Hamilton : fixed the terrible locking bugs * *lock(tbl->lock) ==> *lock(&tbl->lock) * Wensong Zhang : fixed the uninitialized tbl->lock bug * Wensong Zhang : added doing full expiration check to * collect stale entries of 24+ hours when * no partial expire check in a half hour * Julian Anastasov : replaced del_timer call with del_timer_sync * to avoid the possible race between timer * handler and del_timer thread in SMP */ /* * The lblc algorithm is as follows (pseudo code): * * if cachenode[dest_ip] is null then * n, cachenode[dest_ip] <- {weighted least-conn node}; * else * n <- cachenode[dest_ip]; * if (n is dead) OR * (n.conns>n.weight AND * there is a node m with m.conns<m.weight/2) then * n, cachenode[dest_ip] <- {weighted least-conn node}; * * return n; * * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing * me to write this module. */ #define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/jiffies.h> #include <linux/hash.h> /* for sysctl */ #include <linux/fs.h> #include <linux/sysctl.h> #include <net/ip_vs.h> /* * It is for garbage collection of stale IPVS lblc entries, * when the table is full. 
*/
/* Period of the garbage-collection timer (see ip_vs_lblc_check_expire). */
#define CHECK_EXPIRE_INTERVAL   (60*HZ)
/* Idle time after which an entry may be collected by the partial check. */
#define ENTRY_TIMEOUT           (6*60*HZ)

/* Expiration used by the full check when CONFIG_SYSCTL is not set. */
#define DEFAULT_EXPIRATION	(24*60*60*HZ)

/*
 *    It is for full expiration check.
 *    When there is no partial expiration check (garbage collection)
 *    in a half hour, do a full expiration check to collect stale
 *    entries that haven't been touched for a day.
 */
#define COUNT_FOR_FULL_EXPIRATION   30


/*
 *     for IPVS lblc entry hash table
 */
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS      10
#endif
#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)


/*
 *      IPVS lblc entry represents an association between destination
 *      IP address and its destination server
 */
struct ip_vs_lblc_entry {
	struct hlist_node	list;		/* link in a table bucket */
	int			af;		/* address family */
	union nf_inet_addr      addr;           /* destination IP address */
	struct ip_vs_dest	*dest;          /* real server (cache) */
	unsigned long           lastuse;        /* last used time */
	struct rcu_head		rcu_head;	/* for deferred freeing */
};


/*
 *      IPVS lblc hash table
 */
struct ip_vs_lblc_table {
	struct rcu_head		rcu_head;	/* for kfree_rcu() of the table */
	struct hlist_head	bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
	struct timer_list       periodic_timer; /* collect stale entries */
	struct ip_vs_service	*svc;		/* pointer back to service */
	atomic_t                entries;        /* number of entries */
	int                     max_size;       /* maximum size of entries */
	int                     rover;          /* rover for expire check */
	int                     counter;        /* counter for no expire */
	bool			dead;		/* set once the table is flushed */
};


/*
 *      IPVS LBLC sysctl table
 */
#ifdef CONFIG_SYSCTL
static struct ctl_table vs_vars_table[] = {
	{
		.procname	= "lblc_expiration",
		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
};
#endif

/* RCU callback: release the entry's destination reference, then the entry. */
static void ip_vs_lblc_rcu_free(struct rcu_head *head)
{
	struct ip_vs_lblc_entry *en = container_of(head,
						   struct ip_vs_lblc_entry,
						   rcu_head);

	ip_vs_dest_put_and_free(en->dest);
	kfree(en);
}

/* Unlink @en from its bucket and free it after an RCU grace period.
 * Does not touch the table's entry counter; callers adjust it.
 */
static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en)
{
	hlist_del_rcu(&en->list);
	call_rcu(&en->rcu_head,
ip_vs_lblc_rcu_free); } /* * Returns hash value for IPVS LBLC entry */ static inline unsigned int ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS); } /* * Hash an entry in the ip_vs_lblc_table. * returns bool success. */ static void ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) { unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } /* Get ip_vs_lblc_entry associated with supplied parameters. */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, const union nf_inet_addr *addr) { unsigned int hash = ip_vs_lblc_hashkey(af, addr); struct ip_vs_lblc_entry *en; hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; return NULL; } /* * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP * address to a server. Called under spin lock. */ static inline struct ip_vs_lblc_entry * ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, u16 af, struct ip_vs_dest *dest) { struct ip_vs_lblc_entry *en; en = ip_vs_lblc_get(af, tbl, daddr); if (en) { if (en->dest == dest) return en; ip_vs_lblc_del(en); } en = kmalloc_obj(*en, GFP_ATOMIC); if (!en) return NULL; en->af = af; ip_vs_addr_copy(af, &en->addr, daddr); en->lastuse = jiffies; ip_vs_dest_hold(dest); en->dest = dest; ip_vs_lblc_hash(tbl, en); return en; } /* * Flush all the entries of the specified table. 
*/ static void ip_vs_lblc_flush(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; struct ip_vs_lblc_entry *en; struct hlist_node *next; int i; spin_lock_bh(&svc->sched_lock); tbl->dead = true; for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblc_del(en); atomic_dec(&tbl->entries); } } spin_unlock_bh(&svc->sched_lock); } static int sysctl_lblc_expiration(struct ip_vs_service *svc) { #ifdef CONFIG_SYSCTL return svc->ipvs->sysctl_lblc_expiration; #else return DEFAULT_EXPIRATION; #endif } static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) { struct ip_vs_lblc_table *tbl = svc->sched_data; struct ip_vs_lblc_entry *en; struct hlist_node *next; unsigned long now = jiffies; int i, j; for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; spin_lock(&svc->sched_lock); hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse + sysctl_lblc_expiration(svc))) continue; ip_vs_lblc_del(en); atomic_dec(&tbl->entries); } spin_unlock(&svc->sched_lock); } tbl->rover = j; } /* * Periodical timer handler for IPVS lblc table * It is used to collect stale entries when the number of entries * exceeds the maximum size of the table. * * Fixme: we probably need more complicated algorithm to collect * entries that have not been used for a long time even * if the number of entries doesn't exceed the maximum size * of the table. * The full expiration check is for this purpose now. 
*/
static void ip_vs_lblc_check_expire(struct timer_list *t)
{
	struct ip_vs_lblc_table *tbl = timer_container_of(tbl, t,
							  periodic_timer);
	struct ip_vs_service *svc = tbl->svc;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblc_full_check(svc);
		tbl->counter = 1;
		goto out;
	}

	/* Table is within its size limit: just count this quiet interval. */
	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}

	/* Aim to remove 4/3 of the excess, capped at half of max_size. */
	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		spin_lock(&svc->sched_lock);
		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
			/* Keep entries used within ENTRY_TIMEOUT. */
			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
				continue;

			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		spin_unlock(&svc->sched_lock);
		/* The goal is only checked once a whole bucket is done. */
		if (goal <= 0)
			break;
	}
	/* Remember where to resume on the next run. */
	tbl->rover = j;

  out:
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
}


/* Scheduler init: allocate and initialise the per-service LBLC table and
 * arm its garbage-collection timer. Returns 0, or -ENOMEM when the table
 * cannot be allocated.
 */
static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblc_table *tbl;

	/*
	 *    Allocate the ip_vs_lblc_table for this service
	 */
	tbl = kmalloc_obj(*tbl);
	if (tbl == NULL)
		return -ENOMEM;

	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for "
		  "current service\n", sizeof(*tbl));

	/*
	 *    Initialize the hash buckets
	 */
	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
		INIT_HLIST_HEAD(&tbl->bucket[i]);
	}
	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;
	tbl->dead = false;
	tbl->svc = svc;
	atomic_set(&tbl->entries, 0);

	/*
	 *    Hook periodic timer for garbage collection
	 */
	timer_setup(&tbl->periodic_timer, ip_vs_lblc_check_expire, 0);
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

	return 0;
}


/* Scheduler teardown: stop the timer, flush all entries and free the table
 * after an RCU grace period.
 */
static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;

	/* remove periodic timer */
timer_shutdown_sync(&tbl->periodic_timer); /* got to clean up table entries here */ ip_vs_lblc_flush(svc); /* release the table itself */ kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n", sizeof(*tbl)); } static inline struct ip_vs_dest * __ip_vs_lblc_schedule(struct ip_vs_service *svc) { struct ip_vs_dest *dest, *least; int loh, doh; /* * We use the following formula to estimate the load: * (dest overhead) / dest->weight * * Remember -- no floats in kernel mode!!! * The comparison of h1*w2 > h2*w1 is equivalent to that of * h1/w1 > h2/w2 * if every weight is larger than zero. * * The server with weight=0 is quiesced and will not receive any * new connection. */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; if (atomic_read(&dest->weight) > 0) { least = dest; loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } return NULL; /* * Find the destination with the least load. */ nextstage: list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); if ((__s64)loh * atomic_read(&dest->weight) > (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } } IP_VS_DBG_BUF(6, "LBLC: server %s:%d " "activeconns %d refcnt %d weight %d overhead %d\n", IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; } /* * If this destination server is overloaded and there is a less loaded * server, then return true. 
*/ static inline int is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) { if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; } } } return 0; } /* * Locality-Based (weighted) Least-Connection scheduling */ static struct ip_vs_dest * ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_lblc_table *tbl = svc->sched_data; struct ip_vs_dest *dest = NULL; struct ip_vs_lblc_entry *en; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr); if (en) { /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; /* * If the destination is not available, i.e. it's in the trash, * we must ignore it, as it may be removed from under our feet, * if someone drops our reference count. Our caller only makes * sure that destinations, that are not in the trash, are not * moved to the trash, while we are scheduling. But anyone can * free up entries from the trash at any time. 
*/ dest = en->dest; if ((dest->flags & IP_VS_DEST_F_AVAILABLE) && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) goto out; } /* No cache entry or it is invalid, time to schedule */ dest = __ip_vs_lblc_schedule(svc); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } /* If we fail to create a cache entry, we'll just use the valid dest */ spin_lock_bh(&svc->sched_lock); if (!tbl->dead) ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest); spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } /* * IPVS LBLC Scheduler structure */ static struct ip_vs_scheduler ip_vs_lblc_scheduler = { .name = "lblc", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), .init_service = ip_vs_lblc_init_svc, .done_service = ip_vs_lblc_done_svc, .schedule = ip_vs_lblc_schedule, }; /* * per netns init. 
*/ #ifdef CONFIG_SYSCTL static int __net_init __ip_vs_lblc_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); size_t vars_table_size = ARRAY_SIZE(vs_vars_table); if (!ipvs) return -ENOENT; if (!net_eq(net, &init_net)) { ipvs->lblc_ctl_table = kmemdup(vs_vars_table, sizeof(vs_vars_table), GFP_KERNEL); if (ipvs->lblc_ctl_table == NULL) return -ENOMEM; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) vars_table_size = 0; } else ipvs->lblc_ctl_table = vs_vars_table; ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; ipvs->lblc_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs", ipvs->lblc_ctl_table, vars_table_size); if (!ipvs->lblc_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblc_ctl_table); return -ENOMEM; } return 0; } static void __net_exit __ip_vs_lblc_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); unregister_net_sysctl_table(ipvs->lblc_ctl_header); if (!net_eq(net, &init_net)) kfree(ipvs->lblc_ctl_table); } #else static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; } static void __net_exit __ip_vs_lblc_exit(struct net *net) { } #endif static struct pernet_operations ip_vs_lblc_ops = { .init = __ip_vs_lblc_init, .exit = __ip_vs_lblc_exit, }; static int __init ip_vs_lblc_init(void) { int ret; ret = register_pernet_subsys(&ip_vs_lblc_ops); if (ret) return ret; ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); if (ret) unregister_pernet_subsys(&ip_vs_lblc_ops); return ret; } static void __exit ip_vs_lblc_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); unregister_pernet_subsys(&ip_vs_lblc_ops); rcu_barrier(); } module_init(ip_vs_lblc_init); module_exit(ip_vs_lblc_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs locality-based least-connection scheduler");
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2017 Netronome Systems, Inc. * Copyright (C) 2019 Mellanox Technologies. All rights reserved */ #include <linux/completion.h> #include <linux/device.h> #include <linux/idr.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/sysfs.h> #include "netdevsim.h" static DEFINE_IDA(nsim_bus_dev_ids); static LIST_HEAD(nsim_bus_dev_list); static DEFINE_MUTEX(nsim_bus_dev_list_lock); static bool nsim_bus_enable; static refcount_t nsim_bus_devs; /* Including the bus itself. */ static DECLARE_COMPLETION(nsim_bus_devs_released); static struct nsim_bus_dev *to_nsim_bus_dev(struct device *dev) { return container_of(dev, struct nsim_bus_dev, dev); } static ssize_t nsim_bus_dev_numvfs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int num_vfs; int ret; ret = kstrtouint(buf, 0, &num_vfs); if (ret) return ret; device_lock(dev); ret = -ENOENT; if (dev_get_drvdata(dev)) ret = nsim_drv_configure_vfs(nsim_bus_dev, num_vfs); device_unlock(dev); return ret ? ret : count; } static ssize_t nsim_bus_dev_numvfs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return sprintf(buf, "%u\n", nsim_bus_dev->num_vfs); } static struct device_attribute nsim_bus_dev_numvfs_attr = __ATTR(sriov_numvfs, 0664, nsim_bus_dev_numvfs_show, nsim_bus_dev_numvfs_store); static ssize_t new_port_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); u8 eth_addr[ETH_ALEN] = {}; unsigned int port_index; bool addr_set = false; int ret; /* Prevent to use nsim_bus_dev before initialization. 
*/ if (!smp_load_acquire(&nsim_bus_dev->init)) return -EBUSY; ret = sscanf(buf, "%u %hhx:%hhx:%hhx:%hhx:%hhx:%hhx", &port_index, &eth_addr[0], &eth_addr[1], &eth_addr[2], &eth_addr[3], &eth_addr[4], &eth_addr[5]); switch (ret) { case 7: if (!is_valid_ether_addr(eth_addr)) { pr_err("The supplied perm_addr is not a valid MAC address\n"); return -EINVAL; } addr_set = true; fallthrough; case 1: break; default: pr_err("Format for adding new port is \"id [perm_addr]\" (uint MAC).\n"); return -EINVAL; } ret = nsim_drv_port_add(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index, addr_set ? eth_addr : NULL); return ret ? ret : count; } static struct device_attribute nsim_bus_dev_new_port_attr = __ATTR_WO(new_port); static ssize_t del_port_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int port_index; int ret; /* Prevent to use nsim_bus_dev before initialization. */ if (!smp_load_acquire(&nsim_bus_dev->init)) return -EBUSY; ret = kstrtouint(buf, 0, &port_index); if (ret) return ret; ret = nsim_drv_port_del(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index); return ret ? 
ret : count; } static struct device_attribute nsim_bus_dev_del_port_attr = __ATTR_WO(del_port); static struct attribute *nsim_bus_dev_attrs[] = { &nsim_bus_dev_numvfs_attr.attr, &nsim_bus_dev_new_port_attr.attr, &nsim_bus_dev_del_port_attr.attr, NULL, }; static const struct attribute_group nsim_bus_dev_attr_group = { .attrs = nsim_bus_dev_attrs, }; static const struct attribute_group *nsim_bus_dev_attr_groups[] = { &nsim_bus_dev_attr_group, NULL, }; static void nsim_bus_dev_release(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev; nsim_bus_dev = container_of(dev, struct nsim_bus_dev, dev); kfree(nsim_bus_dev); if (refcount_dec_and_test(&nsim_bus_devs)) complete(&nsim_bus_devs_released); } static const struct device_type nsim_bus_dev_type = { .groups = nsim_bus_dev_attr_groups, .release = nsim_bus_dev_release, }; static struct nsim_bus_dev * nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues); static ssize_t new_device_store(const struct bus_type *bus, const char *buf, size_t count) { unsigned int id, port_count, num_queues; struct nsim_bus_dev *nsim_bus_dev; int err; err = sscanf(buf, "%u %u %u", &id, &port_count, &num_queues); switch (err) { case 1: port_count = 1; fallthrough; case 2: num_queues = 1; fallthrough; case 3: if (id > INT_MAX) { pr_err("Value of \"id\" is too big.\n"); return -EINVAL; } break; default: pr_err("Format for adding new device is \"id port_count num_queues\" (uint uint unit).\n"); return -EINVAL; } mutex_lock(&nsim_bus_dev_list_lock); /* Prevent to use resource before initialization. 
*/ if (!smp_load_acquire(&nsim_bus_enable)) { err = -EBUSY; goto err; } nsim_bus_dev = nsim_bus_dev_new(id, port_count, num_queues); if (IS_ERR(nsim_bus_dev)) { err = PTR_ERR(nsim_bus_dev); goto err; } refcount_inc(&nsim_bus_devs); /* Allow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, true); list_add_tail(&nsim_bus_dev->list, &nsim_bus_dev_list); mutex_unlock(&nsim_bus_dev_list_lock); return count; err: mutex_unlock(&nsim_bus_dev_list_lock); return err; } static BUS_ATTR_WO(new_device); static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev); static ssize_t del_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev, *tmp; unsigned int id; int err; err = sscanf(buf, "%u", &id); switch (err) { case 1: if (id > INT_MAX) { pr_err("Value of \"id\" is too big.\n"); return -EINVAL; } break; default: pr_err("Format for deleting device is \"id\" (uint).\n"); return -EINVAL; } err = -ENOENT; mutex_lock(&nsim_bus_dev_list_lock); /* Prevent to use resource before initialization. */ if (!smp_load_acquire(&nsim_bus_enable)) { mutex_unlock(&nsim_bus_dev_list_lock); return -EBUSY; } list_for_each_entry_safe(nsim_bus_dev, tmp, &nsim_bus_dev_list, list) { if (nsim_bus_dev->dev.id != id) continue; list_del(&nsim_bus_dev->list); nsim_bus_dev_del(nsim_bus_dev); err = 0; break; } mutex_unlock(&nsim_bus_dev_list_lock); return !err ? 
count : err; } static BUS_ATTR_WO(del_device); static ssize_t link_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct netdevsim *nsim_a, *nsim_b, *peer; struct net_device *dev_a, *dev_b; unsigned int ifidx_a, ifidx_b; int netnsfd_a, netnsfd_b, err; struct net *ns_a, *ns_b; err = sscanf(buf, "%d:%u %d:%u", &netnsfd_a, &ifidx_a, &netnsfd_b, &ifidx_b); if (err != 4) { pr_err("Format for linking two devices is \"netnsfd_a:ifidx_a netnsfd_b:ifidx_b\" (int uint int uint).\n"); return -EINVAL; } ns_a = get_net_ns_by_fd(netnsfd_a); if (IS_ERR(ns_a)) { pr_err("Could not find netns with fd: %d\n", netnsfd_a); return -EINVAL; } ns_b = get_net_ns_by_fd(netnsfd_b); if (IS_ERR(ns_b)) { pr_err("Could not find netns with fd: %d\n", netnsfd_b); put_net(ns_a); return -EINVAL; } err = -EINVAL; rtnl_lock(); dev_a = __dev_get_by_index(ns_a, ifidx_a); if (!dev_a) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx_a, netnsfd_a); goto out_err; } if (!netdev_is_nsim(dev_a)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx_a, netnsfd_a); goto out_err; } dev_b = __dev_get_by_index(ns_b, ifidx_b); if (!dev_b) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx_b, netnsfd_b); goto out_err; } if (!netdev_is_nsim(dev_b)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx_b, netnsfd_b); goto out_err; } if (dev_a == dev_b) { pr_err("Cannot link a netdevsim to itself\n"); goto out_err; } err = -EBUSY; nsim_a = netdev_priv(dev_a); peer = rtnl_dereference(nsim_a->peer); if (peer) { pr_err("Netdevsim %d:%u is already linked\n", netnsfd_a, ifidx_a); goto out_err; } nsim_b = netdev_priv(dev_b); peer = rtnl_dereference(nsim_b->peer); if (peer) { pr_err("Netdevsim %d:%u is already linked\n", netnsfd_b, ifidx_b); goto out_err; } err = 0; rcu_assign_pointer(nsim_a->peer, nsim_b); rcu_assign_pointer(nsim_b->peer, nsim_a); if (netif_running(dev_a) && netif_running(dev_b)) { 
netif_carrier_on(dev_a); netif_carrier_on(dev_b); } out_err: put_net(ns_b); put_net(ns_a); rtnl_unlock(); return !err ? count : err; } static BUS_ATTR_WO(link_device); static ssize_t unlink_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct netdevsim *nsim, *peer; struct net_device *dev; unsigned int ifidx; int netnsfd, err; struct net *ns; err = sscanf(buf, "%u:%u", &netnsfd, &ifidx); if (err != 2) { pr_err("Format for unlinking a device is \"netnsfd:ifidx\" (int uint).\n"); return -EINVAL; } ns = get_net_ns_by_fd(netnsfd); if (IS_ERR(ns)) { pr_err("Could not find netns with fd: %d\n", netnsfd); return -EINVAL; } err = -EINVAL; rtnl_lock(); dev = __dev_get_by_index(ns, ifidx); if (!dev) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx, netnsfd); goto out_put_netns; } if (!netdev_is_nsim(dev)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx, netnsfd); goto out_put_netns; } nsim = netdev_priv(dev); peer = rtnl_dereference(nsim->peer); if (!peer) goto out_put_netns; netif_carrier_off(dev); netif_carrier_off(peer->netdev); err = 0; RCU_INIT_POINTER(nsim->peer, NULL); RCU_INIT_POINTER(peer->peer, NULL); synchronize_net(); netif_tx_wake_all_queues(dev); netif_tx_wake_all_queues(peer->netdev); out_put_netns: put_net(ns); rtnl_unlock(); return !err ? 
count : err; } static BUS_ATTR_WO(unlink_device); static struct attribute *nsim_bus_attrs[] = { &bus_attr_new_device.attr, &bus_attr_del_device.attr, &bus_attr_link_device.attr, &bus_attr_unlink_device.attr, NULL }; ATTRIBUTE_GROUPS(nsim_bus); static int nsim_bus_probe(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return nsim_drv_probe(nsim_bus_dev); } static void nsim_bus_remove(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); nsim_drv_remove(nsim_bus_dev); } static int nsim_num_vf(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return nsim_bus_dev->num_vfs; } static const struct bus_type nsim_bus = { .name = DRV_NAME, .dev_name = DRV_NAME, .bus_groups = nsim_bus_groups, .probe = nsim_bus_probe, .remove = nsim_bus_remove, .num_vf = nsim_num_vf, }; #define NSIM_BUS_DEV_MAX_VFS 4 static struct nsim_bus_dev * nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues) { struct nsim_bus_dev *nsim_bus_dev; int err; nsim_bus_dev = kzalloc_obj(*nsim_bus_dev); if (!nsim_bus_dev) return ERR_PTR(-ENOMEM); err = ida_alloc_range(&nsim_bus_dev_ids, id, id, GFP_KERNEL); if (err < 0) goto err_nsim_bus_dev_free; nsim_bus_dev->dev.id = err; nsim_bus_dev->dev.bus = &nsim_bus; nsim_bus_dev->dev.type = &nsim_bus_dev_type; nsim_bus_dev->port_count = port_count; nsim_bus_dev->num_queues = num_queues; nsim_bus_dev->initial_net = current->nsproxy->net_ns; nsim_bus_dev->max_vfs = NSIM_BUS_DEV_MAX_VFS; /* Disallow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, false); err = device_register(&nsim_bus_dev->dev); if (err) goto err_nsim_bus_dev_id_free; return nsim_bus_dev; err_nsim_bus_dev_id_free: ida_free(&nsim_bus_dev_ids, nsim_bus_dev->dev.id); put_device(&nsim_bus_dev->dev); nsim_bus_dev = NULL; err_nsim_bus_dev_free: kfree(nsim_bus_dev); return ERR_PTR(err); } static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev) { /* Disallow using 
nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, false); ida_free(&nsim_bus_dev_ids, nsim_bus_dev->dev.id); device_unregister(&nsim_bus_dev->dev); } static struct device_driver nsim_driver = { .name = DRV_NAME, .bus = &nsim_bus, .owner = THIS_MODULE, }; int nsim_bus_init(void) { int err; err = bus_register(&nsim_bus); if (err) return err; err = driver_register(&nsim_driver); if (err) goto err_bus_unregister; refcount_set(&nsim_bus_devs, 1); /* Allow using resources */ smp_store_release(&nsim_bus_enable, true); return 0; err_bus_unregister: bus_unregister(&nsim_bus); return err; } void nsim_bus_exit(void) { struct nsim_bus_dev *nsim_bus_dev, *tmp; /* Disallow using resources */ smp_store_release(&nsim_bus_enable, false); if (refcount_dec_and_test(&nsim_bus_devs)) complete(&nsim_bus_devs_released); mutex_lock(&nsim_bus_dev_list_lock); list_for_each_entry_safe(nsim_bus_dev, tmp, &nsim_bus_dev_list, list) { list_del(&nsim_bus_dev->list); nsim_bus_dev_del(nsim_bus_dev); } mutex_unlock(&nsim_bus_dev_list_lock); wait_for_completion(&nsim_bus_devs_released); driver_unregister(&nsim_driver); bus_unregister(&nsim_bus); }
2402 2402 514 125 125 125 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 
521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 
1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 
1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 // SPDX-License-Identifier: GPL-2.0-or-later /* * vrf.c: device driver to encapsulate a VRF space * * Copyright (c) 2015 Cumulus Networks. All rights reserved. 
* Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> * * Based on dummy, team and ipvlan drivers */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ip.h> #include <linux/init.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <linux/u64_stats_sync.h> #include <linux/hashtable.h> #include <linux/spinlock_types.h> #include <linux/inetdevice.h> #include <net/arp.h> #include <net/flow.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #include <net/fib_rules.h> #include <net/netdev_lock.h> #include <net/sch_generic.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack.h> #define DRV_NAME "vrf" #define DRV_VERSION "1.1" #define FIB_RULE_PREF 1000 /* default preference for FIB rules */ #define HT_MAP_BITS 4 #define HASH_INITVAL ((u32)0xcafef00d) struct vrf_map { DECLARE_HASHTABLE(ht, HT_MAP_BITS); spinlock_t vmap_lock; /* shared_tables: * count how many distinct tables do not comply with the strict mode * requirement. * shared_tables value must be 0 in order to enable the strict mode. * * example of the evolution of shared_tables: * | time * add vrf0 --> table 100 shared_tables = 0 | t0 * add vrf1 --> table 101 shared_tables = 0 | t1 * add vrf2 --> table 100 shared_tables = 1 | t2 * add vrf3 --> table 100 shared_tables = 1 | t3 * add vrf4 --> table 101 shared_tables = 2 v t4 * * shared_tables is a "step function" (or "staircase function") * and it is increased by one when the second vrf is associated to a * table. * * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1. * * at t3, another dev (vrf3) is bound to the same table 100 but the * value of shared_tables is still 1. 
* This means that no matter how many new vrfs will register on the * table 100, the shared_tables will not increase (considering only * table 100). * * at t4, vrf4 is bound to table 101, and shared_tables = 2. * * Looking at the value of shared_tables we can immediately know if * the strict_mode can or cannot be enforced. Indeed, strict_mode * can be enforced iff shared_tables = 0. * * Conversely, shared_tables is decreased when a vrf is de-associated * from a table with exactly two associated vrfs. */ u32 shared_tables; bool strict_mode; }; struct vrf_map_elem { struct hlist_node hnode; struct list_head vrf_list; /* VRFs registered to this table */ u32 table_id; int users; int ifindex; }; static unsigned int vrf_net_id; /* per netns vrf data */ struct netns_vrf { /* protected by rtnl lock */ bool add_fib_rules; struct vrf_map vmap; struct ctl_table_header *ctl_hdr; }; struct net_vrf { struct rtable __rcu *rth; struct rt6_info __rcu *rt6; #if IS_ENABLED(CONFIG_IPV6) struct fib6_table *fib6_table; #endif u32 tb_id; struct list_head me_list; /* entry in vrf_map_elem */ int ifindex; }; static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) { vrf_dev->stats.tx_errors++; kfree_skb(skb); } static struct vrf_map *netns_vrf_map(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); return &nn_vrf->vmap; } static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev) { return netns_vrf_map(dev_net(dev)); } static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me) { struct list_head *me_head = &me->vrf_list; struct net_vrf *vrf; if (list_empty(me_head)) return -ENODEV; vrf = list_first_entry(me_head, struct net_vrf, me_list); return vrf->ifindex; } static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags) { struct vrf_map_elem *me; me = kmalloc_obj(*me, flags); if (!me) return NULL; return me; } static void vrf_map_elem_free(struct vrf_map_elem *me) { kfree(me); } static void vrf_map_elem_init(struct vrf_map_elem *me, 
int table_id, int ifindex, int users) { me->table_id = table_id; me->ifindex = ifindex; me->users = users; INIT_LIST_HEAD(&me->vrf_list); } static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap, u32 table_id) { struct vrf_map_elem *me; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_for_each_possible(vmap->ht, me, hnode, key) { if (me->table_id == table_id) return me; } return NULL; } static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me) { u32 table_id = me->table_id; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_add(vmap->ht, &me->hnode, key); } static void vrf_map_del_elem(struct vrf_map_elem *me) { hash_del(&me->hnode); } static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock) { spin_lock(&vmap->vmap_lock); } static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock) { spin_unlock(&vmap->vmap_lock); } /* called with rtnl lock held */ static int vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); struct vrf_map_elem *new_me, *me; u32 table_id = vrf->tb_id; bool free_new_me = false; int users; int res; /* we pre-allocate elements used in the spin-locked section (so that we * keep the spinlock as short as possible). */ new_me = vrf_map_elem_alloc(GFP_KERNEL); if (!new_me) return -ENOMEM; vrf_map_elem_init(new_me, table_id, dev->ifindex, 0); vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) { me = new_me; vrf_map_add_elem(vmap, me); goto link_vrf; } /* we already have an entry in the vrf_map, so it means there is (at * least) a vrf registered on the specific table. 
*/ free_new_me = true; if (vmap->strict_mode) { /* vrfs cannot share the same table */ NL_SET_ERR_MSG(extack, "Table is used by another VRF"); res = -EBUSY; goto unlock; } link_vrf: users = ++me->users; if (users == 2) ++vmap->shared_tables; list_add(&vrf->me_list, &me->vrf_list); res = 0; unlock: vrf_map_unlock(vmap); /* clean-up, if needed */ if (free_new_me) vrf_map_elem_free(new_me); return res; } /* called with rtnl lock held */ static void vrf_map_unregister_dev(struct net_device *dev) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); u32 table_id = vrf->tb_id; struct vrf_map_elem *me; int users; vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) goto unlock; list_del(&vrf->me_list); users = --me->users; if (users == 1) { --vmap->shared_tables; } else if (users == 0) { vrf_map_del_elem(me); /* no one will refer to this element anymore */ vrf_map_elem_free(me); } unlock: vrf_map_unlock(vmap); } /* return the vrf device index associated with the table_id */ static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id) { struct vrf_map *vmap = netns_vrf_map(net); struct vrf_map_elem *me; int ifindex; vrf_map_lock(vmap); if (!vmap->strict_mode) { ifindex = -EPERM; goto unlock; } me = vrf_map_lookup_elem(vmap, table_id); if (!me) { ifindex = -ENODEV; goto unlock; } ifindex = vrf_map_elem_get_vrf_ifindex(me); unlock: vrf_map_unlock(vmap); return ifindex; } /* by default VRF devices do not have a qdisc and are expected * to be created with only a single queue. */ static bool qdisc_tx_is_default(const struct net_device *dev) { struct netdev_queue *txq; if (dev->num_tx_queues > 1) return false; txq = netdev_get_tx_queue(dev, 0); return qdisc_txq_has_no_queue(txq); } /* Local traffic destined to local address. Reinsert the packet to rx * path, similar to loopback handling. 
*/ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst) { unsigned int len = skb->len; skb_orphan(skb); skb_dst_set(skb, dst); /* set pkt_type to avoid skb hitting packet taps twice - * once on Tx and again in Rx processing */ skb->pkt_type = PACKET_LOOPBACK; skb->protocol = eth_type_trans(skb, dev); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) dev_dstats_rx_add(dev, len); else dev_dstats_rx_dropped(dev); return NETDEV_TX_OK; } static void vrf_nf_set_untracked(struct sk_buff *skb) { if (skb_get_nfct(skb) == 0) nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } static void vrf_nf_reset_ct(struct sk_buff *skb) { if (skb_get_nfct(skb) == IP_CT_UNTRACKED) nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) static int vrf_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { const struct ipv6hdr *iph; struct net *net = dev_net(skb->dev); struct flowi6 fl6; int ret = NET_XMIT_DROP; struct dst_entry *dst; struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) goto err; iph = ipv6_hdr(skb); memset(&fl6, 0, sizeof(fl6)); /* needed to match OIF rule */ fl6.flowi6_l3mdev = dev->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = iph->nexthdr; dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst) || dst == dst_null) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. 
*/ if (dst->dev == dev) return vrf_local_xmit(skb, dev, dst); skb_dst_set(skb, dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; return ret; err: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #else static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #endif /* based on ip_local_out; can't use it b/c the dst is switched pointing to us */ static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { struct iphdr *ip4h; int ret = NET_XMIT_DROP; struct flowi4 fl4; struct net *net = dev_net(vrf_dev); struct rtable *rt; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) goto err; ip4h = ip_hdr(skb); memset(&fl4, 0, sizeof(fl4)); /* needed to match OIF rule */ fl4.flowi4_l3mdev = vrf_dev->ifindex; fl4.flowi4_iif = LOOPBACK_IFINDEX; fl4.flowi4_dscp = ip4h_dscp(ip4h); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = ip4h->protocol; fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. 
*/ if (rt->dst.dev == vrf_dev) return vrf_local_xmit(skb, vrf_dev, &rt->dst); skb_dst_set(skb, &rt->dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); if (!ip4h->saddr) { ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, RT_SCOPE_LINK); } memset(IPCB(skb), 0, sizeof(*IPCB(skb))); ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; out: return ret; err: vrf_tx_error(vrf_dev, skb); goto out; } static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) { switch (skb->protocol) { case htons(ETH_P_IP): return vrf_process_v4_outbound(skb, dev); case htons(ETH_P_IPV6): return vrf_process_v6_outbound(skb, dev); default: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } } static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned int len = skb->len; netdev_tx_t ret; ret = is_ip_tx_frame(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) dev_dstats_tx_add(dev, len); else dev_dstats_tx_dropped(dev); return ret; } static void vrf_finish_direct(struct sk_buff *skb) { struct net_device *vrf_dev = skb->dev; if (!list_empty(&vrf_dev->ptype_all) && likely(skb_headroom(skb) >= ETH_HLEN)) { struct ethhdr *eth = skb_push(skb, ETH_HLEN); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth_zero_addr(eth->h_dest); eth->h_proto = skb->protocol; rcu_read_lock_bh(); dev_queue_xmit_nit(skb, vrf_dev); rcu_read_unlock_bh(); skb_pull(skb, ETH_HLEN); } vrf_nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) /* modelled after ip6_finish_output2 */ static int vrf_finish_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; const struct in6_addr *nexthop; struct neighbour *neigh; int ret; vrf_nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; rcu_read_lock(); nexthop = 
rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); rcu_read_unlock(); return ret; } rcu_read_unlock(); IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EINVAL; } /* modelled after ip6_output */ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb_dst(skb)->dev, vrf_finish_output6, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rt6_info *rt6; rcu_read_lock(); rt6 = rcu_dereference(vrf->rt6); if (likely(rt6)) { dst = &rt6->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output6_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip6_local_out(net, sk, skb); } static int vrf_output6_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IPV6); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output6_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output6_direct(net, sk, skb); if 
(likely(err == 1)) err = vrf_ip6_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); if (likely(err == 1)) err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert link scope packets */ if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return vrf_ip6_out_direct(vrf_dev, sk, skb); return vrf_ip6_out_redirect(vrf_dev, skb); } /* holding rtnl */ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { struct rt6_info *rt6 = rtnl_dereference(vrf->rt6); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rt6, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rt6) { dst = &rt6->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rt6_create(struct net_device *dev) { int flags = DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); struct rt6_info *rt6; int rc = -ENOMEM; /* IPv6 can be CONFIG enabled and then disabled runtime */ if (!ipv6_mod_enabled()) return 0; vrf->fib6_table = fib6_new_table(net, vrf->tb_id); if (!vrf->fib6_table) goto out; /* create a dst for routing packets out a VRF device */ rt6 = ip6_dst_alloc(net, dev, flags); if (!rt6) goto out; rt6->dst.output = vrf_output6; rcu_assign_pointer(vrf->rt6, rt6); rc = 0; out: return rc; } #else static struct sk_buff *vrf_ip6_out(struct 
net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { return skb; } static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } static int vrf_rt6_create(struct net_device *dev) { return 0; } #endif /* modelled after ip_finish_output2 */ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; vrf_nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) { dev->stats.tx_errors++; return -ENOMEM; } } rcu_read_lock(); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int ret; sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ ret = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock(); return ret; } rcu_read_unlock(); vrf_tx_error(skb->dev, skb); return -EINVAL; } static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, vrf_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. 
*/ static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rtable *rth; rcu_read_lock(); rth = rcu_dereference(vrf->rth); if (likely(rth)) { dst = &rth->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip_local_out(net, sk, skb); } static int vrf_output_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IP); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip_out_direct_finish); if (likely(err == 1)) err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert multicast or local broadcast */ if (ipv4_is_multicast(ip_hdr(skb)->daddr) || ipv4_is_lbcast(ip_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return vrf_ip_out_direct(vrf_dev, sk, skb); return vrf_ip_out_redirect(vrf_dev, skb); } /* called with rcu lock 
held */ static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_out(vrf_dev, sk, skb); case AF_INET6: return vrf_ip6_out(vrf_dev, sk, skb); } return skb; } /* holding rtnl */ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) { struct rtable *rth = rtnl_dereference(vrf->rth); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rth, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rth) { dst = &rth->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rtable_create(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); struct rtable *rth; if (!fib_new_table(dev_net(dev), vrf->tb_id)) return -ENOMEM; /* create a dst for routing packets out through a VRF device */ rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1); if (!rth) return -ENOMEM; rth->dst.output = vrf_output; rcu_assign_pointer(vrf->rth, rth); return 0; } /**************************** device handling ********************/ /* cycle interface to flush neighbor cache and move routes across tables */ static void cycle_netdev(struct net_device *dev, struct netlink_ext_ack *extack) { unsigned int flags = dev->flags; int ret; if (!netif_running(dev)) return; ret = dev_change_flags(dev, flags & ~IFF_UP, extack); if (ret >= 0) ret = dev_change_flags(dev, flags, extack); if (ret < 0) { netdev_err(dev, "Failed to cycle device %s; route tables might be wrong!\n", dev->name); } } static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { int ret; /* do not allow loopback device to be enslaved to a VRF. * The vrf device acts as the loopback for the vrf. 
*/ if (port_dev == dev_net(dev)->loopback_dev) { NL_SET_ERR_MSG(extack, "Can not enslave loopback device to a VRF"); return -EOPNOTSUPP; } port_dev->priv_flags |= IFF_L3MDEV_SLAVE; ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); if (ret < 0) goto err; cycle_netdev(port_dev, extack); return 0; err: port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; return ret; } static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { if (netif_is_l3_master(port_dev)) { NL_SET_ERR_MSG(extack, "Can not enslave an L3 master device to a VRF"); return -EINVAL; } if (netif_is_l3_slave(port_dev)) return -EINVAL; return do_vrf_add_slave(dev, port_dev, extack); } /* inverse of do_vrf_add_slave */ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { netdev_upper_dev_unlink(port_dev, dev); port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; cycle_netdev(port_dev, NULL); return 0; } static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { return do_vrf_del_slave(dev, port_dev); } static void vrf_dev_uninit(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); vrf_rtable_release(dev, vrf); vrf_rt6_release(dev, vrf); } static int vrf_dev_init(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); /* create the default dst which points back to us */ if (vrf_rtable_create(dev) != 0) goto out_nomem; if (vrf_rt6_create(dev) != 0) goto out_rth; dev->flags = IFF_MASTER | IFF_NOARP; /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; netdev_lockdep_set_classes(dev); return 0; out_rth: vrf_rtable_release(dev, vrf); out_nomem: return -ENOMEM; } static const struct net_device_ops vrf_netdev_ops = { .ndo_init = vrf_dev_init, .ndo_uninit = vrf_dev_uninit, .ndo_start_xmit = vrf_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_add_slave = vrf_add_slave, .ndo_del_slave = vrf_del_slave, }; static u32 vrf_fib_table(const struct 
net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); return vrf->tb_id; } static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook, struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1) skb = NULL; /* kfree_skb(skb) handled by nf code */ return skb; } static int vrf_prepare_mac_header(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto) { struct ethhdr *eth; int err; /* in general, we do not know if there is enough space in the head of * the packet for hosting the mac header. */ err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev)); if (unlikely(err)) /* no space in the skb head */ return -ENOBUFS; __skb_push(skb, ETH_HLEN); eth = (struct ethhdr *)skb->data; skb_reset_mac_header(skb); skb_reset_mac_len(skb); /* we set the ethernet destination and the source addresses to the * address of the VRF device. */ ether_addr_copy(eth->h_dest, vrf_dev->dev_addr); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth->h_proto = htons(proto); /* the destination address of the Ethernet frame corresponds to the * address set on the VRF interface; therefore, the packet is intended * to be processed locally. */ skb->protocol = eth->h_proto; skb->pkt_type = PACKET_HOST; skb_postpush_rcsum(skb, skb->data, ETH_HLEN); skb_pull_inline(skb, ETH_HLEN); return 0; } /* prepare and add the mac header to the packet if it was not set previously. * In this way, packet sniffers such as tcpdump can parse the packet correctly. * If the mac header was already set, the original mac header is left * untouched and the function returns immediately. 
*/ static int vrf_add_mac_header_if_unset(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto, struct net_device *orig_dev) { if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev)) return 0; return vrf_prepare_mac_header(skb, vrf_dev, proto); } #if IS_ENABLED(CONFIG_IPV6) /* neighbor handling is done with actual device; do not want * to flip skb->dev for those ndisc packets. This really fails * for multiple next protocols (e.g., NEXTHDR_HOP). But it is * a start. */ static bool ipv6_ndisc_frame(const struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); bool rc = false; if (iph->nexthdr == NEXTHDR_ICMP) { const struct icmp6hdr *icmph; struct icmp6hdr _icmph; icmph = skb_header_pointer(skb, sizeof(*iph), sizeof(_icmph), &_icmph); if (!icmph) goto out; switch (icmph->icmp6_type) { case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: rc = true; break; } } out: return rc; } static struct rt6_info *vrf_ip6_route_lookup(struct net *net, const struct net_device *dev, struct flowi6 *fl6, int ifindex, const struct sk_buff *skb, int flags) { struct net_vrf *vrf = netdev_priv(dev); return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags); } static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, int ifindex) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct flowi6 fl6 = { .flowi6_iif = ifindex, .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), }; struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; skb_dst_drop(skb); rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb, RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); if (unlikely(!rt6)) return; if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst)) return; skb_dst_set(skb, &rt6->dst); } static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { 
int orig_iif = skb->skb_iif; bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); bool is_ndisc = ipv6_ndisc_frame(skb); /* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. * For non-loopback strict packets, determine the dst using the original * ifindex. */ if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; if (skb->pkt_type == PACKET_LOOPBACK) skb->pkt_type = PACKET_HOST; else vrf_ip6_input_dst(skb, vrf_dev, orig_iif); goto out; } /* if packet is NDISC then keep the ingress interface */ if (!is_ndisc) { struct net_device *orig_dev = skb->dev; dev_dstats_rx_add(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IPV6, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } IP6CB(skb)->flags |= IP6SKB_L3SLAVE; } if (need_strict) vrf_ip6_input_dst(skb, vrf_dev, orig_iif); skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } #else static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { return skb; } #endif static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_device *orig_dev = skb->dev; skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IPCB(skb)->flags |= IPSKB_L3SLAVE; if (ipv4_is_multicast(ip_hdr(skb)->daddr)) goto out; /* loopback traffic; do not push through packet taps again. 
* Reset pkt_type for upper layers to process skb */ if (skb->pkt_type == PACKET_LOOPBACK) { skb->pkt_type = PACKET_HOST; goto out; } dev_dstats_rx_add(vrf_dev, skb->len); if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } /* called with rcu lock held */ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_rcv(vrf_dev, skb); case AF_INET6: return vrf_ip6_rcv(vrf_dev, skb); } return skb; } #if IS_ENABLED(CONFIG_IPV6) /* send to link-local or multicast address via interface enslaved to * VRF device. Force lookup to VRF table without changing flow struct * Note: Caller to this function must hold rcu_read_lock() and no refcnt * is taken on the dst by this function. 
*/ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, struct flowi6 *fl6) { struct net *net = dev_net(dev); int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; struct dst_entry *dst = NULL; struct rt6_info *rt; /* VRF device does not have a link-local address and * sending packets to link-local or mcast addresses over * a VRF device does not make sense */ if (fl6->flowi6_oif == dev->ifindex) { dst = &net->ipv6.ip6_null_entry->dst; return dst; } if (!ipv6_addr_any(&fl6->saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags); if (rt) dst = &rt->dst; return dst; } #endif static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_link_scope_lookup = vrf_link_scope_lookup, #endif }; static void vrf_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); } static const struct ethtool_ops vrf_ethtool_ops = { .get_drvinfo = vrf_get_drvinfo, }; static inline size_t vrf_fib_rule_nl_size(void) { size_t sz; sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)); sz += nla_total_size(sizeof(u8)); /* FRA_L3MDEV */ sz += nla_total_size(sizeof(u32)); /* FRA_PRIORITY */ sz += nla_total_size(sizeof(u8)); /* FRA_PROTOCOL */ return sz; } static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) { struct fib_rule_hdr *frh; struct nlmsghdr *nlh; struct sk_buff *skb; int err; if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) && !ipv6_mod_enabled()) return 0; skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0); if (!nlh) goto nla_put_failure; /* rule only needs to appear once */ nlh->nlmsg_flags |= NLM_F_EXCL; frh = nlmsg_data(nlh); memset(frh, 0, sizeof(*frh)); 
frh->family = family; frh->action = FR_ACT_TO_TBL; if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL)) goto nla_put_failure; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF)) goto nla_put_failure; nlmsg_end(skb, nlh); if (add_it) { err = fib_newrule(dev_net(dev), skb, nlh, NULL, true); if (err == -EEXIST) err = 0; } else { err = fib_delrule(dev_net(dev), skb, nlh, NULL, true); if (err == -ENOENT) err = 0; } nlmsg_free(skb); return err; nla_put_failure: nlmsg_free(skb); return -EMSGSIZE; } static int vrf_add_fib_rules(const struct net_device *dev) { int err; err = vrf_fib_rule(dev, AF_INET, true); if (err < 0) goto out_err; err = vrf_fib_rule(dev, AF_INET6, true); if (err < 0) goto ipv6_err; #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true); if (err < 0) goto ipmr_err; #endif #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); if (err < 0) goto ip6mr_err; #endif return 0; #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) ip6mr_err: vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); #endif #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) ipmr_err: vrf_fib_rule(dev, AF_INET6, false); #endif ipv6_err: vrf_fib_rule(dev, AF_INET, false); out_err: netdev_err(dev, "Failed to add FIB rules.\n"); return err; } static void vrf_setup(struct net_device *dev) { ether_setup(dev); /* Initialize the device structure. */ dev->netdev_ops = &vrf_netdev_ops; dev->l3mdev_ops = &vrf_l3mdev_ops; dev->ethtool_ops = &vrf_ethtool_ops; dev->needs_free_netdev = true; /* Fill in device structure with ethernet-generic values. */ eth_hw_addr_random(dev); /* don't acquire vrf device's netif_tx_lock when transmitting */ dev->lltx = true; /* don't allow vrf devices to change network namespaces. 
*/ dev->netns_immutable = true; /* does not make sense for a VLAN to be added to a vrf device */ dev->features |= NETIF_F_VLAN_CHALLENGED; /* enable offload features */ dev->features |= NETIF_F_GSO_SOFTWARE; dev->features |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC; dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->hw_features = dev->features; dev->hw_enc_features = dev->features; /* default to no qdisc; user can add if desired */ dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_NO_RX_HANDLER; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; /* VRF devices do not care about MTU, but if the MTU is set * too low then the ipv4 and ipv6 protocols are disabled * which breaks networking. */ dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU; dev->mtu = dev->max_mtu; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Invalid hardware address"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG(extack, "Invalid hardware address"); return -EADDRNOTAVAIL; } } return 0; } static void vrf_dellink(struct net_device *dev, struct list_head *head) { struct net_device *port_dev; struct list_head *iter; netdev_for_each_lower_dev(dev, port_dev, iter) vrf_del_slave(dev, port_dev); vrf_map_unregister_dev(dev); unregister_netdevice_queue(dev, head); } static int vrf_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net_vrf *vrf = netdev_priv(dev); struct nlattr **data = params->data; struct netns_vrf *nn_vrf; bool *add_fib_rules; struct net *net; int err; if (!data || !data[IFLA_VRF_TABLE]) { NL_SET_ERR_MSG(extack, "VRF table id is missing"); return -EINVAL; } vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); if (vrf->tb_id == RT_TABLE_UNSPEC) { 
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE],
				    "Invalid VRF table id");
		return -EINVAL;
	}

	dev->priv_flags |= IFF_L3MDEV_MASTER;

	err = register_netdevice(dev);
	if (err)
		goto out;

	/* mapping between table_id and vrf;
	 * note: such binding could not be done in the dev init function
	 * because dev->ifindex id is not available yet.
	 */
	vrf->ifindex = dev->ifindex;

	err = vrf_map_register_dev(dev, extack);
	if (err) {
		unregister_netdevice(dev);
		goto out;
	}

	net = dev_net(dev);
	nn_vrf = net_generic(net, vrf_net_id);

	/* The flag is cleared after the first successful call, so the rules
	 * are added at most once per network namespace.
	 */
	add_fib_rules = &nn_vrf->add_fib_rules;
	if (*add_fib_rules) {
		err = vrf_add_fib_rules(dev);
		if (err) {
			vrf_map_unregister_dev(dev);
			unregister_netdevice(dev);
			goto out;
		}
		*add_fib_rules = false;
	}

out:
	return err;
}

/* ->get_size: room needed by vrf_fillinfo(), i.e. one u32 attribute. */
static size_t vrf_nl_getsize(const struct net_device *dev)
{
	return nla_total_size(sizeof(u32));  /* IFLA_VRF_TABLE */
}

/* ->fill_info: report the VRF's table id as IFLA_VRF_TABLE.
 * Returns the result of nla_put_u32().
 */
static int vrf_fillinfo(struct sk_buff *skb,
			const struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);

	return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
}

/* ->get_slave_size: room needed by vrf_fill_slave_info(), one u32. */
static size_t vrf_get_slave_size(const struct net_device *bond_dev,
				 const struct net_device *slave_dev)
{
	return nla_total_size(sizeof(u32));  /* IFLA_VRF_PORT_TABLE */
}

/* ->fill_slave_info: report the master VRF's table id as
 * IFLA_VRF_PORT_TABLE.  Returns 0, or -EMSGSIZE if the attribute does not
 * fit into @skb.
 */
static int vrf_fill_slave_info(struct sk_buff *skb,
			       const struct net_device *vrf_dev,
			       const struct net_device *slave_dev)
{
	struct net_vrf *vrf = netdev_priv(vrf_dev);

	if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id))
		return -EMSGSIZE;

	return 0;
}

/* Netlink policy: IFLA_VRF_TABLE is a u32. */
static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
	[IFLA_VRF_TABLE] = { .type = NLA_U32 },
};

/* rtnetlink link operations for devices of kind DRV_NAME. */
static struct rtnl_link_ops vrf_link_ops __read_mostly = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct net_vrf),

	.get_size	= vrf_nl_getsize,
	.policy		= vrf_nl_policy,
	.validate	= vrf_validate,
	.fill_info	= vrf_fillinfo,

	.get_slave_size  = vrf_get_slave_size,
	.fill_slave_info = vrf_fill_slave_info,

	.newlink	= vrf_newlink,
	.dellink	= vrf_dellink,
	.setup		= vrf_setup,
	.maxtype	= IFLA_VRF_MAX,
};

/* Netdevice notifier callback (see vrf_notifier_block). */
static int vrf_device_event(struct notifier_block
*unused, unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	/* only care about unregister events to drop slave references */
	if (event == NETDEV_UNREGISTER) {
		struct net_device *vrf_dev;

		/* devices that are not L3 slaves need no handling here */
		if (!netif_is_l3_slave(dev))
			goto out;

		vrf_dev = netdev_master_upper_dev_get(dev);
		vrf_del_slave(vrf_dev, dev);
	}
out:
	/* always NOTIFY_DONE, whatever the event was */
	return NOTIFY_DONE;
}

static struct notifier_block vrf_notifier_block __read_mostly = {
	.notifier_call = vrf_device_event,
};

/* Initialise a vrf_map: its lock, its hash table and strict_mode = false.
 * Always returns 0.
 */
static int vrf_map_init(struct vrf_map *vmap)
{
	spin_lock_init(&vmap->vmap_lock);
	hash_init(vmap->ht);

	vmap->strict_mode = false;

	return 0;
}

#ifdef CONFIG_SYSCTL
/* Read vmap->strict_mode while holding the map lock. */
static bool vrf_strict_mode(struct vrf_map *vmap)
{
	bool strict_mode;

	vrf_map_lock(vmap);
	strict_mode = vmap->strict_mode;
	vrf_map_unlock(vmap);

	return strict_mode;
}

/* Switch strict mode to @new_mode under the map lock.
 *
 * Nothing happens if the mode is unchanged.  Disabling always succeeds;
 * enabling is refused with -EBUSY while vmap->shared_tables is non-zero.
 * Returns 0 on success.
 */
static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode)
{
	bool *cur_mode;
	int res = 0;

	vrf_map_lock(vmap);

	cur_mode = &vmap->strict_mode;
	if (*cur_mode == new_mode)
		goto unlock;

	if (*cur_mode) {
		/* disable strict mode */
		*cur_mode = false;
	} else {
		if (vmap->shared_tables) {
			/* we cannot allow strict_mode because there are some
			 * vrfs that share one or more tables.
			 */
			res = -EBUSY;
			goto unlock;
		}

		/* no tables are shared among vrfs, so we can go back
		 * to 1:1 association between a vrf with its table.
*/
		*cur_mode = true;
	}

unlock:
	vrf_map_unlock(vmap);

	return res;
}

/* proc handler of the "strict_mode" sysctl (see vrf_table).
 *
 * The value is staged in a local int behind a temporary ctl_table limited
 * to the range SYSCTL_ZERO..SYSCTL_ONE.  On read the local is preloaded
 * with the current mode; on a successful write the parsed value is applied
 * with vrf_strict_mode_change().  The netns is taken from table->extra1.
 *
 * Returns the result of proc_dointvec_minmax(), or of
 * vrf_strict_mode_change() for a successful write.
 */
static int vrf_shared_table_handler(const struct ctl_table *table, int write,
				    void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)table->extra1;
	struct vrf_map *vmap = netns_vrf_map(net);
	int proc_strict_mode = 0;
	struct ctl_table tmp = {
		.procname	= table->procname,
		.data		= &proc_strict_mode,
		.maxlen		= sizeof(int),
		.mode		= table->mode,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	};
	int ret;

	if (!write)
		proc_strict_mode = vrf_strict_mode(vmap);

	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);

	if (write && ret == 0)
		ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode);

	return ret;
}

/* Template sysctl table; duplicated per netns by vrf_netns_init_sysctl(). */
static const struct ctl_table vrf_table[] = {
	{
		.procname	= "strict_mode",
		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= vrf_shared_table_handler,
		/* set by the vrf_netns_init */
		.extra1		= NULL,
	},
};

/* Register the "net/vrf" sysctl table for @net.
 *
 * A private copy of vrf_table is made so that extra1 can carry @net; the
 * copy is freed again if registration fails.  Returns 0 or -ENOMEM.
 */
static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
{
	struct ctl_table *table;

	table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	/* init the extra1 parameter with the reference to current netns */
	table[0].extra1 = net;

	nn_vrf->ctl_hdr = register_net_sysctl_sz(net, "net/vrf", table,
						 ARRAY_SIZE(vrf_table));
	if (!nn_vrf->ctl_hdr) {
		kfree(table);
		return -ENOMEM;
	}

	return 0;
}

/* Unregister the per-netns sysctl table and free the copy made at init. */
static void vrf_netns_exit_sysctl(struct net *net)
{
	struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);
	const struct ctl_table *table;

	/* fetch the table pointer before the header is unregistered */
	table = nn_vrf->ctl_hdr->ctl_table_arg;
	unregister_net_sysctl_table(nn_vrf->ctl_hdr);
	kfree(table);
}
#else
/* Without CONFIG_SYSCTL the sysctl hooks are no-ops. */
static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
{
	return 0;
}

static void vrf_netns_exit_sysctl(struct net *net)
{
}
#endif

/* Initialize per network namespace state */
static int __net_init vrf_netns_init(struct net *net)
{
	struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);

	nn_vrf->add_fib_rules = true;
	vrf_map_init(&nn_vrf->vmap);

	return
vrf_netns_init_sysctl(net, nn_vrf);
}

/* Per-netns teardown: only the sysctl table needs to be released. */
static void __net_exit vrf_netns_exit(struct net *net)
{
	vrf_netns_exit_sysctl(net);
}

static struct pernet_operations vrf_net_ops __net_initdata = {
	.init = vrf_netns_init,
	.exit = vrf_netns_exit,
	.id   = &vrf_net_id,
	.size = sizeof(struct netns_vrf),
};

/* Module init.
 *
 * Registers, in order: the netdevice notifier, the pernet subsystem, the
 * l3mdev table lookup for L3MDEV_TYPE_VRF and the rtnl link ops.  A
 * failure unwinds the earlier registrations in reverse order and returns
 * the error.
 * NOTE(review): the return value of register_netdevice_notifier() is not
 * checked here.
 */
static int __init vrf_init_module(void)
{
	int rc;

	register_netdevice_notifier(&vrf_notifier_block);

	rc = register_pernet_subsys(&vrf_net_ops);
	if (rc < 0)
		goto error;

	rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF,
					  vrf_ifindex_lookup_by_table_id);
	if (rc < 0)
		goto unreg_pernet;

	rc = rtnl_link_register(&vrf_link_ops);
	if (rc < 0)
		goto table_lookup_unreg;

	return 0;

table_lookup_unreg:
	l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF,
				       vrf_ifindex_lookup_by_table_id);

unreg_pernet:
	unregister_pernet_subsys(&vrf_net_ops);

error:
	unregister_netdevice_notifier(&vrf_notifier_block);
	return rc;
}

module_init(vrf_init_module);
MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);
MODULE_VERSION(DRV_VERSION);
12 11 4 4 8 8 1 6 2 1 1 1 15 15 18 18 15 17 17 17 17 16 1 23 23 2 6 1 14 9 2 1 2 10 1 10 21 1 1 4 9 6 1 9 6 6 2 1 1 6 6 5 3 1 3 1 1 1 10 2 8 63 64 1 63 1 59 42 1 14 56 2 3 30 24 3 2 24 33 11 32 12 3 18 24 2 25 19 10 26 2 25 26 22 2 16 7 1 4 9 26 26 7 4 11 6 6 1 1 2 1 1 2 2 2 1 1 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 
457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 // SPDX-License-Identifier: GPL-2.0-or-later /* L2TPv3 IP encapsulation support for IPv6 * * Copyright (c) 2012 Katalix Systems Ltd */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/icmp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/socket.h> #include <linux/l2tp.h> #include 
<linux/in.h>
#include <linux/in6.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/inet_common.h>
#include <net/tcp_states.h>
#include <net/protocol.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

#include <net/transp_v6.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>

#include "l2tp_core.h"

/* per-net private data for this module */
static unsigned int l2tp_ip6_net_id;

/* Per-netns socket lists; both are protected by l2tp_ip6_lock.
 * l2tp_ip6_table holds sockets added by l2tp_ip6_hash(),
 * l2tp_ip6_bind_table holds sockets added by bind()/connect().
 */
struct l2tp_ip6_net {
	rwlock_t l2tp_ip6_lock;
	struct hlist_head l2tp_ip6_table;
	struct hlist_head l2tp_ip6_bind_table;
};

struct l2tp_ip6_sock {
	/* inet_sock has to be the first member of l2tp_ip6_sock */
	struct inet_sock	inet;

	u32			conn_id;	/* set by bind() */
	u32			peer_conn_id;	/* set by connect() */

	struct ipv6_pinfo	inet6;
};

/* Cast a struct sock to the l2tp_ip6 socket that embeds it. */
static struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk)
{
	return (struct l2tp_ip6_sock *)sk;
}

/* Return this module's private data of @net. */
static struct l2tp_ip6_net *l2tp_ip6_pernet(const struct net *net)
{
	return net_generic(net, l2tp_ip6_net_id);
}

/* Search the bind table of @net for a socket matching a tunnel.
 *
 * A socket matches when all of the following hold:
 *  - it belongs to @net;
 *  - its bound device and @dif are equal, or at least one of them is 0;
 *  - its local address equals @laddr, or either one is the any-address;
 *  - its peer address equals @raddr, or either one is the any-address,
 *    or @raddr is NULL;
 *  - its conn_id equals @tunnel_id.
 *
 * Returns the first matching socket or NULL.  No reference is taken by
 * this function; the callers in this file invoke it with l2tp_ip6_lock
 * held.
 */
static struct sock *__l2tp_ip6_bind_lookup(const struct net *net,
					   const struct in6_addr *laddr,
					   const struct in6_addr *raddr,
					   int dif, u32 tunnel_id)
{
	struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net);
	struct sock *sk;

	sk_for_each_bound(sk, &pn->l2tp_ip6_bind_table) {
		const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk);
		const struct in6_addr *sk_raddr = &sk->sk_v6_daddr;
		const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
		int bound_dev_if;

		if (!net_eq(sock_net(sk), net))
			continue;

		bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
		if (bound_dev_if && dif && bound_dev_if != dif)
			continue;

		if (sk_laddr && !ipv6_addr_any(sk_laddr) &&
		    !ipv6_addr_any(laddr) && !ipv6_addr_equal(sk_laddr, laddr))
			continue;

		if (!ipv6_addr_any(sk_raddr) && raddr &&
		    !ipv6_addr_any(raddr) && !ipv6_addr_equal(sk_raddr, raddr))
			continue;

		if (l2tp->conn_id != tunnel_id)
			continue;

		goto found;
	}

	sk = NULL;
found:
	return sk;
}

/* When processing receive frames, there are two cases to
 * consider.
Data frames consist of a non-zero session-id and an
 * optional cookie. Control frames consist of a regular L2TP header
 * preceded by 32-bits of zeros.
 *
 * L2TPv3 Session Header Over IP
 *
 *  0                   1                   2                   3
 *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                           Session ID                          |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |               Cookie (optional, maximum 64 bits)...
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *                                                                 |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * L2TPv3 Control Message Header Over IP
 *
 *  0                   1                   2                   3
 *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                      (32 bits of zeros)                       |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |T|L|x|x|S|x|x|x|x|x|x|x|  Ver  |             Length            |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                     Control Connection ID                     |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |               Ns              |               Nr              |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * All control frames are passed to userspace.
 */
static int l2tp_ip6_recv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct l2tp_ip6_net *pn;
	struct sock *sk;
	u32 session_id;
	u32 tunnel_id;
	unsigned char *ptr, *optr;
	struct l2tp_session *session;
	struct l2tp_tunnel *tunnel = NULL;
	struct ipv6hdr *iph;

	pn = l2tp_ip6_pernet(net);

	/* the 4-byte session id must be in the linear area before it is read */
	if (!pskb_may_pull(skb, 4))
		goto discard;

	/* Point to L2TP header */
	optr = skb->data;
	ptr = skb->data;
	session_id = ntohl(*((__be32 *)ptr));
	ptr += 4;

	/* RFC3931: L2TP/IP packets have the first 4 bytes containing
	 * the session_id. If it is 0, the packet is a L2TP control
	 * frame and the session_id value can be discarded.
	 */
	if (session_id == 0) {
		__skb_pull(skb, 4);
		goto pass_up;
	}

	/* Ok, this is a data packet. Lookup the session.
*/
	session = l2tp_v3_session_get(net, NULL, session_id);
	if (!session)
		goto discard;

	tunnel = session->tunnel;
	if (!tunnel)
		goto discard_sess;

	if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr))
		goto discard_sess;

	l2tp_recv_common(session, skb, ptr, optr, 0, skb->len);
	/* drop the session reference obtained by the lookup above */
	l2tp_session_put(session);

	return 0;

pass_up:
	/* Get the tunnel_id from the L2TP header */
	if (!pskb_may_pull(skb, 12))
		goto discard;

	/* control frames with the two top bits of the first byte not both
	 * set are dropped
	 */
	if ((skb->data[0] & 0xc0) != 0xc0)
		goto discard;

	tunnel_id = ntohl(*(__be32 *)&skb->data[4]);
	iph = ipv6_hdr(skb);

	/* the lookup takes no reference, so hold the socket before the
	 * lock is released
	 */
	read_lock_bh(&pn->l2tp_ip6_lock);
	sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr,
				    inet6_iif(skb), tunnel_id);
	if (!sk) {
		read_unlock_bh(&pn->l2tp_ip6_lock);
		goto discard;
	}
	sock_hold(sk);
	read_unlock_bh(&pn->l2tp_ip6_lock);

	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_put;

	nf_reset_ct(skb);

	return sk_receive_skb(sk, skb, 1);

discard_sess:
	l2tp_session_put(session);
	goto discard;

discard_put:
	sock_put(sk);

discard:
	kfree_skb(skb);
	return 0;
}

/* proto ->hash: add a still unhashed socket to the per-net socket list.
 * Always returns 0.
 */
static int l2tp_ip6_hash(struct sock *sk)
{
	struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk));

	if (sk_unhashed(sk)) {
		write_lock_bh(&pn->l2tp_ip6_lock);
		sk_add_node(sk, &pn->l2tp_ip6_table);
		write_unlock_bh(&pn->l2tp_ip6_lock);
	}
	return 0;
}

/* proto ->unhash: remove a hashed socket from the per-net socket list;
 * does nothing for an unhashed socket.
 */
static void l2tp_ip6_unhash(struct sock *sk)
{
	struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk));

	if (sk_unhashed(sk))
		return;
	write_lock_bh(&pn->l2tp_ip6_lock);
	sk_del_node_init(sk);
	write_unlock_bh(&pn->l2tp_ip6_lock);
}

/* proto ->init: set inet_num and hash the new socket.  Always returns 0. */
static int l2tp_ip6_open(struct sock *sk)
{
	/* Prevent autobind. We don't have ports.
*/ inet_sk(sk)->inet_num = IPPROTO_L2TP; l2tp_ip6_hash(sk); return 0; } static void l2tp_ip6_close(struct sock *sk, long timeout) { struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); write_lock_bh(&pn->l2tp_ip6_lock); hlist_del_init(&sk->sk_bind_node); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); sk_common_release(sk); } static void l2tp_ip6_destroy_sock(struct sock *sk) { struct l2tp_tunnel *tunnel; lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); tunnel = l2tp_sk_to_tunnel(sk); if (tunnel) { l2tp_tunnel_delete(tunnel); l2tp_tunnel_put(tunnel); } } static int l2tp_ip6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *)uaddr; struct net *net = sock_net(sk); struct l2tp_ip6_net *pn; __be32 v4addr = 0; int bound_dev_if; int addr_type; int err; pn = l2tp_ip6_pernet(net); if (addr->l2tp_family != AF_INET6) return -EINVAL; if (addr_len < sizeof(*addr)) return -EINVAL; addr_type = ipv6_addr_type(&addr->l2tp_addr); /* l2tp_ip6 sockets are IPv6 only */ if (addr_type == IPV6_ADDR_MAPPED) return -EADDRNOTAVAIL; /* L2TP is point-point, not multicast */ if (addr_type & IPV6_ADDR_MULTICAST) return -EADDRNOTAVAIL; lock_sock(sk); err = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out_unlock; if (sk->sk_state != TCP_CLOSE) goto out_unlock; bound_dev_if = sk->sk_bound_dev_if; /* Check if the address belongs to the host. */ rcu_read_lock(); if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; if (addr_type & IPV6_ADDR_LINKLOCAL) { if (addr->l2tp_scope_id) bound_dev_if = addr->l2tp_scope_id; /* Binding to link-local address requires an * interface. */ if (!bound_dev_if) goto out_unlock_rcu; err = -ENODEV; dev = dev_get_by_index_rcu(sock_net(sk), bound_dev_if); if (!dev) goto out_unlock_rcu; } /* ipv4 addr of the socket is invalid. 
Only the * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; err = -EADDRNOTAVAIL; if (!ipv6_chk_addr(sock_net(sk), &addr->l2tp_addr, dev, 0)) goto out_unlock_rcu; } rcu_read_unlock(); write_lock_bh(&pn->l2tp_ip6_lock); if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, NULL, bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&pn->l2tp_ip6_lock); err = -EADDRINUSE; goto out_unlock; } inet->inet_saddr = v4addr; inet->inet_rcv_saddr = v4addr; sk->sk_bound_dev_if = bound_dev_if; sk->sk_v6_rcv_saddr = addr->l2tp_addr; np->saddr = addr->l2tp_addr; l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id; sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table); sk_del_node_init(sk); write_unlock_bh(&pn->l2tp_ip6_lock); sock_reset_flag(sk, SOCK_ZAPPED); release_sock(sk); return 0; out_unlock_rcu: rcu_read_unlock(); out_unlock: release_sock(sk); return err; } static int l2tp_ip6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; struct in6_addr *daddr; int addr_type; int rc; struct l2tp_ip6_net *pn; if (addr_len < sizeof(*lsa)) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EINVAL; addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -EINVAL; if (addr_type & IPV6_ADDR_MAPPED) { daddr = &usin->sin6_addr; if (ipv4_is_multicast(daddr->s6_addr32[3])) return -EINVAL; } lock_sock(sk); /* Must bind first - autobinding does not work */ if (sock_flag(sk, SOCK_ZAPPED)) { rc = -EINVAL; goto out_sk; } rc = __ip6_datagram_connect(sk, uaddr, addr_len); if (rc < 0) goto out_sk; l2tp_ip6_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; pn = l2tp_ip6_pernet(sock_net(sk)); write_lock_bh(&pn->l2tp_ip6_lock); hlist_del_init(&sk->sk_bind_node); sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table); write_unlock_bh(&pn->l2tp_ip6_lock); out_sk: release_sock(sk); return rc; } static int 
l2tp_ip6_disconnect(struct sock *sk, int flags)
{
	/* a socket that is still SOCK_ZAPPED has nothing to disconnect */
	if (sock_flag(sk, SOCK_ZAPPED))
		return 0;

	return __udp_disconnect(sk, flags);
}

/* proto_ops ->getname: fill @uaddr (as struct sockaddr_l2tpip6) with the
 * peer address (@peer != 0) or the local address (@peer == 0).
 *
 * For the peer, -ENOTCONN is returned while no peer connection id is set.
 * For the local side np->saddr is reported when sk_v6_rcv_saddr is the
 * any-address.  The scope id is only filled in for link-local addresses.
 *
 * Returns the size of struct sockaddr_l2tpip6 on success.
 */
static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
			    int peer)
{
	struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr;
	struct sock *sk = sock->sk;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct l2tp_ip6_sock *lsk = l2tp_ip6_sk(sk);

	lsa->l2tp_family = AF_INET6;
	lsa->l2tp_flowinfo = 0;
	lsa->l2tp_scope_id = 0;
	lsa->l2tp_unused = 0;
	if (peer) {
		if (!lsk->peer_conn_id)
			return -ENOTCONN;
		lsa->l2tp_conn_id = lsk->peer_conn_id;
		lsa->l2tp_addr = sk->sk_v6_daddr;
		if (inet6_test_bit(SNDFLOW, sk))
			lsa->l2tp_flowinfo = np->flow_label;
	} else {
		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
			lsa->l2tp_addr = np->saddr;
		else
			lsa->l2tp_addr = sk->sk_v6_rcv_saddr;

		lsa->l2tp_conn_id = lsk->conn_id;
	}
	if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL)
		lsa->l2tp_scope_id = READ_ONCE(sk->sk_bound_dev_if);
	return sizeof(*lsa);
}

/* proto ->backlog_rcv: queue @skb on the socket's receive queue.
 *
 * Returns 0 on success.  If queueing fails the IPSTATS_MIB_INDISCARDS
 * counter is bumped, the skb is freed and -1 is returned.
 */
static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb)
{
	int rc;

	/* Charge it to the socket, dropping if the queue is full. */
	rc = sock_queue_rcv_skb(sk, skb);
	if (rc < 0)
		goto drop;

	return 0;

drop:
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
	kfree_skb(skb);
	return -1;
}

/* Zero the first 32 bits of the transport header of the first queued skb
 * and push the pending frames.  Returns 0 when the write queue is empty,
 * otherwise the result of ip6_push_pending_frames().
 */
static int l2tp_ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;
	__be32 *transhdr = NULL;
	int err = 0;

	skb = skb_peek(&sk->sk_write_queue);
	if (!skb)
		goto out;

	transhdr = (__be32 *)skb_transport_header(skb);
	*transhdr = 0;

	err = ip6_push_pending_frames(sk);

out:
	return err;
}

/* Userspace will call sendmsg() on the tunnel socket to send L2TP
 * control frames.
*/
static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct ipv6_txoptions opt_space;
	DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name);
	struct in6_addr *daddr, *final_p, final;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6_txoptions *opt_to_free = NULL;
	struct ipv6_txoptions *opt = NULL;
	struct ip6_flowlabel *flowlabel = NULL;
	struct dst_entry *dst = NULL;
	struct flowi6 fl6;
	struct ipcm6_cookie ipc6;
	int addr_len = msg->msg_namelen;
	int transhdrlen = 4; /* zero session-id */
	int ulen;
	int err;

	/* Rough check on arithmetic overflow,
	 * better check is made in ip6_append_data().
	 */
	if (len > INT_MAX - transhdrlen)
		return -EMSGSIZE;

	/* Mirror BSD error message compatibility */
	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	/* Get and verify the address */
	memset(&fl6, 0, sizeof(fl6));

	fl6.flowi6_mark = READ_ONCE(sk->sk_mark);
	fl6.flowi6_uid = sk_uid(sk);

	ipcm6_init_sk(&ipc6, sk);

	if (lsa) {
		/* an explicit destination was supplied in msg_name */
		if (addr_len < SIN6_LEN_RFC2133)
			return -EINVAL;

		/* a zero family is accepted, any other must be AF_INET6 */
		if (lsa->l2tp_family && lsa->l2tp_family != AF_INET6)
			return -EAFNOSUPPORT;

		daddr = &lsa->l2tp_addr;
		if (inet6_test_bit(SNDFLOW, sk)) {
			fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK;
			if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
				flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
				if (IS_ERR(flowlabel))
					return -EINVAL;
			}
		}

		/* Otherwise it will be difficult to maintain
		 * sk->sk_dst_cache.
*/
		if (sk->sk_state == TCP_ESTABLISHED &&
		    ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
			daddr = &sk->sk_v6_daddr;

		/* the scope id is only honoured for link-local destinations */
		if (addr_len >= sizeof(struct sockaddr_in6) &&
		    lsa->l2tp_scope_id &&
		    ipv6_addr_type(daddr) & IPV6_ADDR_LINKLOCAL)
			fl6.flowi6_oif = lsa->l2tp_scope_id;
	} else {
		/* no address given: the socket must be connected */
		if (sk->sk_state != TCP_ESTABLISHED)
			return -EDESTADDRREQ;

		daddr = &sk->sk_v6_daddr;
		fl6.flowlabel = np->flow_label;
	}

	if (fl6.flowi6_oif == 0)
		fl6.flowi6_oif = READ_ONCE(sk->sk_bound_dev_if);

	if (msg->msg_controllen) {
		/* parse ancillary data into the on-stack option block */
		opt = &opt_space;
		memset(opt, 0, sizeof(struct ipv6_txoptions));
		opt->tot_len = sizeof(struct ipv6_txoptions);
		ipc6.opt = opt;

		err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
		if (err < 0) {
			fl6_sock_release(flowlabel);
			return err;
		}
		if ((fl6.flowlabel & IPV6_FLOWLABEL_MASK) && !flowlabel) {
			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
			if (IS_ERR(flowlabel))
				return -EINVAL;
		}
		/* no option lengths were set: fall back to the socket's options */
		if (!(opt->opt_nflen | opt->opt_flen))
			opt = NULL;
	}

	if (!opt) {
		opt = txopt_get(np);
		/* remembered so that it can be released with txopt_put() */
		opt_to_free = opt;
	}
	if (flowlabel)
		opt = fl6_merge_options(&opt_space, flowlabel, opt);
	opt = ipv6_fixup_options(&opt_space, opt);
	ipc6.opt = opt;

	fl6.flowi6_proto = sk->sk_protocol;
	if (!ipv6_addr_any(daddr))
		fl6.daddr = *daddr;
	else
		fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
	if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
		fl6.saddr = np->saddr;

	final_p = fl6_update_dst(&fl6, opt, &final);

	/* still no output interface: use the socket's multicast or unicast
	 * default
	 */
	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
		fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
	else if (!fl6.flowi6_oif)
		fl6.flowi6_oif = READ_ONCE(np->ucast_oif);

	security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));

	fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
	if (IS_ERR(dst)) {
		err = PTR_ERR(dst);
		/* "out" skips dst_release(): dst holds an error pointer */
		goto out;
	}

	/* a negative hop limit is replaced by the route/socket default */
	if (ipc6.hlimit < 0)
		ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);

	if (msg->msg_flags & MSG_CONFIRM)
		goto do_confirm;

back_from_confirm:
	lock_sock(sk);
	/* the 4-byte header is only counted for the first chunk queued */
	ulen = len +
(skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0);
	err = ip6_append_data(sk, ip_generic_getfrag, msg,
			      ulen, transhdrlen, &ipc6,
			      &fl6, dst_rt6_info(dst),
			      msg->msg_flags);
	if (err)
		ip6_flush_pending_frames(sk);
	else if (!(msg->msg_flags & MSG_MORE))
		err = l2tp_ip6_push_pending_frames(sk);
	release_sock(sk);
done:
	dst_release(dst);
out:
	fl6_sock_release(flowlabel);
	txopt_put(opt_to_free);

	/* a negative error is returned as is, anything else reports @len */
	return err < 0 ? err : len;

do_confirm:
	if (msg->msg_flags & MSG_PROBE)
		dst_confirm_neigh(dst, &fl6.daddr);
	/* a zero-length MSG_PROBE sends nothing and returns early */
	if (!(msg->msg_flags & MSG_PROBE) || len)
		goto back_from_confirm;
	err = 0;
	goto done;
}

/* proto ->recvmsg: receive one datagram.
 *
 * MSG_OOB yields -EOPNOTSUPP; MSG_ERRQUEUE is delegated to
 * ipv6_recv_error().  Otherwise one skb is dequeued, at most @len bytes
 * are copied to @msg (MSG_TRUNC is set in msg_flags when the datagram is
 * longer), and, if the caller supplied msg_name, the sender's address is
 * reported as a sockaddr_l2tpip6 with conn_id 0.
 *
 * Returns the number of bytes copied - or the full datagram length when
 * MSG_TRUNC was passed in @flags - or a negative error code.
 */
static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			    int flags)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name);
	size_t copied = 0;
	int err = -EOPNOTSUPP;
	struct sk_buff *skb;

	if (flags & MSG_OOB)
		goto out;

	if (flags & MSG_ERRQUEUE)
		return ipv6_recv_error(sk, msg, len);

	skb = skb_recv_datagram(sk, flags, &err);
	if (!skb)
		goto out;

	copied = skb->len;
	if (len < copied) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}

	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto done;

	sock_recv_timestamp(msg, sk, skb);

	/* Copy the address. */
	if (lsa) {
		lsa->l2tp_family = AF_INET6;
		lsa->l2tp_unused = 0;
		lsa->l2tp_addr = ipv6_hdr(skb)->saddr;
		lsa->l2tp_flowinfo = 0;
		lsa->l2tp_scope_id = 0;
		lsa->l2tp_conn_id = 0;
		if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL)
			lsa->l2tp_scope_id = inet6_iif(skb);
		msg->msg_namelen = sizeof(*lsa);
	}

	if (np->rxopt.all)
		ip6_datagram_recv_ctl(sk, msg, skb);

	/* with MSG_TRUNC the caller is told the real datagram length */
	if (flags & MSG_TRUNC)
		copied = skb->len;
done:
	skb_free_datagram(sk, skb);
out:
	return err ?
err : copied;
}

/* Socket-level protocol operations of L2TP/IPv6 sockets. */
static struct proto l2tp_ip6_prot = {
	.name		   = "L2TP/IPv6",
	.owner		   = THIS_MODULE,
	.init		   = l2tp_ip6_open,
	.close		   = l2tp_ip6_close,
	.bind		   = l2tp_ip6_bind,
	.connect	   = l2tp_ip6_connect,
	.disconnect	   = l2tp_ip6_disconnect,
	.ioctl		   = l2tp_ioctl,
	.destroy	   = l2tp_ip6_destroy_sock,
	.setsockopt	   = ipv6_setsockopt,
	.getsockopt	   = ipv6_getsockopt,
	.sendmsg	   = l2tp_ip6_sendmsg,
	.recvmsg	   = l2tp_ip6_recvmsg,
	.backlog_rcv	   = l2tp_ip6_backlog_recv,
	.hash		   = l2tp_ip6_hash,
	.unhash		   = l2tp_ip6_unhash,
	.obj_size	   = sizeof(struct l2tp_ip6_sock),
	.ipv6_pinfo_offset = offsetof(struct l2tp_ip6_sock, inet6),
};

/* BSD socket operations; only getname is specific to this module. */
static const struct proto_ops l2tp_ip6_ops = {
	.family		   = PF_INET6,
	.owner		   = THIS_MODULE,
	.release	   = inet6_release,
	.bind		   = inet6_bind,
	.connect	   = inet_dgram_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = sock_no_accept,
	.getname	   = l2tp_ip6_getname,
	.poll		   = datagram_poll,
	.ioctl		   = inet6_ioctl,
	.gettstamp	   = sock_gettstamp,
	.listen		   = sock_no_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = sock_common_recvmsg,
	.mmap		   = sock_no_mmap,
#ifdef CONFIG_COMPAT
	.compat_ioctl	   = inet6_compat_ioctl,
#endif
};

/* Ties SOCK_DGRAM / IPPROTO_L2TP sockets to the two tables above. */
static struct inet_protosw l2tp_ip6_protosw = {
	.type		= SOCK_DGRAM,
	.protocol	= IPPROTO_L2TP,
	.prot		= &l2tp_ip6_prot,
	.ops		= &l2tp_ip6_ops,
};

/* Receive hook; registered for IPPROTO_L2TP by the module init function. */
static struct inet6_protocol l2tp_ip6_protocol __read_mostly = {
	.handler	= l2tp_ip6_recv,
};

/* Per-netns init: set up the lock and both (empty) socket lists.
 * Always returns 0.
 */
static __net_init int l2tp_ip6_init_net(struct net *net)
{
	struct l2tp_ip6_net *pn = net_generic(net, l2tp_ip6_net_id);

	rwlock_init(&pn->l2tp_ip6_lock);
	INIT_HLIST_HEAD(&pn->l2tp_ip6_table);
	INIT_HLIST_HEAD(&pn->l2tp_ip6_bind_table);
	return 0;
}

/* Per-netns exit: warn once if either socket list is not empty. */
static __net_exit void l2tp_ip6_exit_net(struct net *net)
{
	struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net);

	write_lock_bh(&pn->l2tp_ip6_lock);
	WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_table) != 0);
	WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_bind_table) != 0);
	write_unlock_bh(&pn->l2tp_ip6_lock);
}

static
struct pernet_operations l2tp_ip6_net_ops = { .init = l2tp_ip6_init_net, .exit = l2tp_ip6_exit_net, .id = &l2tp_ip6_net_id, .size = sizeof(struct l2tp_ip6_net), }; static int __init l2tp_ip6_init(void) { int err; pr_info("L2TP IP encapsulation support for IPv6 (L2TPv3)\n"); err = register_pernet_device(&l2tp_ip6_net_ops); if (err) goto out; err = proto_register(&l2tp_ip6_prot, 1); if (err != 0) goto out1; err = inet6_add_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP); if (err) goto out2; inet6_register_protosw(&l2tp_ip6_protosw); return 0; out2: proto_unregister(&l2tp_ip6_prot); out1: unregister_pernet_device(&l2tp_ip6_net_ops); out: return err; } static void __exit l2tp_ip6_exit(void) { inet6_unregister_protosw(&l2tp_ip6_protosw); inet6_del_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP); proto_unregister(&l2tp_ip6_prot); unregister_pernet_device(&l2tp_ip6_net_ops); } module_init(l2tp_ip6_init); module_exit(l2tp_ip6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Chris Elston <celston@katalix.com>"); MODULE_DESCRIPTION("L2TP IP encapsulation for IPv6"); MODULE_VERSION("1.0"); /* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol, * because __stringify doesn't like enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 115, 2); MODULE_ALIAS_NET_PF_PROTO(PF_INET6, 115);
6 51 51 6 51 6 5 2 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H #include <linux/blk-mq.h> #include "blk-stat.h" struct blk_mq_tag_set; struct elevator_tags; struct blk_mq_ctxs { struct kobject kobj; struct 
blk_mq_ctx __percpu	*queue_ctx;
};

/**
 * struct blk_mq_ctx - State for a software queue facing the submitting CPUs
 *
 * The lock and the per-type request lists share one cacheline-aligned
 * anonymous struct; index_hw[] and hctxs[] hold one entry per hctx type
 * (both arrays are sized HCTX_MAX_TYPES).
 */
struct blk_mq_ctx {
	struct {
		spinlock_t		lock;
		struct list_head	rq_lists[HCTX_MAX_TYPES];
	} ____cacheline_aligned_in_smp;

	unsigned int		cpu;
	unsigned short		index_hw[HCTX_MAX_TYPES];
	struct blk_mq_hw_ctx 	*hctxs[HCTX_MAX_TYPES];

	struct request_queue	*queue;
	struct blk_mq_ctxs      *ctxs;
	struct kobject		kobj;
} ____cacheline_aligned_in_smp;

/* Tag sentinel and the bounds derived from it. */
enum {
	BLK_MQ_NO_TAG		= -1U,			/* all-ones "no tag" marker */
	BLK_MQ_TAG_MIN		= 1,
	BLK_MQ_TAG_MAX		= BLK_MQ_NO_TAG - 1,	/* largest value below the marker */
};

#define BLK_MQ_CPU_WORK_BATCH	(8)

/* Bitwise-annotated flag type for request insertion; only one flag is defined here. */
typedef unsigned int __bitwise blk_insert_t;
#define BLK_MQ_INSERT_AT_HEAD		((__force blk_insert_t)0x01)

void blk_mq_submit_bio(struct bio *bio);
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
		unsigned int flags);
void blk_mq_exit_queue(struct request_queue *q);
struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
						struct elevator_tags *tags,
						unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
			     bool);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
					struct blk_mq_ctx *start);
void blk_mq_put_rq_ref(struct request *rq);

/*
 * Internal helpers for allocating/freeing the request map
 */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
		     unsigned int hctx_idx);
void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags);
struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
				unsigned int hctx_idx, unsigned int depth);
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
			     struct blk_mq_tags *tags,
			     unsigned int hctx_idx);

/*
 * CPU -> queue mappings
 */
extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int);

/*
 * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue
 * @q:
request queue
 * @type: the hctx type index
 * @cpu: CPU
 */
static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q,
							  enum hctx_type type,
							  unsigned int cpu)
{
	/* The per-type map turns the CPU number into a hardware queue index. */
	return queue_hctx(q, q->tag_set->map[type].mq_map[cpu]);
}

/*
 * Pick the hardware-queue type for an operation: polled I/O takes priority,
 * then plain reads; everything else uses the default type.
 */
static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf)
{
	/*
	 * The caller ensures that polling is enabled whenever REQ_POLLED
	 * is set.
	 */
	if (opf & REQ_POLLED)
		return HCTX_TYPE_POLL;

	if ((opf & REQ_OP_MASK) == REQ_OP_READ)
		return HCTX_TYPE_READ;

	return HCTX_TYPE_DEFAULT;
}

/*
 * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
 * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED).
 * @ctx: software queue cpu ctx
 */
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf,
						     struct blk_mq_ctx *ctx)
{
	enum hctx_type type = blk_mq_get_hctx_type(opf);

	return ctx->hctxs[type];
}

/*
 * Default to double of smaller one between hw queue_depth and
 * 128, since we don't split into sync/async like the old code
 * did. Additionally, this is a per-hw queue depth.
*/ static inline unsigned int blk_mq_default_nr_requests( struct blk_mq_tag_set *set) { return 2 * min_t(unsigned int, set->queue_depth, BLKDEV_DEFAULT_RQ); } /* * sysfs helpers */ extern void blk_mq_sysfs_init(struct request_queue *q); extern void blk_mq_sysfs_deinit(struct request_queue *q); int blk_mq_sysfs_register(struct gendisk *disk); void blk_mq_sysfs_unregister(struct gendisk *disk); int blk_mq_sysfs_register_hctxs(struct request_queue *q); void blk_mq_sysfs_unregister_hctxs(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_free_plug_rqs(struct blk_plug *plug); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { return per_cpu_ptr(q->queue_ctx, cpu); } /* * This assumes per-cpu software queueing queues. They could be per-node * as well, for instance. For now this is hardcoded as-is. Note that we don't * care about preemption, since we know the ctx's are persistent. This does * mean that we can't rely on ctx always matching the currently running CPU. 
 */
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
	/* raw_ variant: the CPU id is only used as an index into the per-cpu ctxs. */
	return __blk_mq_get_ctx(q, raw_smp_processor_id());
}

/* Parameter block passed to the tag/request allocation helpers declared below. */
struct blk_mq_alloc_data {
	/* input parameter */
	struct request_queue *q;
	blk_mq_req_flags_t flags;
	unsigned int shallow_depth;
	blk_opf_t cmd_flags;
	req_flags_t rq_flags;

	/* allocate multiple requests/tags in one go */
	unsigned int nr_tags;
	struct rq_list *cached_rqs;

	/* input & output parameter */
	struct blk_mq_ctx *ctx;
	struct blk_mq_hw_ctx *hctx;
};

struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
		unsigned int reserved_tags, unsigned int flags, int node);
void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags);

unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
		unsigned int *offset);
void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
		unsigned int tag);
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
		unsigned int size);
void blk_mq_tag_update_sched_shared_tags(struct request_queue *q,
		unsigned int nr);

void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
		void *priv);
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
		void *priv);

/*
 * Select the wait state to sleep on for tag bitmap @bt. Without an hctx the
 * first wait state is used; otherwise the choice is delegated to
 * sbq_wait_ptr() using the hctx's wait_index.
 */
static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
						 struct blk_mq_hw_ctx *hctx)
{
	if (!hctx)
		return &bt->ws[0];
	return sbq_wait_ptr(bt, &hctx->wait_index);
}

void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);

/* Busy/idle notifications only matter when BLK_MQ_F_TAG_QUEUE_SHARED is set. */
static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
		__blk_mq_tag_busy(hctx);
}

static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
		__blk_mq_tag_idle(hctx);
}

static inline bool
blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
					  unsigned int tag)
{
	/* Reserved tags occupy the low end of the tag space: [0, nr_reserved_tags). */
	return tag < tags->nr_reserved_tags;
}

/* True when @flags carries BLK_MQ_F_TAG_HCTX_SHARED. */
static inline bool blk_mq_is_shared_tags(unsigned int flags)
{
	return flags & BLK_MQ_F_TAG_HCTX_SHARED;
}

/* Pick the hctx's scheduler tags or driver tags, depending on RQF_SCHED_TAGS. */
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
	if (data->rq_flags & RQF_SCHED_TAGS)
		return data->hctx->sched_tags;
	return data->hctx->tags;
}

/* Report whether @hctx is stopped; the slow path re-tests after a full barrier. */
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
{
	/* Fast path: hardware queue is not stopped most of the time. */
	if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return false;

	/*
	 * This barrier orders the addition to the dispatch list before the
	 * test of BLK_MQ_S_STOPPED below. It pairs with the memory barrier
	 * in blk_mq_start_stopped_hw_queue() so that the dispatch code
	 * either sees BLK_MQ_S_STOPPED cleared or sees a non-empty dispatch
	 * list, and therefore never misses dispatching requests.
	 */
	smp_mb();

	return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
}

/* A hardware queue counts as mapped when it has software queues and tags. */
static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
{
	return hctx->nr_ctx && hctx->tags;
}

void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]);

/*
 * Dispatch-budget helpers. Each one forwards to the optional driver callback
 * in q->mq_ops; when the callback is absent the put/set variants do nothing
 * and the get variants return their default (0 and -1 respectively).
 */
static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
					      int budget_token)
{
	if (q->mq_ops->put_budget)
		q->mq_ops->put_budget(q, budget_token);
}

static inline int blk_mq_get_dispatch_budget(struct request_queue *q)
{
	if (q->mq_ops->get_budget)
		return q->mq_ops->get_budget(q);
	return 0;
}

static inline void blk_mq_set_rq_budget_token(struct request *rq, int token)
{
	/* Negative tokens are not recorded. */
	if (token < 0)
		return;

	if (rq->q->mq_ops->set_rq_budget_token)
		rq->q->mq_ops->set_rq_budget_token(rq, token);
}

static inline int blk_mq_get_rq_budget_token(struct request *rq)
{
	if (rq->q->mq_ops->get_rq_budget_token)
		return rq->q->mq_ops->get_rq_budget_token(rq);
	return -1;
}

/*
 * Add @val to the active-request counter: the queue-wide counter when tags
 * are shared across hctxs, the per-hctx counter otherwise.
 */
static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
						int val)
{
	if (blk_mq_is_shared_tags(hctx->flags))
		atomic_add(val,
&hctx->queue->nr_active_requests_shared_tags); else atomic_add(val, &hctx->nr_active); } static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_add_active_requests(hctx, 1); } static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_sub(val, &hctx->nr_active); } static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_sub_active_requests(hctx, 1); } static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_add_active_requests(hctx, val); } static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_inc_active_requests(hctx); } static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_sub_active_requests(hctx, val); } static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_dec_active_requests(hctx); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_shared_tags(hctx->flags)) return atomic_read(&hctx->queue->nr_active_requests_shared_tags); return atomic_read(&hctx->nr_active); } static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = BLK_MQ_NO_TAG; } static inline void blk_mq_put_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) return; __blk_mq_put_driver_tag(rq->mq_hctx, rq); } bool __blk_mq_alloc_driver_tag(struct request *rq); static inline bool blk_mq_get_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG && 
!__blk_mq_alloc_driver_tag(rq)) return false; return true; } static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; for_each_possible_cpu(cpu) qmap->mq_map[cpu] = 0; } /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { while (!list_empty(list)) { struct request *rq = list_entry_rq(list->next); list_del_init(&rq->queuelist); blk_mq_free_request(rq); } } /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { unsigned int depth, users; if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return true; /* * Don't try dividing an ant */ if (bt->sb.depth == 1) return true; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; } users = READ_ONCE(hctx->tags->active_queues); if (!users) return true; /* * Allow at least some tags */ depth = max((bt->sb.depth + users - 1) / users, 4U); return __blk_mq_active_requests(hctx) < depth; } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ struct blk_mq_tag_set *__tag_set = (q)->tag_set; \ int srcu_idx; \ \ might_sleep_if(check_sleep); \ srcu_idx = srcu_read_lock(__tag_set->srcu); \ (dispatch_ops); \ srcu_read_unlock(__tag_set->srcu, srcu_idx); \ } else { \ rcu_read_lock(); \ (dispatch_ops); \ rcu_read_unlock(); \ } \ } while (0) #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ static inline bool blk_mq_can_poll(struct request_queue *q) { return (q->limits.features & BLK_FEAT_POLL) && 
q->tag_set->map[HCTX_TYPE_POLL].nr_queues; } #endif
2565 2570 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/kthread.h> #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> #include 
<linux/posix-timers.h> #include <linux/cn_proc.h> #include <linux/mutex.h> #include <linux/futex.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> #include <linux/kmsan.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> #include <linux/rethook.h> #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/unwind_deferred.h> #include <linux/uaccess.h> #include <linux/pidfs.h> #include <uapi/linux/wait.h> #include <asm/unistd.h> #include <asm/mmu_context.h> #include "exit.h" /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit * overflowing 32-bit refcounts or the ldsem writer count. 
*/ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL static const struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, .maxlen = sizeof(oops_limit), .mode = 0644, .proc_handler = proc_douintvec, }, }; static __init int kernel_exit_sysctls_init(void) { register_sysctl_init("kernel", kern_exit_table); return 0; } late_initcall(kernel_exit_sysctls_init); #endif static atomic_t oops_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); } static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); static __init int kernel_exit_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); return 0; } late_initcall(kernel_exit_sysfs_init); #endif /* * For things release_task() would like to do *after* tasklist_lock is released. */ struct release_task_post { struct pid *pids[PIDTYPE_MAX]; }; static void __unhash_process(struct release_task_post *post, struct task_struct *p, bool group_dead) { struct pid *pid = task_pid(p); nr_threads--; detach_pid(post->pids, p, PIDTYPE_PID); wake_up_all(&pid->wait_pidfd); if (group_dead) { detach_pid(post->pids, p, PIDTYPE_TGID); detach_pid(post->pids, p, PIDTYPE_PGID); detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. 
 */
static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
{
	struct signal_struct *sig = tsk->signal;
	/* The whole group is treated as dead when @tsk is the group leader. */
	bool group_dead = thread_group_leader(tsk);
	struct sighand_struct *sighand;
	/* Only assigned, and only used, on the group_dead path. */
	struct tty_struct *tty;
	u64 utime, stime;

	sighand = rcu_dereference_check(tsk->sighand,
					lockdep_tasklist_lock_is_held());
	spin_lock(&sighand->siglock);

#ifdef CONFIG_POSIX_TIMERS
	posix_cpu_timers_exit(tsk);
	if (group_dead)
		posix_cpu_timers_exit_group(tsk);
#endif

	if (group_dead) {
		/* Take over the tty reference; it is dropped after unlocking. */
		tty = sig->tty;
		sig->tty = NULL;
	} else {
		/*
		 * If there is any task waiting for the group exit
		 * then notify it:
		 */
		if (sig->notify_count > 0 && !--sig->notify_count)
			wake_up_process(sig->group_exec_task);

		/* Do not leave curr_target pointing at the exiting thread. */
		if (tsk == sig->curr_target)
			sig->curr_target = next_thread(tsk);
	}

	/*
	 * Accumulate here the counters for all threads as they die. We could
	 * skip the group leader because it is the last user of signal_struct,
	 * but we want to avoid the race with thread_group_cputime() which can
	 * see the empty ->thread_head list.
*/
	task_cputime(tsk, &utime, &stime);
	/* Fold the thread's statistics into the group totals under stats_lock. */
	write_seqlock(&sig->stats_lock);
	sig->utime += utime;
	sig->stime += stime;
	sig->gtime += task_gtime(tsk);
	sig->min_flt += tsk->min_flt;
	sig->maj_flt += tsk->maj_flt;
	sig->nvcsw += tsk->nvcsw;
	sig->nivcsw += tsk->nivcsw;
	sig->inblock += task_io_get_inblock(tsk);
	sig->oublock += task_io_get_oublock(tsk);
	task_io_accounting_add(&sig->ioac, &tsk->ioac);
	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
	sig->nr_threads--;
	__unhash_process(post, tsk, group_dead);
	write_sequnlock(&sig->stats_lock);

	/* Clear the task's sighand pointer while siglock is still held. */
	tsk->sighand = NULL;
	spin_unlock(&sighand->siglock);

	__cleanup_sighand(sighand);
	if (group_dead)
		tty_kref_put(tty);
}

/*
 * RCU callback queued by put_task_struct_rcu_user(): flush per-task
 * kprobe/rethook/perf state, emit the sched_process_free tracepoint and drop
 * a task_struct reference.
 */
static void delayed_put_task_struct(struct rcu_head *rhp)
{
	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

	kprobe_flush_task(tsk);
	rethook_flush_task(tsk);
	perf_event_delayed_put(tsk);
	trace_sched_process_free(tsk);
	put_task_struct(tsk);
}

/*
 * Drop one rcu_users reference; when it was the last one, defer the final
 * put to delayed_put_task_struct() via call_rcu().
 */
void put_task_struct_rcu_user(struct task_struct *task)
{
	if (refcount_dec_and_test(&task->rcu_users))
		call_rcu(&task->rcu, delayed_put_task_struct);
}

/* Weak default that does nothing; another definition may replace it at link time. */
void __weak release_thread(struct task_struct *dead_task)
{
}

/*
 * Release the resources of the dead task @p. If @p was the last non-leader
 * thread and its zombie leader must be released too, the function loops
 * (label "repeat") and processes the leader as well.
 */
void release_task(struct task_struct *p)
{
	struct release_task_post post;
	struct task_struct *leader;
	struct pid *thread_pid;
	int zap_leader;
repeat:
	memset(&post, 0, sizeof(post));

	/* don't need to get the RCU readlock here - the process is dead and
	 * can't be modifying its own credentials.
	 */
	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);

	pidfs_exit(p);
	cgroup_task_release(p);

	/* Retrieve @thread_pid before __unhash_process() may set it to NULL. */
	thread_pid = task_pid(p);

	write_lock_irq(&tasklist_lock);
	ptrace_release_task(p);
	__exit_signal(&post, p);

	/*
	 * If we are the last non-leader member of the thread
	 * group, and the leader is zombie, then notify the
	 * group leader's parent process. (if it wants notification.)
*/
	zap_leader = 0;
	leader = p->group_leader;
	if (leader != p && thread_group_empty(leader)
			&& leader->exit_state == EXIT_ZOMBIE) {
		/* for pidfs_exit() and do_notify_parent() */
		if (leader->signal->flags & SIGNAL_GROUP_EXIT)
			leader->exit_code = leader->signal->group_exit_code;
		/*
		 * If we were the last child thread and the leader has
		 * exited already, and the leader's parent ignores SIGCHLD,
		 * then we are the one who should release the leader.
		 */
		zap_leader = do_notify_parent(leader, leader->exit_signal);
		if (zap_leader)
			leader->exit_state = EXIT_DEAD;
	}

	write_unlock_irq(&tasklist_lock);
	/* @thread_pid can't go away until free_pids() below */
	proc_flush_pid(thread_pid);
	exit_cred_namespaces(p);
	add_device_randomness(&p->se.sum_exec_runtime,
			      sizeof(p->se.sum_exec_runtime));
	/* Release the pids collected while tasklist_lock was held. */
	free_pids(post.pids);
	release_thread(p);
	/*
	 * This task was already removed from the process/thread/pid lists
	 * and lock_task_sighand(p) can't succeed. Nobody else can touch
	 * ->pending or, if group dead, signal->shared_pending. We can call
	 * flush_sigqueue() lockless.
	 */
	flush_sigqueue(&p->pending);
	if (thread_group_leader(p))
		flush_sigqueue(&p->signal->shared_pending);

	put_task_struct_rcu_user(p);

	/* When the leader must be released as well, run the whole sequence on it. */
	p = leader;
	if (unlikely(zap_leader))
		goto repeat;
}

/*
 * Wake the task currently registered in @w, if any.
 *
 * Returns the result of wake_up_process() for that task, or 0 when no task
 * is registered.
 */
int rcuwait_wake_up(struct rcuwait *w)
{
	int ret = 0;
	struct task_struct *task;

	rcu_read_lock();

	/*
	 * Order condition vs @task, such that everything prior to the load
	 * of @task is visible. This is the condition as to why the user called
	 * rcuwait_wake() in the first place. Pairs with set_current_state()
	 * barrier (A) in rcuwait_wait_event().
	 *
	 *    WAIT                WAKE
	 *    [S] tsk = current	  [S] cond = true
	 *        MB (A)	      MB (B)
	 *    [L] cond		  [L] tsk
	 */
	smp_mb(); /* (B) */

	task = rcu_dereference(w->task);
	if (task)
		ret = wake_up_process(task);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);

/*
 * Determine if a process group is "orphaned", according to the POSIX
 * definition in 2.2.2.52.
Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are * to receive a SIGHUP and a SIGCONT. * * "I ask you, have you ever known what it is to be an orphan?" */ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if ((p == ignored_task) || (p->exit_state && thread_group_empty(p)) || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return 1; } int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; } static bool has_stopped_jobs(struct pid *pgrp) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->signal->flags & SIGNAL_STOP_STOPPED) return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return false; } /* * Check to see if any process groups have become orphaned as * a result of our exiting, and if they have any stopped jobs, * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) { struct pid *pgrp = task_pgrp(tsk); struct task_struct *ignored_task = tsk; if (!parent) /* exit: our father is in a different pgrp than * we are and we were the only connection outside. */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than * we are, and it was the only connection outside. 
*/ ignored_task = NULL; if (task_pgrp(parent) != pgrp && task_session(parent) == task_session(tsk) && will_become_orphaned_pgrp(pgrp, ignored_task) && has_stopped_jobs(pgrp)) { __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } static void coredump_task_exit(struct task_struct *tsk, struct core_state *core_state) { struct core_thread self; self.task = tsk; if (self.task->flags & PF_SIGNALED) self.next = xchg(&core_state->dumper.next, &self); else self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. */ if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); for (;;) { set_current_state(TASK_IDLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; schedule(); } __set_current_state(TASK_RUNNING); } #ifdef CONFIG_MEMCG /* drops tasklist_lock if succeeds */ static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) { bool ret = false; task_lock(tsk); if (likely(tsk->mm == mm)) { /* tsk can't pass exit_mm/exec_mmap and exit */ read_unlock(&tasklist_lock); WRITE_ONCE(mm->owner, tsk); lru_gen_migrate_mm(mm); ret = true; } task_unlock(tsk); return ret; } static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) { struct task_struct *t; for_each_thread(g, t) { struct mm_struct *t_mm = READ_ONCE(t->mm); if (t_mm == mm) { if (__try_to_set_owner(t, mm)) return true; } else if (t_mm) break; } return false; } /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *g, *p = current; /* * If the exiting or execing task is not the owner, it's * someone else's problem. */ if (mm->owner != p) return; /* * The current owner is exiting/execing and there are no other * candidates. Do not leave the mm pointing to a possibly * freed task structure. 
/*
 * Turn us into a lazy TLB process if we
 * aren't already..
 *
 * Detaches the current task from its address space: current->mm is
 * cleared while current->active_mm is left alone, and the mm is then
 * passed to mm_update_next_owner() and mmput().  Tasks whose ->mm is
 * already NULL only get the exit_mm_release() call.
 */
static void exit_mm(void)
{
	struct mm_struct *mm = current->mm;

	/* Called unconditionally, i.e. also when @mm is NULL. */
	exit_mm_release(current, mm);
	if (!mm)
		return;
	/* mmap_lock is held for read across the clearing of current->mm. */
	mmap_read_lock(mm);
	mmgrab_lazy_tlb(mm);
	/* The mm being dropped must be the one this task is running on. */
	BUG_ON(mm != current->active_mm);
	/* more a memory barrier than a real lock */
	task_lock(current);
	/*
	 * When a thread stops operating on an address space, the loop
	 * in membarrier_private_expedited() may not observe that
	 * tsk->mm, and the loop in membarrier_global_expedited() may
	 * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
	 * rq->membarrier_state, so those would not issue an IPI.
	 * Membarrier requires a memory barrier after accessing
	 * user-space memory, before clearing tsk->mm or the
	 * rq->membarrier_state.
	 */
	smp_mb__after_spinlock();
	/*
	 * Clearing ->mm, updating the membarrier state and entering lazy
	 * TLB mode are done as one step with interrupts disabled.
	 */
	local_irq_disable();
	current->mm = NULL;
	membarrier_update_current_mm(NULL);
	enter_lazy_tlb(mm, current);
	local_irq_enable();
	task_unlock(current);
	mmap_read_unlock(mm);
	/* Both calls run only after every lock taken above is released. */
	mm_update_next_owner(mm);
	mmput(mm);
	/* A task flagged TIF_MEMDIE reports that it has let go of its mm. */
	if (test_thread_flag(TIF_MEMDIE))
		exit_oom_victim();
}
* We check pid->level, this is slightly more efficient than * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ for (reaper = father->real_parent; task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; thread = find_alive_thread(reaper); if (thread) return thread; } } return child_reaper; } /* * Any that need to be release_task'd are put on the @dead list. */ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } /* * Make init inherit all the child processes */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) { struct task_struct *p, *t, *reaper; if (unlikely(!list_empty(&father->ptraced))) exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { RCU_INIT_POINTER(t->real_parent, reaper); BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t, PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. 
*/ if (!same_thread_group(reaper, father)) reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); } /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; struct task_struct *p, *n; LIST_HEAD(dead); write_lock_irq(&tasklist_lock); forget_original_parent(tsk, &dead); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && !ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD; autoreap = do_notify_parent(tsk, sig); } else if (thread_group_leader(tsk)) { autoreap = thread_group_empty(tsk) && do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; /* untraced sub-thread */ do_notify_pidfd(tsk); } if (autoreap) { tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } } #ifdef CONFIG_DEBUG_STACK_USAGE #ifdef CONFIG_STACK_GROWSUP unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ n--; } while (!*n); return (unsigned long)end_of_stack(p) - (unsigned long)n; } #else /* !CONFIG_STACK_GROWSUP */ unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ n++; } while (!*n); return (unsigned long)n - (unsigned long)end_of_stack(p); } #endif /* CONFIG_STACK_GROWSUP */ /* Count the maximum pages reached in kernel stacks */ static inline void kstack_histogram(unsigned long used_stack) { #ifdef CONFIG_VM_EVENT_COUNTERS if (used_stack <= 1024) 
count_vm_event(KSTACK_1K); #if THREAD_SIZE > 1024 else if (used_stack <= 2048) count_vm_event(KSTACK_2K); #endif #if THREAD_SIZE > 2048 else if (used_stack <= 4096) count_vm_event(KSTACK_4K); #endif #if THREAD_SIZE > 4096 else if (used_stack <= 8192) count_vm_event(KSTACK_8K); #endif #if THREAD_SIZE > 8192 else if (used_stack <= 16384) count_vm_event(KSTACK_16K); #endif #if THREAD_SIZE > 16384 else if (used_stack <= 32768) count_vm_event(KSTACK_32K); #endif #if THREAD_SIZE > 32768 else if (used_stack <= 65536) count_vm_event(KSTACK_64K); #endif #if THREAD_SIZE > 65536 else count_vm_event(KSTACK_REST); #endif #endif /* CONFIG_VM_EVENT_COUNTERS */ } static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; free = stack_not_used(current); kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); } #else /* !CONFIG_DEBUG_STACK_USAGE */ static inline void check_stack_usage(void) {} #endif /* CONFIG_DEBUG_STACK_USAGE */ static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; struct core_state *core_state; spin_lock_irq(&sighand->siglock); signal->quick_threads--; if ((signal->quick_threads == 0) && !(signal->flags & SIGNAL_GROUP_EXIT)) { signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = code; signal->group_stop_count = 0; } /* * Serialize with any possible pending coredump. * We must hold siglock around checking core_state * and setting PF_POSTCOREDUMP. The core-inducing thread * will increment ->nr_threads for each thread in the * group without PF_POSTCOREDUMP set. 
*/ tsk->flags |= PF_POSTCOREDUMP; core_state = signal->core_state; spin_unlock_irq(&sighand->siglock); if (unlikely(core_state)) coredump_task_exit(tsk, core_state); } void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; WARN_ON(irqs_disabled()); WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); synchronize_group_exit(tsk, code); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); io_uring_files_cancel(); sched_mm_cid_exit(tsk); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { /* * If the last thread of global init has exited, panic * immediately to get a useable coredump. */ if (unlikely(is_global_init(tsk))) panic("Attempted to kill init! exitcode=0x%08x\n", tsk->signal->group_exit_code ?: (int)code); #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); trace_sched_process_exit(tsk, group_dead); /* * Since sampling can touch ->mm, make sure to stop everything before we * tear it down. * * Also flushes inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ perf_event_exit_task(tsk); /* * PF_EXITING (above) ensures unwind_deferred_request() will no * longer add new unwinds. While exit_mm() (below) will destroy the * abaility to do unwinds. So flush any pending unwinds here. 
*/ unwind_deferred_task_exit(tsk); exit_mm(); if (group_dead) acct_process(); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); exit_fs(tsk); if (group_dead) disassociate_ctty(1); exit_nsproxy_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); sched_autogroup_exit_task(tsk); cgroup_task_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(); if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); exit_tasks_rcu_finish(); lockdep_free_task(tsk); do_task_dead(); } void __noreturn make_task_dead(int signr) { /* * Take the task off the cpu after something catastrophic has * happened. * * We can get here from a kernel oops, sometimes with preemption off. * Start by checking for critical errors. * Then fix up important state like USER_DS and preemption. * Then do everything else. 
*/ struct task_struct *tsk = current; unsigned int limit; if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); if (unlikely(irqs_disabled())) { pr_info("note: %s[%d] exited with irqs disabled\n", current->comm, task_pid_nr(current)); local_irq_enable(); } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); preempt_count_set(PREEMPT_ENABLED); } /* * Every time the system oopses, if the oops happens while a reference * to an object was held, the reference leaks. * If the oops doesn't also leak memory, repeated oopsing can cause * reference counters to wrap around (if they're not using refcount_t). * This means that repeated oopsing can make unexploitable-looking bugs * exploitable through repeated oopsing. * To make sure this can't happen, place an upper bound on how often the * kernel may oops without panic(). */ limit = READ_ONCE(oops_limit); if (atomic_inc_return(&oops_count) >= limit && limit) panic("Oopsed too often (kernel.oops_limit is %d)", limit); /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); do_task_dead(); } do_exit(signr); } SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). 
*/ void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); } do_exit(exit_code); /* NOTREACHED */ } /* * this kills every thread in the thread group. Note that any externally * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ return 0; } static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; } static int eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; /* * Wait for all children (clone and not) if __WALL is set or * if it is traced by us. */ if (ptrace || (wo->wo_flags & __WALL)) return 1; /* * Otherwise, wait for clone children *only* if __WCLONE is set; * otherwise, wait for non-clone children *only*. * * Note: a "clone" child here is one that reports to its parent * using a signal other than SIGCHLD, or a non-leader thread which * we can only see if it is traced by us. */ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; } /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. 
/*
 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 *
 * @wo: wait options; ->wo_flags selects the behaviour, results are stored
 *      through ->wo_stat, ->wo_info and ->wo_rusage
 * @p:  the zombie task under consideration
 *
 * The nonzero return value is @p's pid as returned by task_pid_vnr().
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
	int state, status;
	/* Sampled up front: @p may be released before they are reported. */
	pid_t pid = task_pid_vnr(p);
	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
	struct waitid_info *infop;

	if (!likely(wo->wo_flags & WEXITED))
		return 0;

	/*
	 * WNOWAIT: report the zombie without reaping it.  A task reference
	 * keeps @p usable for getrusage() once tasklist_lock is dropped.
	 * Note that ->wo_stat is not written on this path; the status only
	 * reaches the caller through ->wo_info.
	 */
	if (unlikely(wo->wo_flags & WNOWAIT)) {
		status = (p->signal->flags & SIGNAL_GROUP_EXIT)
			? p->signal->group_exit_code : p->exit_code;
		get_task_struct(p);
		read_unlock(&tasklist_lock);
		sched_annotate_sleep();
		if (wo->wo_rusage)
			getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
		put_task_struct(p);
		goto out_info;
	}
	/*
	 * Move the task's state to DEAD/TRACE, only one thread can do this.
	 */
	state = (ptrace_reparented(p) && thread_group_leader(p)) ?
		EXIT_TRACE : EXIT_DEAD;
	/* Losing the cmpxchg race means another waiter claimed @p. */
	if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
		return 0;
	/*
	 * We own this thread, nobody else can reap it.
	 */
	read_unlock(&tasklist_lock);
	sched_annotate_sleep();

	/*
	 * Check thread_group_leader() to exclude the traced sub-threads.
	 */
	if (state == EXIT_DEAD && thread_group_leader(p)) {
		struct signal_struct *sig = p->signal;
		struct signal_struct *psig = current->signal;
		unsigned long maxrss;
		u64 tgutime, tgstime;

		/*
		 * The resource counters for the group leader are in its
		 * own task_struct.  Those for dead threads in the group
		 * are in its signal_struct, as are those for the child
		 * processes it has previously reaped.  All these
		 * accumulate in the parent's signal_struct c* fields.
		 *
		 * We don't bother to take a lock here to protect these
		 * p->signal fields because the whole thread group is dead
		 * and nobody can change them.
		 *
		 * psig->stats_lock also protects us from our sub-threads
		 * which can reap other children at the same time.
		 *
		 * We use thread_group_cputime_adjusted() to get times for
		 * the thread group, which consolidates times for all threads
		 * in the group including the group leader.
		 */
		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
		write_seqlock_irq(&psig->stats_lock);
		psig->cutime += tgutime + sig->cutime;
		psig->cstime += tgstime + sig->cstime;
		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
		psig->cmin_flt +=
			p->min_flt + sig->min_flt + sig->cmin_flt;
		psig->cmaj_flt +=
			p->maj_flt + sig->maj_flt + sig->cmaj_flt;
		psig->cnvcsw +=
			p->nvcsw + sig->nvcsw + sig->cnvcsw;
		psig->cnivcsw +=
			p->nivcsw + sig->nivcsw + sig->cnivcsw;
		psig->cinblock +=
			task_io_get_inblock(p) +
			sig->inblock + sig->cinblock;
		psig->coublock +=
			task_io_get_oublock(p) +
			sig->oublock + sig->coublock;
		/* cmaxrss is a running maximum, not a sum. */
		maxrss = max(sig->maxrss, sig->cmaxrss);
		if (psig->cmaxrss < maxrss)
			psig->cmaxrss = maxrss;
		task_io_accounting_add(&psig->ioac, &p->ioac);
		task_io_accounting_add(&psig->ioac, &sig->ioac);
		write_sequnlock_irq(&psig->stats_lock);
	}

	if (wo->wo_rusage)
		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
	/* A recorded group exit code takes precedence over @p's own code. */
	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
		? p->signal->group_exit_code : p->exit_code;
	wo->wo_stat = status;

	if (state == EXIT_TRACE) {
		write_lock_irq(&tasklist_lock);
		/* We dropped tasklist, ptracer could die and untrace */
		ptrace_unlink(p);

		/* If parent wants a zombie, don't release it now */
		state = EXIT_ZOMBIE;
		if (do_notify_parent(p, p->exit_signal))
			state = EXIT_DEAD;
		p->exit_state = state;
		write_unlock_irq(&tasklist_lock);
	}
	if (state == EXIT_DEAD)
		release_task(p);

out_info:
	infop = wo->wo_info;
	if (infop) {
		/*
		 * Low 7 bits clear: report CLD_EXITED with the status shifted
		 * down by 8.  Otherwise the low 7 bits are reported as the
		 * status and bit 0x80 selects CLD_DUMPED over CLD_KILLED.
		 */
		if ((status & 0x7f) == 0) {
			infop->cause = CLD_EXITED;
			infop->status = status >> 8;
		} else {
			infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
			infop->status = status & 0x7f;
		}
		infop->pid = pid;
		infop->uid = uid;
	}

	return pid;
}
We don't want to * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ get_task_struct(p); pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); if (likely(!(wo->wo_flags & WNOWAIT))) wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { infop->cause = why; infop->status = exit_code; infop->pid = pid; infop->uid = uid; } return pid; } /* * Handle do_wait work for one task in a live, non-stopped state. * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { struct waitid_info *infop; pid_t pid; uid_t uid; if (!unlikely(wo->wo_flags & WCONTINUED)) return 0; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; spin_lock_irq(&p->sighand->siglock); /* Re-check with the lock held. */ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { spin_unlock_irq(&p->sighand->siglock); return 0; } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; if (!infop) { wo->wo_stat = 0xffff; } else { infop->cause = CLD_CONTINUED; infop->pid = pid; infop->uid = uid; infop->status = SIGCONT; } return pid; } /* * Consider @p for a wait by @parent. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. 
/*
 * Consider @p for a wait by the caller.
 *
 * @wo:     wait options; ->notask_error is updated as described below
 * @ptrace: nonzero when @p is examined as a tracee of the caller, zero
 *          when it is examined as a natural child
 * @p:      the candidate task
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue;
 * then ->notask_error is 0 if @p is an eligible child,
 * or still -ECHILD.
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
				struct task_struct *p)
{
	/*
	 * We can race with wait_task_zombie() from another thread.
	 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
	 * can't confuse the checks below.
	 */
	int exit_state = READ_ONCE(p->exit_state);
	int ret;

	/* Already claimed for release: nothing left to report. */
	if (unlikely(exit_state == EXIT_DEAD))
		return 0;

	ret = eligible_child(wo, ptrace, p);
	if (!ret)
		return ret;

	if (unlikely(exit_state == EXIT_TRACE)) {
		/*
		 * ptrace == 0 means we are the natural parent. In this case
		 * we should clear notask_error, debugger will notify us.
		 */
		if (likely(!ptrace))
			wo->notask_error = 0;
		return 0;
	}

	if (likely(!ptrace) && unlikely(p->ptrace)) {
		/*
		 * If it is traced by its real parent's group, just pretend
		 * the caller is ptrace_do_wait() and reap this child if it
		 * is zombie.
		 *
		 * This also hides group stop state from real parent; otherwise
		 * a single stop can be reported twice as group and ptrace stop.
		 * If a ptracer wants to distinguish these two events for its
		 * own children it should create a separate process which takes
		 * the role of real parent.
		 */
		if (!ptrace_reparented(p))
			ptrace = 1;
	}

	/* slay zombie? */
	if (exit_state == EXIT_ZOMBIE) {
		/* we don't reap group leaders with subthreads */
		if (!delay_group_leader(p)) {
			/*
			 * A zombie ptracee is only visible to its ptracer.
			 * Notification and reaping will be cascaded to the
			 * real parent when the ptracer detaches.
			 */
			if (unlikely(ptrace) || likely(!p->ptrace))
				return wait_task_zombie(wo, p);
		}

		/*
		 * Allow access to stopped/continued state via zombie by
		 * falling through.  Clearing of notask_error is complex.
		 *
		 * When !@ptrace:
		 *
		 * If WEXITED is set, notask_error should naturally be
		 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
		 * so, if there are live subthreads, there are events to
		 * wait for.  If all subthreads are dead, it's still safe
		 * to clear - this function will be called again in a finite
		 * amount of time once all the subthreads are released and
		 * will then return without clearing.
		 *
		 * When @ptrace:
		 *
		 * Stopped state is per-task and thus can't change once the
		 * target task dies.  Only continued and exited can happen.
		 * Clear notask_error if WCONTINUED | WEXITED.
		 */
		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
			wo->notask_error = 0;
	} else {
		/*
		 * @p is alive and it's gonna stop, continue or exit, so
		 * there always is something to wait for.
		 */
		wo->notask_error = 0;
	}

	/*
	 * Wait for stopped.  Depending on @ptrace, different stopped state
	 * is used and the two don't interact with each other.
	 */
	ret = wait_task_stopped(wo, ptrace, p);
	if (ret)
		return ret;

	/*
	 * Wait for continued.  There's only one continued state and the
	 * ptracer can consume it which can confuse the real parent.  Don't
	 * use WCONTINUED from ptracer.  You don't need or want it.
	 */
	return wait_task_continued(wo, p);
}
*/ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } return 0; } static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } return 0; } bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) { if (!eligible_pid(wo, p)) return false; if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) return false; return true; } static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct task_struct *p = key; if (pid_child_should_wake(wo, p)) return default_wake_function(wait, mode, sync, key); return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, TASK_INTERRUPTIBLE, p); } static bool is_effectively_child(struct wait_opts *wo, bool ptrace, struct task_struct *target) { struct task_struct *parent = !ptrace ? target->real_parent : target->parent; return current == parent || (!(wo->wo_flags & __WNOTHREAD) && same_thread_group(current, parent)); } /* * Optimization for waiting on PIDTYPE_PID. No need to iterate through child * and tracee lists to find the target task. 
*/
static int do_wait_pid(struct wait_opts *wo)
{
	bool ptrace;
	struct task_struct *target;
	int retval;

	/* First pass: resolve the pid as a thread-group id (non-ptrace case). */
	ptrace = false;
	target = pid_task(wo->wo_pid, PIDTYPE_TGID);
	if (target && is_effectively_child(wo, ptrace, target)) {
		retval = wait_consider_task(wo, ptrace, target);
		if (retval)
			return retval;
	}

	/* Second pass: resolve it as a single task and require ->ptrace. */
	ptrace = true;
	target = pid_task(wo->wo_pid, PIDTYPE_PID);
	if (target && target->ptrace && is_effectively_child(wo, ptrace, target)) {
		retval = wait_consider_task(wo, ptrace, target);
		if (retval)
			return retval;
	}

	return 0;
}

/*
 * Perform one scan for a task matching @wo.
 *
 * Returns the first nonzero result of do_wait_pid(), do_wait_thread() or
 * ptrace_do_wait(). Otherwise returns wo->notask_error, except that a zero
 * notask_error without WNOHANG becomes -ERESTARTSYS, which do_wait() treats
 * as "sleep and scan again".
 *
 * NOTE(review): the nonzero-return paths leave without read_unlock();
 * presumably wait_consider_task() has already dropped tasklist_lock whenever
 * it returns nonzero -- confirm against its definition.
 */
long __do_wait(struct wait_opts *wo)
{
	long retval;

	/*
	 * If there is nothing that can match our criteria, just get out.
	 * We will clear ->notask_error to zero if we see any child that
	 * might later match our criteria, even if we are not able to reap
	 * it yet.
	 */
	wo->notask_error = -ECHILD;
	if ((wo->wo_type < PIDTYPE_MAX) &&
	   (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
		goto notask;

	read_lock(&tasklist_lock);

	if (wo->wo_type == PIDTYPE_PID) {
		retval = do_wait_pid(wo);
		if (retval)
			return retval;
	} else {
		struct task_struct *tsk = current;

		/* Visit current first, then every other thread in its group. */
		do {
			retval = do_wait_thread(wo, tsk);
			if (retval)
				return retval;

			retval = ptrace_do_wait(wo, tsk);
			if (retval)
				return retval;

			/* __WNOTHREAD: only the calling thread's lists count. */
			if (wo->wo_flags & __WNOTHREAD)
				break;
		} while_each_thread(current, tsk);
	}
	read_unlock(&tasklist_lock);

notask:
	retval = wo->notask_error;
	if (!retval && !(wo->wo_flags & WNOHANG))
		return -ERESTARTSYS;

	return retval;
}

/*
 * Blocking wait: queue wo->child_wait (with child_wait_callback as the wake
 * function) on current's wait_chldexit queue and repeat __do_wait() until it
 * returns something other than -ERESTARTSYS or a signal is pending.
 */
static long do_wait(struct wait_opts *wo)
{
	int retval;

	trace_sched_process_wait(wo->wo_pid);

	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
	/* pid_child_should_wake() compares ->private with the child's parent */
	wo->child_wait.private = current;
	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);

	do {
		/* Set the task state before scanning, not after. */
		set_current_state(TASK_INTERRUPTIBLE);
		retval = __do_wait(wo);
		if (retval != -ERESTARTSYS)
			break;
		/* leaves with retval == -ERESTARTSYS */
		if (signal_pending(current))
			break;
		schedule();
	} while (1);

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
	return retval;
}

/*
 * Validate waitid() arguments and fill in @wo.
 *
 * @which/@upid select the pid type and the struct pid to wait for; @options
 * must contain only the flags accepted below and at least one of
 * WEXITED/WSTOPPED/WCONTINUED. Returns 0 on success, -EINVAL for bad
 * arguments, or the error encoded by pidfd_get_pid() for P_PIDFD.
 *
 * On success wo->wo_pid may be non-NULL; kernel_waitid() releases it with
 * put_pid() after waiting.
 */
int
kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
		      struct waitid_info *infop, int options,
		      struct rusage *ru)
{
	unsigned int f_flags = 0;
	struct pid *pid = NULL;
	enum pid_type type;

	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
			__WNOTHREAD|__WCLONE|__WALL))
		return -EINVAL;
	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
		return -EINVAL;

	switch (which) {
	case P_ALL:
		/* PIDTYPE_MAX: no pid filter (see the check in __do_wait()) */
		type = PIDTYPE_MAX;
		break;
	case P_PID:
		type = PIDTYPE_PID;
		if (upid <= 0)
			return -EINVAL;

		pid = find_get_pid(upid);
		break;
	case P_PGID:
		type = PIDTYPE_PGID;
		if (upid < 0)
			return -EINVAL;

		/* upid == 0 selects the caller's own process group */
		if (upid)
			pid = find_get_pid(upid);
		else
			pid = get_task_pid(current, PIDTYPE_PGID);
		break;
	case P_PIDFD:
		type = PIDTYPE_PID;
		if (upid < 0)
			return -EINVAL;

		/* here @upid is a file descriptor; f_flags receives its flags */
		pid = pidfd_get_pid(upid, &f_flags);
		if (IS_ERR(pid))
			return PTR_ERR(pid);

		break;
	default:
		return -EINVAL;
	}

	wo->wo_type	= type;
	wo->wo_pid	= pid;
	wo->wo_flags	= options;
	wo->wo_info	= infop;
	wo->wo_rusage	= ru;
	/* An O_NONBLOCK pidfd behaves as if WNOHANG had been passed. */
	if (f_flags & O_NONBLOCK)
		wo->wo_flags |= WNOHANG;

	return 0;
}

/*
 * waitid() backend: prepare wait_opts, wait, and drop the pid.
 * Returns do_wait()'s result, or a negative errno from preparation.
 */
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
			  int options, struct rusage *ru)
{
	struct wait_opts wo;
	long ret;

	ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
	if (ret)
		return ret;

	ret = do_wait(&wo);
	/*
	 * WNOHANG was not requested by the caller but was added because the
	 * pidfd is O_NONBLOCK: report "nothing ready" as -EAGAIN.
	 */
	if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
		ret = -EAGAIN;

	put_pid(wo.wo_pid);
	return ret;
}

/*
 * waitid(2): a positive kernel_waitid() result is reported to user space as
 * return value 0 with si_signo = SIGCHLD; otherwise si_signo is 0. When
 * @infop is non-NULL the siginfo fields are written even when err <= 0.
 */
SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
		infop, int, options, struct rusage __user *, ru)
{
	struct rusage r;
	struct waitid_info info = {.status = 0};
	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
	int signo = 0;

	if (err > 0) {
		signo = SIGCHLD;
		err = 0;
		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
			return -EFAULT;
	}
	if (!infop)
		return err;

	if (!user_write_access_begin(infop, sizeof(*infop)))
		return -EFAULT;

	/* each unsafe_put_user() jumps to Efault on failure */
	unsafe_put_user(signo, &infop->si_signo, Efault);
	unsafe_put_user(0, &infop->si_errno, Efault);
	unsafe_put_user(info.cause, &infop->si_code, Efault);
	unsafe_put_user(info.pid, &infop->si_pid, Efault);
	unsafe_put_user(info.uid, &infop->si_uid, Efault);
	unsafe_put_user(info.status, &infop->si_status, Efault);
	user_write_access_end();
	return err;
Efault:
	user_write_access_end();
	return -EFAULT;
}

/*
 * wait4() backend.
 *
 * @upid selects the target: -1 any child, < -1 process group -upid,
 * 0 the caller's process group, > 0 that pid. WEXITED is always added to
 * @options. On a positive result the status word is copied to @stat_addr
 * if that pointer is non-NULL.
 */
long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
		  struct rusage *ru)
{
	struct wait_opts wo;
	struct pid *pid = NULL;
	enum pid_type type;
	long ret;

	if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
			__WNOTHREAD|__WCLONE|__WALL))
		return -EINVAL;

	/* -INT_MIN is not defined */
	if (upid == INT_MIN)
		return -ESRCH;

	if (upid == -1)
		type = PIDTYPE_MAX;
	else if (upid < 0) {
		type = PIDTYPE_PGID;
		pid = find_get_pid(-upid);
	} else if (upid == 0) {
		type = PIDTYPE_PGID;
		pid = get_task_pid(current, PIDTYPE_PGID);
	} else /* upid > 0 */ {
		type = PIDTYPE_PID;
		pid = find_get_pid(upid);
	}

	wo.wo_type	= type;
	wo.wo_pid	= pid;
	wo.wo_flags	= options | WEXITED;
	wo.wo_info	= NULL;
	wo.wo_stat	= 0;
	wo.wo_rusage	= ru;
	ret = do_wait(&wo);
	put_pid(pid);
	if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
		ret = -EFAULT;

	return ret;
}

/*
 * In-kernel wait for exit of @pid (WEXITED only).
 * *@stat is written only when do_wait() returned > 0 and the recorded
 * status is nonzero; otherwise it is left untouched.
 */
int kernel_wait(pid_t pid, int *stat)
{
	struct wait_opts wo = {
		.wo_type	= PIDTYPE_PID,
		.wo_pid		= find_get_pid(pid),
		.wo_flags	= WEXITED,
	};
	int ret;

	ret = do_wait(&wo);
	if (ret > 0 && wo.wo_stat)
		*stat = wo.wo_stat;
	put_pid(wo.wo_pid);
	return ret;
}

/* wait4(2): rusage is gathered in a kernel copy and copied out on success. */
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
		int, options, struct rusage __user *, ru)
{
	struct rusage r;
	long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);

	if (err > 0) {
		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
			return -EFAULT;
	}
	return err;
}

#ifdef __ARCH_WANT_SYS_WAITPID

/*
 * sys_waitpid() remains for compatibility. waitpid() should be
 * implemented by calling sys_wait4() from libc.a.
 */
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
	return kernel_wait4(pid, stat_addr, options, NULL);
}

#endif

#ifdef CONFIG_COMPAT
/* Compat wait4(2): same as wait4 but rusage goes out via put_compat_rusage(). */
COMPAT_SYSCALL_DEFINE4(wait4,
	compat_pid_t, pid,
	compat_uint_t __user *, stat_addr,
	int, options,
	struct compat_rusage __user *, ru)
{
	struct rusage r;
	long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
	if (err > 0) {
		if (ru && put_compat_rusage(&r, ru))
			return -EFAULT;
	}
	return err;
}

/*
 * Compat waitid(2): mirrors the native syscall; rusage is copied raw when
 * COMPAT_USE_64BIT_TIME is set and converted otherwise.
 */
COMPAT_SYSCALL_DEFINE5(waitid,
		int, which, compat_pid_t, pid,
		struct compat_siginfo __user *, infop, int, options,
		struct compat_rusage __user *, uru)
{
	struct rusage ru;
	struct waitid_info info = {.status = 0};
	long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
	int signo = 0;
	if (err > 0) {
		signo = SIGCHLD;
		err = 0;
		if (uru) {
			/* kernel_waitid() overwrites everything in ru */
			if (COMPAT_USE_64BIT_TIME)
				err = copy_to_user(uru, &ru, sizeof(ru));
			else
				err = put_compat_rusage(&ru, uru);
			if (err)
				return -EFAULT;
		}
	}

	if (!infop)
		return err;

	if (!user_write_access_begin(infop, sizeof(*infop)))
		return -EFAULT;

	/* each unsafe_put_user() jumps to Efault on failure */
	unsafe_put_user(signo, &infop->si_signo, Efault);
	unsafe_put_user(0, &infop->si_errno, Efault);
	unsafe_put_user(info.cause, &infop->si_code, Efault);
	unsafe_put_user(info.pid, &infop->si_pid, Efault);
	unsafe_put_user(info.uid, &infop->si_uid, Efault);
	unsafe_put_user(info.status, &infop->si_status, Efault);
	user_write_access_end();
	return err;
Efault:
	user_write_access_end();
	return -EFAULT;
}
#endif

/*
 * This needs to be __function_aligned as GCC implicitly makes any
 * implementation of abort() cold and drops alignment specified by
 * -falign-functions=N.
*
 * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
 *
 * This __weak definition calls BUG() and, should that return, panic().
 */
__weak __function_aligned void abort(void)
{
	BUG();

	/* if that doesn't kill us, halt */
	panic("Oops failed to kill thread");
}
EXPORT_SYMBOL(abort);
91 91 91 1 122 2448 12 4 11 23 17 14 282 15 297 7 242 72 1 72 72 48 1 54 59 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SIGNAL_H
#define _LINUX_SCHED_SIGNAL_H

#include <linux/rculist.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
#include <linux/pid.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>

#include <asm/ptrace.h>

/*
 * Types defining task->signal and task->sighand and APIs using them:
 */

/*
 * Signal-handler state reached through task->sighand.
 */
struct sighand_struct {
	/* taken by the helpers in this header around dequeue_signal() and jobctl updates */
	spinlock_t		siglock;
	refcount_t		count;
	wait_queue_head_t	signalfd_wqh;
	/* one k_sigaction per signal; the array is sized by _NSIG */
	struct k_sigaction	action[_NSIG];
};

/*
 * Per-process accounting stats:
 */
struct pacct_struct {
	int			ac_flag;
	long			ac_exitcode;
	unsigned long		ac_mem;
	u64			ac_utime, ac_stime;
	unsigned long		ac_minflt, ac_majflt;
};

/*
 * One entry of signal_struct::it[] (the ITIMER_PROF / ITIMER_VIRTUAL timers).
 * NOTE(review): presumably @expires is the next expiry and @incr the reload
 * interval -- confirm against the itimer code.
 */
struct cpu_itimer {
	u64 expires;
	u64 incr;
};

/*
 * This is the atomic variant of task_cputime, which can be used for
 * storing and
updating task_cputime statistics without locking.
 */
struct task_cputime_atomic {
	atomic64_t utime;
	atomic64_t stime;
	atomic64_t sum_exec_runtime;
};

/* Compound-literal initializer that sets all three counters to zero. */
#define INIT_CPUTIME_ATOMIC \
	(struct task_cputime_atomic) {				\
		.utime = ATOMIC64_INIT(0),			\
		.stime = ATOMIC64_INIT(0),			\
		.sum_exec_runtime = ATOMIC64_INIT(0),		\
	}
/**
 * struct thread_group_cputimer - thread group interval timer counts
 * @cputime_atomic:	atomic thread group interval timers.
 *
 * This structure contains the version of task_cputime, above, that is
 * used for thread group CPU timer calculations.
 */
struct thread_group_cputimer {
	struct task_cputime_atomic cputime_atomic;
};

/*
 * A signal set with hlist linkage.
 * NOTE(review): presumably chained on signal_struct::multiprocess through
 * @node -- confirm against the fork code.
 */
struct multiprocess_signals {
	sigset_t signal;
	struct hlist_node node;
};

/* A task in a singly linked chain (via @next) of core_thread entries. */
struct core_thread {
	struct task_struct *task;
	struct core_thread *next;
};

/* Coredump bookkeeping, pointed to by signal_struct::core_state. */
struct core_state {
	atomic_t nr_threads;
	struct core_thread dumper;
	struct completion startup;
};

/*
 * NOTE! "signal_struct" does not have its own
 * locking, because a shared signal_struct always
 * implies a shared sighand_struct, so locking
 * sighand_struct is always a proper superset of
 * the locking of signal_struct.
*/
struct signal_struct {
	refcount_t		sigcnt;
	atomic_t		live;
	/* returned by get_nr_threads() */
	int			nr_threads;
	int			quick_threads;
	/* list walked through task_struct::thread_node (see __for_each_thread) */
	struct list_head	thread_head;

	wait_queue_head_t	wait_chldexit;	/* for wait4() */

	/* current thread group signal load-balancing target: */
	struct task_struct	*curr_target;

	/* shared signal handling: */
	struct sigpending	shared_pending;

	/* For collecting multiprocess signals during fork */
	struct hlist_head	multiprocess;

	/* thread group exit support */
	int			group_exit_code;
	/* notify group_exec_task when notify_count is less or equal to 0 */
	int			notify_count;
	struct task_struct	*group_exec_task;

	/* thread group stop support, overloads group_exit_code too */
	int			group_stop_count;
	unsigned int		flags; /* see SIGNAL_* flags below */

	struct core_state *core_state; /* coredumping support */

	/*
	 * PR_SET_CHILD_SUBREAPER marks a process, like a service
	 * manager, to re-parent orphan (double-forking) child processes
	 * to this process instead of 'init'. The service manager is
	 * able to receive SIGCHLD signals and is able to investigate
	 * the process until it calls wait(). All children of this
	 * process will inherit a flag if they should look for a
	 * child_subreaper process at exit.
	 */
	unsigned int		is_child_subreaper:1;
	unsigned int		has_child_subreaper:1;

#ifdef CONFIG_POSIX_TIMERS

	/* POSIX.1b Interval Timers */
	unsigned int		timer_create_restore_ids:1;
	atomic_t		next_posix_timer_id;
	struct hlist_head	posix_timers;
	struct hlist_head	ignored_posix_timers;

	/* ITIMER_REAL timer for the process */
	struct hrtimer real_timer;
	ktime_t it_real_incr;

	/*
	 * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
	 * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
	 * values are defined to 0 and 1 respectively
	 */
	struct cpu_itimer it[2];

	/*
	 * Thread group totals for process CPU timers.
	 * See thread_group_cputimer(), et al, for details.
	 */
	struct thread_group_cputimer cputimer;

#endif
	/* Empty if CONFIG_POSIX_TIMERS=n */
	struct posix_cputimers posix_cputimers;

	/* PID/PID hash table linkage. */
	/* indexed by enum pid_type (see task_tgid(), task_pgrp(), task_session()) */
	struct pid *pids[PIDTYPE_MAX];

#ifdef CONFIG_NO_HZ_FULL
	atomic_t tick_dep_mask;
#endif

	struct pid *tty_old_pgrp;

	/* boolean value for session group leader */
	int leader;

	struct tty_struct *tty; /* NULL if no tty */

#ifdef CONFIG_SCHED_AUTOGROUP
	struct autogroup *autogroup;
#endif
	/*
	 * Cumulative resource counters for dead threads in the group,
	 * and for reaped dead child processes forked by this group.
	 * Live threads maintain their own counters and add to these
	 * in __exit_signal, except for the group leader.
	 */
	seqlock_t stats_lock;
	u64 utime, stime, cutime, cstime;
	u64 gtime;
	u64 cgtime;
	struct prev_cputime prev_cputime;
	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
	unsigned long inblock, oublock, cinblock, coublock;
	unsigned long maxrss, cmaxrss;
	struct task_io_accounting ioac;

	/*
	 * Cumulative ns of schedule CPU time for dead threads in the
	 * group, not including a zombie group leader, (This only differs
	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
	 * other than jiffies.)
	 */
	unsigned long long sum_sched_runtime;

	/*
	 * We don't bother to synchronize most readers of this at all,
	 * because there is no reader checking a limit that actually needs
	 * to get both rlim_cur and rlim_max atomically, and either one
	 * alone is a single word that can safely be read normally.
	 * getrlimit/setrlimit use task_lock(current->group_leader) to
	 * protect this instead of the siglock, because they really
	 * have no need to disable irqs.
	 */
	/* read with READ_ONCE() by task_rlimit() and task_rlimit_max() */
	struct rlimit rlim[RLIM_NLIMITS];

#ifdef CONFIG_BSD_PROCESS_ACCT
	struct pacct_struct pacct;	/* per-process accounting information */
#endif
#ifdef CONFIG_TASKSTATS
	struct taskstats *stats;
#endif
#ifdef CONFIG_AUDIT
	unsigned audit_tty;
	struct tty_audit_buf *tty_audit_buf;
#endif

#ifdef CONFIG_CGROUPS
	struct rw_semaphore cgroup_threadgroup_rwsem;
#endif

	/*
	 * Thread is the potential origin of an oom condition; kill first on
	 * oom
	 */
	bool oom_flag_origin;
	short oom_score_adj;		/* OOM kill score adjustment */
	short oom_score_adj_min;	/* OOM kill score adjustment min value.
					 * Only settable by CAP_SYS_RESOURCE. */
	struct mm_struct *oom_mm;	/* recorded mm when the thread group got
					 * killed by the oom killer */

	struct mutex cred_guard_mutex;	/* guard against foreign influences on
					 * credential calculations
					 * (notably ptrace)
					 * Deprecated do not use in new code.
					 * Use exec_update_lock instead.
					 */
	struct rw_semaphore exec_update_lock;	/* Held while task_struct is
						 * being updated during exec,
						 * and may have inconsistent
						 * permissions.
						 */
} __randomize_layout;

/*
 * Bits in flags field of signal_struct.
 */
#define SIGNAL_STOP_STOPPED	0x00000001 /* job control stop in effect */
#define SIGNAL_STOP_CONTINUED	0x00000002 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT	0x00000004 /* group exit in progress */
/*
 * Pending notifications to parent.
*/
#define SIGNAL_CLD_STOPPED	0x00000010
#define SIGNAL_CLD_CONTINUED	0x00000020
#define SIGNAL_CLD_MASK		(SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)

#define SIGNAL_UNKILLABLE	0x00000040 /* for init: ignore fatal signals */

/* Every bit that signal_set_stop_flags() replaces. */
#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \
			  SIGNAL_STOP_CONTINUED)

/*
 * Replace the SIGNAL_STOP_MASK bits of sig->flags with @flags, leaving all
 * other bits unchanged. Warns if SIGNAL_GROUP_EXIT is already set.
 */
static inline void signal_set_stop_flags(struct signal_struct *sig,
					 unsigned int flags)
{
	WARN_ON(sig->flags & SIGNAL_GROUP_EXIT);
	sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags;
}

extern void flush_signals(struct task_struct *);
extern void ignore_signals(struct task_struct *);
extern void flush_signal_handlers(struct task_struct *, int force_default);
extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info,
			  enum pid_type *type);

/*
 * Call dequeue_signal() for current with current->blocked as the mask while
 * holding siglock. The siginfo and pid type it fills in are discarded; only
 * its return value is passed on.
 */
static inline int kernel_dequeue_signal(void)
{
	struct task_struct *task = current;
	kernel_siginfo_t __info;
	enum pid_type __type;
	int ret;

	spin_lock_irq(&task->sighand->siglock);
	ret = dequeue_signal(&task->blocked, &__info, &__type);
	spin_unlock_irq(&task->sighand->siglock);

	return ret;
}

/*
 * Under siglock: if JOBCTL_STOP_DEQUEUED is set for current, mark it
 * JOBCTL_STOPPED and put it in TASK_STOPPED. schedule() is then called
 * whether or not the state was changed.
 */
static inline void kernel_signal_stop(void)
{
	spin_lock_irq(&current->sighand->siglock);
	if (current->jobctl & JOBCTL_STOP_DEQUEUED) {
		current->jobctl |= JOBCTL_STOPPED;
		set_special_state(TASK_STOPPED);
	}
	spin_unlock_irq(&current->sighand->siglock);

	schedule();
}

int force_sig_fault_to_task(int sig, int code, void __user *addr,
			    struct task_struct *t);
int force_sig_fault(int sig, int code, void __user *addr);
int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t);

int force_sig_mceerr(int code, void __user *, short);
int send_sig_mceerr(int code, void __user *, short, struct task_struct *);

int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper);
int force_sig_pkuerr(void __user *addr, u32 pkey);
int send_sig_perf(void __user *addr, u32 type, u64 sig_data);

int force_sig_ptrace_errno_trap(int errno, void __user *addr);
int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno);
int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
			struct task_struct *t);
int force_sig_seccomp(int syscall, int reason, bool force_coredump);

extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern void force_sigsegv(int sig);
extern int force_sig_info(struct kernel_siginfo *);
extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *,
				const struct cred *);
extern int kill_pgrp(struct pid *pid, int sig, int priv);
extern int kill_pid(struct pid *pid, int sig, int priv);
extern __must_check bool do_notify_parent(struct task_struct *, int);
extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
extern void force_sig(int);
extern void force_fatal_sig(int);
extern void force_exit_sig(int);
extern int send_sig(int, struct task_struct *, int);
extern int zap_other_threads(struct task_struct *p);
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);

/* Clear TIF_NOTIFY_SIGNAL for current, then issue smp_mb__after_atomic(). */
static inline void clear_notify_signal(void)
{
	clear_thread_flag(TIF_NOTIFY_SIGNAL);
	smp_mb__after_atomic();
}

/*
 * Returns 'true' if kick_process() is needed to force a transition from
 * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work.
 *
 * That is: the flag was previously clear and wake_up_state() returned 0.
 * wake_up_state() is not called at all when the flag was already set.
 */
static inline bool __set_notify_signal(struct task_struct *task)
{
	return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
	       !wake_up_state(task, TASK_INTERRUPTIBLE);
}

/*
 * Called to break out of interruptible wait loops, and enter the
 * exit_to_user_mode_loop().
*/ static inline void set_notify_signal(struct task_struct *task) { if (__set_notify_signal(task)) kick_process(task); } static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); return -ERESTARTNOINTR; } static inline int task_sigpending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } static inline int signal_pending(struct task_struct *p) { /* * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same * behavior in terms of ensuring that we break out of wait loops * so that notify signal callbacks can be processed. */ if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) return 1; return task_sigpending(p); } static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); } static inline int fatal_signal_pending(struct task_struct *p) { return task_sigpending(p) && __fatal_signal_pending(p); } static inline int signal_pending_state(unsigned int state, struct task_struct *p) { if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) return 0; if (!signal_pending(p)) return 0; return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } /* * This should only be used in fault handlers to decide whether we * should stop the current fault routine to handle the signals * instead, especially with the case where we've got interrupted with * a VM_FAULT_RETRY. */ static inline bool fault_signal_pending(vm_fault_t fault_flags, struct pt_regs *regs) { return unlikely((fault_flags & VM_FAULT_RETRY) && (fatal_signal_pending(current) || (user_mode(regs) && signal_pending(current)))); } /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. 
*/ extern void recalc_sigpending(void); extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); static inline void signal_wake_up(struct task_struct *t, bool fatal) { unsigned int state = 0; if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) { t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED); state = TASK_WAKEKILL | __TASK_TRACED; } signal_wake_up_state(t, state); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { unsigned int state = 0; if (resume) { t->jobctl &= ~JOBCTL_TRACED; state = __TASK_TRACED; } signal_wake_up_state(t, state); } void task_join_group_stop(struct task_struct *task); #ifdef TIF_RESTORE_SIGMASK /* * Legacy restore_sigmask accessors. These are inefficient on * SMP architectures because they require atomic operations. */ /** * set_restore_sigmask() - make sure saved_sigmask processing gets done * * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code * will run before returning to user mode, to process the flag. For * all callers, TIF_SIGPENDING is already set or it's no harm to set * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the * arch code will notice on return to user mode, in case those bits * are scarce. We set TIF_SIGPENDING here to ensure that the arch * signal code always gets run when TIF_RESTORE_SIGMASK is set. 
*/
static inline void set_restore_sigmask(void)
{
	/*
	 * NOTE(review): only TIF_RESTORE_SIGMASK is set here; this body does
	 * not touch TIF_SIGPENDING.
	 */
	set_thread_flag(TIF_RESTORE_SIGMASK);
}

static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
	clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}

static inline void clear_restore_sigmask(void)
{
	clear_thread_flag(TIF_RESTORE_SIGMASK);
}
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
	return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}
static inline bool test_restore_sigmask(void)
{
	return test_thread_flag(TIF_RESTORE_SIGMASK);
}
static inline bool test_and_clear_restore_sigmask(void)
{
	return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK);
}

#else	/* TIF_RESTORE_SIGMASK */

/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */
/* Same six accessors, backed by the task_struct::restore_sigmask field. */
static inline void set_restore_sigmask(void)
{
	current->restore_sigmask = true;
}
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
	task->restore_sigmask = false;
}
static inline void clear_restore_sigmask(void)
{
	current->restore_sigmask = false;
}
static inline bool test_restore_sigmask(void)
{
	return current->restore_sigmask;
}
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
	return task->restore_sigmask;
}
static inline bool test_and_clear_restore_sigmask(void)
{
	if (!current->restore_sigmask)
		return false;
	current->restore_sigmask = false;
	return true;
}
#endif

/*
 * If the restore-sigmask marker was set, clear it and make
 * current->saved_sigmask the blocked mask again.
 */
static inline void restore_saved_sigmask(void)
{
	if (test_and_clear_restore_sigmask())
		__set_current_blocked(&current->saved_sigmask);
}

extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);

/*
 * Restore the saved mask unless @interrupted; in the interrupted case only
 * warn if no signal is actually pending.
 */
static inline void restore_saved_sigmask_unless(bool interrupted)
{
	if (interrupted)
		WARN_ON(!signal_pending(current));
	else
		restore_saved_sigmask();
}

/*
 * Returns &current->saved_sigmask while the restore-sigmask marker is set,
 * otherwise &current->blocked.
 */
static inline sigset_t *sigmask_to_save(void)
{
	sigset_t *res = &current->blocked;
	if (unlikely(test_restore_sigmask()))
		res = &current->saved_sigmask;
	return res;
}

/* kill_pid() aimed at cad_pid. */
static inline int kill_cad_pid(int sig, int priv)
{
	return kill_pid(cad_pid, sig, priv);
}

/* These can be the second arg to send_sig_info/send_group_sig_info. */
#define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0)
#define SEND_SIG_PRIV	((struct kernel_siginfo *) 1)

/*
 * Range test of @sp against current's alternate stack
 * [sas_ss_sp, sas_ss_sp + sas_ss_size]. Which end is inclusive depends on
 * CONFIG_STACK_GROWSUP: the base when the stack grows up, the top otherwise.
 */
static inline int __on_sig_stack(unsigned long sp)
{
#ifdef CONFIG_STACK_GROWSUP
	return sp >= current->sas_ss_sp &&
		sp - current->sas_ss_sp < current->sas_ss_size;
#else
	return sp > current->sas_ss_sp &&
		sp - current->sas_ss_sp <= current->sas_ss_size;
#endif
}

/*
 * True if we are on the alternate signal stack.
 */
static inline int on_sig_stack(unsigned long sp)
{
	/*
	 * If the signal stack is SS_AUTODISARM then, by construction, we
	 * can't be on the signal stack unless user code deliberately set
	 * SS_AUTODISARM when we were already on it.
	 *
	 * This improves reliability: if user state gets corrupted such that
	 * the stack pointer points very close to the end of the signal stack,
	 * then this check will enable the signal to be handled anyway.
	 */
	if (current->sas_ss_flags & SS_AUTODISARM)
		return 0;

	return __on_sig_stack(sp);
}

/*
 * Returns SS_DISABLE when current has no alternate stack (sas_ss_size is 0),
 * SS_ONSTACK when @sp is on it, and 0 otherwise.
 */
static inline int sas_ss_flags(unsigned long sp)
{
	if (!current->sas_ss_size)
		return SS_DISABLE;

	return on_sig_stack(sp) ? SS_ONSTACK : 0;
}

/* Put @p's alternate-stack settings back to "disabled". */
static inline void sas_ss_reset(struct task_struct *p)
{
	p->sas_ss_sp = 0;
	p->sas_ss_size = 0;
	p->sas_ss_flags = SS_DISABLE;
}

/*
 * Pick the stack pointer for delivering @ksig. If the handler asked for
 * SA_ONSTACK and sas_ss_flags(@sp) is 0, return the starting end of the
 * alternate stack (base if the stack grows up, base + size otherwise);
 * in every other case return @sp unchanged.
 */
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
{
	if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp))
#ifdef CONFIG_STACK_GROWSUP
		return current->sas_ss_sp;
#else
		return current->sas_ss_sp + current->sas_ss_size;
#endif
	return sp;
}

extern void __cleanup_sighand(struct sighand_struct *);
extern void flush_itimer_signals(void);

#define tasklist_empty() \
	list_empty(&init_task.tasks)

#define next_task(p) \
	list_entry_rcu((p)->tasks.next, struct task_struct, tasks)

/* Starts at init_task's successor and stops on reaching init_task again. */
#define for_each_process(p) \
	for (p = &init_task ; (p = next_task(p)) != &init_task ; )

extern bool current_is_single_threaded(void);

/*
 * Without tasklist/siglock it is only rcu-safe if g can't exit/exec,
 * otherwise next_thread(t) will never reach g after list_del_rcu(g).
 */
#define while_each_thread(g, t) \
	while ((t = next_thread(t)) != g)

/* Visits every thread reachable via next_thread() from @p, but not @p. */
#define for_other_threads(p, t)	\
	for (t = p; (t = next_thread(t)) != p; )

#define __for_each_thread(signal, t)	\
	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
		lockdep_is_held(&tasklist_lock))

#define for_each_thread(p, t)		\
	__for_each_thread((p)->signal, t)

/* Careful: this is a double loop, 'break' won't work as expected. */
#define for_each_process_thread(p, t)	\
	for_each_process(p) for_each_thread(p, t)

typedef int (*proc_visitor)(struct task_struct *p, void *data);
void walk_process_tree(struct task_struct *top, proc_visitor, void *);

/*
 * Returns @task's struct pid of the given @type: task_pid() for PIDTYPE_PID,
 * the matching task->signal->pids[] slot for every other type.
 */
static inline
struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
{
	struct pid *pid;
	if (type == PIDTYPE_PID)
		pid = task_pid(task);
	else
		pid = task->signal->pids[type];
	return pid;
}

static inline struct pid *task_tgid(struct task_struct *task)
{
	return task->signal->pids[PIDTYPE_TGID];
}

/*
 * Without tasklist or RCU lock it is not safe to dereference
 * the result of task_pgrp/task_session even if task == current,
 * we can race with another thread doing sys_setsid/sys_setpgid.
*/ static inline struct pid *task_pgrp(struct task_struct *task) { return task->signal->pids[PIDTYPE_PGID]; } static inline struct pid *task_session(struct task_struct *task) { return task->signal->pids[PIDTYPE_SID]; } static inline int get_nr_threads(struct task_struct *task) { return task->signal->nr_threads; } static inline bool thread_group_leader(struct task_struct *p) { return p->exit_signal >= 0; } static inline bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { return p1->signal == p2->signal; } /* * returns NULL if p is the last thread in the thread group */ static inline struct task_struct *__next_thread(struct task_struct *p) { return list_next_or_null_rcu(&p->signal->thread_head, &p->thread_node, struct task_struct, thread_node); } static inline struct task_struct *next_thread(struct task_struct *p) { return __next_thread(p) ?: p->group_leader; } static inline int thread_group_empty(struct task_struct *p) { return thread_group_leader(p) && list_is_last(&p->thread_node, &p->signal->thread_head); } #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) extern struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) __acquires(&task->sighand->siglock); static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) __releases(&task->sighand->siglock) { spin_unlock_irqrestore(&task->sighand->siglock, *flags); } #ifdef CONFIG_LOCKDEP extern void lockdep_assert_task_sighand_held(struct task_struct *task); #else static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { } #endif static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) { 
return task_rlimit(current, limit); } static inline unsigned long rlimit_max(unsigned int limit) { return task_rlimit_max(current, limit); } #endif /* _LINUX_SCHED_SIGNAL_H */
12 2 14 20 21 12 2 10 5 5 1 2 2 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 /* * algif_rng: User-space interface for random number generators * * This file provides the user-space API for random number generators. * * Copyright (C) 2014, Stephan Mueller <smueller@chronox.de> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL2 * are required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. 
*/ #include <linux/capability.h> #include <linux/module.h> #include <crypto/rng.h> #include <linux/random.h> #include <crypto/if_alg.h> #include <linux/net.h> #include <net/sock.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>"); MODULE_DESCRIPTION("User-space interface for random number generators"); struct rng_ctx { #define MAXSIZE 128 unsigned int len; struct crypto_rng *drng; u8 *addtl; size_t addtl_len; }; struct rng_parent_ctx { struct crypto_rng *drng; u8 *entropy; }; static void rng_reset_addtl(struct rng_ctx *ctx) { kfree_sensitive(ctx->addtl); ctx->addtl = NULL; ctx->addtl_len = 0; } static int _rng_recvmsg(struct crypto_rng *drng, struct msghdr *msg, size_t len, u8 *addtl, size_t addtl_len) { int err = 0; int genlen = 0; u8 result[MAXSIZE]; if (len == 0) return 0; if (len > MAXSIZE) len = MAXSIZE; /* * although not strictly needed, this is a precaution against coding * errors */ memset(result, 0, len); /* * The enforcement of a proper seeding of an RNG is done within an * RNG implementation. Some RNGs (DRBG, krng) do not need specific * seeding as they automatically seed. The X9.31 DRNG will return * an error if it was not seeded properly. */ genlen = crypto_rng_generate(drng, addtl, addtl_len, result, len); if (genlen < 0) return genlen; err = memcpy_to_msg(msg, result, len); memzero_explicit(result, len); return err ? 
err : len; } static int rng_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct rng_ctx *ctx = ask->private; return _rng_recvmsg(ctx->drng, msg, len, NULL, 0); } static int rng_test_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct rng_ctx *ctx = ask->private; int ret; lock_sock(sock->sk); ret = _rng_recvmsg(ctx->drng, msg, len, ctx->addtl, ctx->addtl_len); rng_reset_addtl(ctx); release_sock(sock->sk); return ret; } static int rng_test_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { int err; struct alg_sock *ask = alg_sk(sock->sk); struct rng_ctx *ctx = ask->private; lock_sock(sock->sk); if (len > MAXSIZE) { err = -EMSGSIZE; goto unlock; } rng_reset_addtl(ctx); ctx->addtl = kmalloc(len, GFP_KERNEL); if (!ctx->addtl) { err = -ENOMEM; goto unlock; } err = memcpy_from_msg(ctx->addtl, msg, len); if (err) { rng_reset_addtl(ctx); goto unlock; } ctx->addtl_len = len; unlock: release_sock(sock->sk); return err ? 
err : len; } static struct proto_ops algif_rng_ops = { .family = PF_ALG, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .getname = sock_no_getname, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, .bind = sock_no_bind, .accept = sock_no_accept, .sendmsg = sock_no_sendmsg, .release = af_alg_release, .recvmsg = rng_recvmsg, }; static struct proto_ops __maybe_unused algif_rng_test_ops = { .family = PF_ALG, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .getname = sock_no_getname, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, .bind = sock_no_bind, .accept = sock_no_accept, .release = af_alg_release, .recvmsg = rng_test_recvmsg, .sendmsg = rng_test_sendmsg, }; static void *rng_bind(const char *name, u32 type, u32 mask) { struct rng_parent_ctx *pctx; struct crypto_rng *rng; pctx = kzalloc_obj(*pctx); if (!pctx) return ERR_PTR(-ENOMEM); rng = crypto_alloc_rng(name, type, mask); if (IS_ERR(rng)) { kfree(pctx); return ERR_CAST(rng); } pctx->drng = rng; return pctx; } static void rng_release(void *private) { struct rng_parent_ctx *pctx = private; if (unlikely(!pctx)) return; crypto_free_rng(pctx->drng); kfree_sensitive(pctx->entropy); kfree_sensitive(pctx); } static void rng_sock_destruct(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); struct rng_ctx *ctx = ask->private; rng_reset_addtl(ctx); sock_kfree_s(sk, ctx, ctx->len); af_alg_release_parent(sk); } static int rng_accept_parent(void *private, struct sock *sk) { struct rng_ctx *ctx; struct rng_parent_ctx *pctx = private; struct alg_sock *ask = alg_sk(sk); unsigned int len = sizeof(*ctx); ctx = sock_kmalloc(sk, len, GFP_KERNEL); if (!ctx) return -ENOMEM; memset(ctx, 0, len); ctx->len = len; /* * No seeding done at that point -- if multiple accepts are * done on one RNG instance, each resulting FD points to the same * state of the RNG. 
*/ ctx->drng = pctx->drng; ask->private = ctx; sk->sk_destruct = rng_sock_destruct; /* * Non NULL pctx->entropy means that CAVP test has been initiated on * this socket, replace proto_ops algif_rng_ops with algif_rng_test_ops. */ if (IS_ENABLED(CONFIG_CRYPTO_USER_API_RNG_CAVP) && pctx->entropy) sk->sk_socket->ops = &algif_rng_test_ops; return 0; } static int rng_setkey(void *private, const u8 *seed, unsigned int seedlen) { struct rng_parent_ctx *pctx = private; /* * Check whether seedlen is of sufficient size is done in RNG * implementations. */ return crypto_rng_reset(pctx->drng, seed, seedlen); } static int __maybe_unused rng_setentropy(void *private, sockptr_t entropy, unsigned int len) { struct rng_parent_ctx *pctx = private; u8 *kentropy = NULL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (pctx->entropy) return -EINVAL; if (len > MAXSIZE) return -EMSGSIZE; if (len) { kentropy = memdup_sockptr(entropy, len); if (IS_ERR(kentropy)) return PTR_ERR(kentropy); } crypto_rng_alg(pctx->drng)->set_ent(pctx->drng, kentropy, len); /* * Since rng doesn't perform any memory management for the entropy * buffer, save kentropy pointer to pctx now to free it after use. */ pctx->entropy = kentropy; return 0; } static const struct af_alg_type algif_type_rng = { .bind = rng_bind, .release = rng_release, .accept = rng_accept_parent, .setkey = rng_setkey, #ifdef CONFIG_CRYPTO_USER_API_RNG_CAVP .setentropy = rng_setentropy, #endif .ops = &algif_rng_ops, .name = "rng", .owner = THIS_MODULE }; static int __init rng_init(void) { return af_alg_register_type(&algif_type_rng); } static void __exit rng_exit(void) { int err = af_alg_unregister_type(&algif_type_rng); BUG_ON(err); } module_init(rng_init); module_exit(rng_exit);
3 3 3 7 1 1 4 1 4 1 4 4 3 3 3 3 3 3 3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007-2012 Siemens AG * * Written by: * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <net/netlink.h> #include <net/nl802154.h> #include <net/mac802154.h> #include <net/ieee802154_netdev.h> #include <net/route.h> #include <net/cfg802154.h> #include "ieee802154_i.h" #include "cfg.h" static void ieee802154_tasklet_handler(struct tasklet_struct *t) { struct ieee802154_local *local = from_tasklet(local, t, tasklet); struct sk_buff *skb; while ((skb = skb_dequeue(&local->skb_queue))) { switch (skb->pkt_type) { case IEEE802154_RX_MSG: /* Clear skb->pkt_type in order to not confuse kernel * netstack. 
*/ skb->pkt_type = 0; ieee802154_rx(local, skb); break; default: WARN(1, "mac802154: Packet is of unknown type %d\n", skb->pkt_type); kfree_skb(skb); break; } } } struct ieee802154_hw * ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) { struct wpan_phy *phy; struct ieee802154_local *local; size_t priv_size; if (WARN_ON(!ops || !(ops->xmit_async || ops->xmit_sync) || !ops->ed || !ops->start || !ops->stop || !ops->set_channel)) return NULL; /* Ensure 32-byte alignment of our private data and hw private data. * We use the wpan_phy priv data for both our ieee802154_local and for * the driver's private data * * in memory it'll be like this: * * +-------------------------+ * | struct wpan_phy | * +-------------------------+ * | struct ieee802154_local | * +-------------------------+ * | driver's private data | * +-------------------------+ * * Due to ieee802154 layer isn't aware of driver and MAC structures, * so lets align them here. */ priv_size = ALIGN(sizeof(*local), NETDEV_ALIGN) + priv_data_len; phy = wpan_phy_new(&mac802154_config_ops, priv_size); if (!phy) { pr_err("failure to allocate master IEEE802.15.4 device\n"); return NULL; } phy->privid = mac802154_wpan_phy_privid; local = wpan_phy_priv(phy); local->phy = phy; local->hw.phy = local->phy; local->hw.priv = (char *)local + ALIGN(sizeof(*local), NETDEV_ALIGN); local->ops = ops; INIT_LIST_HEAD(&local->interfaces); INIT_LIST_HEAD(&local->rx_beacon_list); INIT_LIST_HEAD(&local->rx_mac_cmd_list); mutex_init(&local->iflist_mtx); tasklet_setup(&local->tasklet, ieee802154_tasklet_handler); skb_queue_head_init(&local->skb_queue); INIT_WORK(&local->sync_tx_work, ieee802154_xmit_sync_worker); INIT_DELAYED_WORK(&local->scan_work, mac802154_scan_worker); INIT_WORK(&local->rx_beacon_work, mac802154_rx_beacon_worker); INIT_DELAYED_WORK(&local->beacon_work, mac802154_beacon_worker); INIT_WORK(&local->rx_mac_cmd_work, mac802154_rx_mac_cmd_worker); init_completion(&local->assoc_done); /* init 
supported flags with 802.15.4 default ranges */ phy->supported.max_minbe = 8; phy->supported.min_maxbe = 3; phy->supported.max_maxbe = 8; phy->supported.min_frame_retries = 0; phy->supported.max_frame_retries = 7; phy->supported.max_csma_backoffs = 5; phy->supported.lbt = NL802154_SUPPORTED_BOOL_FALSE; /* always supported */ phy->supported.iftypes = BIT(NL802154_IFTYPE_NODE) | BIT(NL802154_IFTYPE_COORD); return &local->hw; } EXPORT_SYMBOL(ieee802154_alloc_hw); void ieee802154_configure_durations(struct wpan_phy *phy, unsigned int page, unsigned int channel) { u32 duration = 0; switch (page) { case 0: if (BIT(channel) & 0x1) /* 868 MHz BPSK 802.15.4-2003: 20 ksym/s */ duration = 50 * NSEC_PER_USEC; else if (BIT(channel) & 0x7FE) /* 915 MHz BPSK 802.15.4-2003: 40 ksym/s */ duration = 25 * NSEC_PER_USEC; else if (BIT(channel) & 0x7FFF800) /* 2400 MHz O-QPSK 802.15.4-2006: 62.5 ksym/s */ duration = 16 * NSEC_PER_USEC; break; case 2: if (BIT(channel) & 0x1) /* 868 MHz O-QPSK 802.15.4-2006: 25 ksym/s */ duration = 40 * NSEC_PER_USEC; else if (BIT(channel) & 0x7FE) /* 915 MHz O-QPSK 802.15.4-2006: 62.5 ksym/s */ duration = 16 * NSEC_PER_USEC; break; case 3: if (BIT(channel) & 0x3FFF) /* 2.4 GHz CSS 802.15.4a-2007: 1/6 Msym/s */ duration = 6 * NSEC_PER_USEC; break; default: break; } if (!duration) { pr_debug("Unknown PHY symbol duration\n"); return; } phy->symbol_duration = duration; phy->lifs_period = (IEEE802154_LIFS_PERIOD * phy->symbol_duration) / NSEC_PER_USEC; phy->sifs_period = (IEEE802154_SIFS_PERIOD * phy->symbol_duration) / NSEC_PER_USEC; } EXPORT_SYMBOL(ieee802154_configure_durations); void ieee802154_free_hw(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); BUG_ON(!list_empty(&local->interfaces)); mutex_destroy(&local->iflist_mtx); wpan_phy_free(local->phy); } EXPORT_SYMBOL(ieee802154_free_hw); static void ieee802154_setup_wpan_phy_pib(struct wpan_phy *wpan_phy) { /* TODO warn on empty symbol_duration * Should be done when all 
drivers sets this value. */ wpan_phy->lifs_period = (IEEE802154_LIFS_PERIOD * wpan_phy->symbol_duration) / NSEC_PER_USEC; wpan_phy->sifs_period = (IEEE802154_SIFS_PERIOD * wpan_phy->symbol_duration) / NSEC_PER_USEC; } int ieee802154_register_hw(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); char mac_wq_name[IFNAMSIZ + 10] = {}; struct net_device *dev; int rc = -ENOSYS; local->workqueue = create_singlethread_workqueue(wpan_phy_name(local->phy)); if (!local->workqueue) { rc = -ENOMEM; goto out; } snprintf(mac_wq_name, IFNAMSIZ + 10, "%s-mac-cmds", wpan_phy_name(local->phy)); local->mac_wq = create_singlethread_workqueue(mac_wq_name); if (!local->mac_wq) { rc = -ENOMEM; goto out_wq; } hrtimer_setup(&local->ifs_timer, ieee802154_xmit_ifs_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); wpan_phy_set_dev(local->phy, local->hw.parent); ieee802154_setup_wpan_phy_pib(local->phy); ieee802154_configure_durations(local->phy, local->phy->current_page, local->phy->current_channel); if (!(hw->flags & IEEE802154_HW_CSMA_PARAMS)) { local->phy->supported.min_csma_backoffs = 4; local->phy->supported.max_csma_backoffs = 4; local->phy->supported.min_maxbe = 5; local->phy->supported.max_maxbe = 5; local->phy->supported.min_minbe = 3; local->phy->supported.max_minbe = 3; } if (!(hw->flags & IEEE802154_HW_FRAME_RETRIES)) { local->phy->supported.min_frame_retries = 3; local->phy->supported.max_frame_retries = 3; } if (hw->flags & IEEE802154_HW_PROMISCUOUS) local->phy->supported.iftypes |= BIT(NL802154_IFTYPE_MONITOR); rc = wpan_phy_register(local->phy); if (rc < 0) goto out_mac_wq; rtnl_lock(); dev = ieee802154_if_add(local, "wpan%d", NET_NAME_ENUM, NL802154_IFTYPE_NODE, cpu_to_le64(0x0000000000000000ULL)); if (IS_ERR(dev)) { rtnl_unlock(); rc = PTR_ERR(dev); goto out_phy; } rtnl_unlock(); return 0; out_phy: wpan_phy_unregister(local->phy); out_mac_wq: destroy_workqueue(local->mac_wq); out_wq: destroy_workqueue(local->workqueue); out: return rc; } 
EXPORT_SYMBOL(ieee802154_register_hw); void ieee802154_unregister_hw(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); tasklet_kill(&local->tasklet); flush_workqueue(local->workqueue); rtnl_lock(); ieee802154_remove_interfaces(local); rtnl_unlock(); destroy_workqueue(local->mac_wq); destroy_workqueue(local->workqueue); wpan_phy_unregister(local->phy); } EXPORT_SYMBOL(ieee802154_unregister_hw); static int __init ieee802154_init(void) { return ieee802154_iface_init(); } static void __exit ieee802154_exit(void) { ieee802154_iface_exit(); rcu_barrier(); } subsys_initcall(ieee802154_init); module_exit(ieee802154_exit); MODULE_DESCRIPTION("IEEE 802.15.4 subsystem"); MODULE_LICENSE("GPL v2");
2 115 188 210 93 93 25 284 284 12 282 1 1 1481 1480 662 594 52 109 42 640 640 646 50 119 602 121 602 205 203 1 205 454 3 3 3 511 512 205 205 204 660 641 133 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux network device link state notification
 *
 * Author:
 *     Stefan Rompf <sux@loplof.de>
 */

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/if.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/bitops.h>
#include <linux/types.h>

#include "dev.h"

/* Bit numbers used in linkwatch_flags. */
enum lw_bits {
	LW_URGENT = 0,
};

/* Holds LW_URGENT while an urgent run of the work item is scheduled. */
static unsigned long linkwatch_flags;
/* Earliest jiffies value at which the next non-urgent run may happen. */
static unsigned long linkwatch_nextevent;

static void linkwatch_event(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);

/* Devices with a pending link event; protected by lweventlist_lock. */
static LIST_HEAD(lweventlist);
static DEFINE_SPINLOCK(lweventlist_lock);

/*
 * Compute the operational state implied by the device's testing, carrier
 * and dormant flags. With carrier off, the result is
 * IF_OPER_LOWERLAYERDOWN only when the device has a distinct iflink peer
 * whose carrier is also off; otherwise it is IF_OPER_DOWN.
 */
static unsigned int default_operstate(const struct net_device *dev)
{
	if (netif_testing(dev))
		return IF_OPER_TESTING;

	/* Some uppers (DSA) have additional sources for being down, so
	 * first check whether lower is indeed the source of its down state.
	 */
	if (!netif_carrier_ok(dev)) {
		struct net_device *peer;
		int iflink;

		/* If called from netdev_run_todo()/linkwatch_sync_dev(),
		 * dev_net(dev) can be already freed, and RTNL is not held.
		 */
		if (dev->reg_state <= NETREG_REGISTERED)
			iflink = dev_get_iflink(dev);
		else
			iflink = dev->ifindex;

		/* No separate lower device: the device itself is down. */
		if (iflink == dev->ifindex)
			return IF_OPER_DOWN;

		ASSERT_RTNL();
		peer = __dev_get_by_index(dev_net(dev), iflink);
		if (!peer)
			return IF_OPER_DOWN;

		return netif_carrier_ok(peer) ? IF_OPER_DOWN :
						IF_OPER_LOWERLAYERDOWN;
	}

	if (netif_dormant(dev))
		return IF_OPER_DORMANT;

	return IF_OPER_UP;
}

/*
 * Update dev->operstate from default_operstate(), applying link_mode:
 * in TESTING/DORMANT link mode an "up" result is replaced by
 * IF_OPER_TESTING/IF_OPER_DORMANT. Nothing is written when the default
 * state already equals the stored one.
 */
static void rfc2863_policy(struct net_device *dev)
{
	unsigned int operstate = default_operstate(dev);

	if (operstate == READ_ONCE(dev->operstate))
		return;

	switch(dev->link_mode) {
	case IF_LINK_MODE_TESTING:
		if (operstate == IF_OPER_UP)
			operstate = IF_OPER_TESTING;
		break;

	case IF_LINK_MODE_DORMANT:
		if (operstate == IF_OPER_UP)
			operstate = IF_OPER_DORMANT;
		break;
	case IF_LINK_MODE_DEFAULT:
	default:
		break;
	}

	WRITE_ONCE(dev->operstate, operstate);
}

/*
 * Apply the operstate policy once if the device is already in a
 * non-default link state (carrier off, dormant or testing).
 */
void linkwatch_init_dev(struct net_device *dev)
{
	/* Handle pre-registration link state changes */
	if (!netif_carrier_ok(dev) || netif_dormant(dev) ||
	    netif_testing(dev))
		rfc2863_policy(dev);
}

/*
 * Decide whether an event for @dev bypasses the rate limit: the device
 * must be running and either have a distinct iflink, be a LAG port or
 * master, or have carrier on while its qdisc is changing.
 */
static bool linkwatch_urgent_event(struct net_device *dev)
{
	if (!netif_running(dev))
		return false;

	if (dev->ifindex != dev_get_iflink(dev))
		return true;

	if (netif_is_lag_port(dev) || netif_is_lag_master(dev))
		return true;

	return netif_carrier_ok(dev) &&	qdisc_tx_changing(dev);
}

/*
 * Queue @dev on lweventlist (if not already queued) and take a tracked
 * device reference that is dropped when the event is processed or cleaned.
 */
static void linkwatch_add_event(struct net_device *dev)
{
	unsigned long flags;

	spin_lock_irqsave(&lweventlist_lock, flags);
	if (list_empty(&dev->link_watch_list)) {
		list_add_tail(&dev->link_watch_list, &lweventlist);
		netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC);
	}
	spin_unlock_irqrestore(&lweventlist_lock, flags);
}

/*
 * Schedule linkwatch_work. A non-zero @urgent sets LW_URGENT and runs
 * the work immediately; otherwise the work is queued for
 * linkwatch_nextevent without overriding an existing timer. Does nothing
 * when an urgent run is already scheduled.
 */
static void linkwatch_schedule_work(int urgent)
{
	unsigned long delay = linkwatch_nextevent - jiffies;

	if (test_bit(LW_URGENT, &linkwatch_flags))
		return;

	/* Minimise down-time: drop delay for up event. */
	if (urgent) {
		if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
			return;
		delay = 0;
	}

	/* If we wrap around we'll delay it by at most HZ. */
	if (delay > HZ)
		delay = 0;

	/*
	 * If urgent, schedule immediate execution; otherwise, don't
	 * override the existing timer.
	 */
	if (test_bit(LW_URGENT, &linkwatch_flags))
		mod_delayed_work(system_dfl_wq, &linkwatch_work, 0);
	else
		queue_delayed_work(system_dfl_wq, &linkwatch_work, delay);
}

/*
 * Process one device's link event: clear the pending bit, refresh
 * operstate and, if the interface is up, (de)activate its qdiscs to
 * match carrier and announce the state change.
 */
static void linkwatch_do_dev(struct net_device *dev)
{
	/*
	 * Make sure the above read is complete since it can be
	 * rewritten as soon as we clear the bit below.
	 */
	smp_mb__before_atomic();

	/* We are about to handle this device,
	 * so new events can be accepted
	 */
	clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);

	rfc2863_policy(dev);
	if (dev->flags & IFF_UP) {
		if (netif_carrier_ok(dev))
			dev_activate(dev);
		else
			dev_deactivate(dev);

		netif_state_change(dev);
	}
}

/*
 * Process queued link events, at most MAX_DO_DEV_PER_LOOP devices per
 * call (twice that when @urgent_only is set). With @urgent_only, devices
 * whose event is not urgent are put back on the global list; absent
 * devices are always put back. Reschedules the work if anything remains.
 */
static void __linkwatch_run_queue(int urgent_only)
{
#define MAX_DO_DEV_PER_LOOP	100

	int do_dev = MAX_DO_DEV_PER_LOOP;
	/* Use a local list here since we add non-urgent
	 * events back to the global one when called with
	 * urgent_only=1.
	 */
	LIST_HEAD(wrk);

	/* Give urgent case more budget */
	if (urgent_only)
		do_dev += MAX_DO_DEV_PER_LOOP;

	/*
	 * Limit the number of linkwatch events to one
	 * per second so that a runaway driver does not
	 * cause a storm of messages on the netlink
	 * socket.  This limit does not apply to up events
	 * while the device qdisc is down.
	 */
	if (!urgent_only)
		linkwatch_nextevent = jiffies + HZ;
	/* Limit wrap-around effect on delay. */
	else if (time_after(linkwatch_nextevent, jiffies + HZ))
		linkwatch_nextevent = jiffies;

	clear_bit(LW_URGENT, &linkwatch_flags);

	spin_lock_irq(&lweventlist_lock);
	list_splice_init(&lweventlist, &wrk);

	while (!list_empty(&wrk) && do_dev > 0) {
		struct net_device *dev;

		dev = list_first_entry(&wrk, struct net_device, link_watch_list);
		list_del_init(&dev->link_watch_list);

		/* Defer: keep the event (and its reference) for a later run. */
		if (!netif_device_present(dev) ||
		    (urgent_only && !linkwatch_urgent_event(dev))) {
			list_add_tail(&dev->link_watch_list, &lweventlist);
			continue;
		}
		/* We must free netdev tracker under
		 * the spinlock protection.
		 */
		netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
		/* The list lock is dropped while the device is handled. */
		spin_unlock_irq(&lweventlist_lock);
		netdev_lock_ops(dev);
		linkwatch_do_dev(dev);
		netdev_unlock_ops(dev);
		/* Use __dev_put() because netdev_tracker_free() was already
		 * called above. Must be after netdev_unlock_ops() to prevent
		 * netdev_run_todo() from freeing the device while still in use.
		 */
		__dev_put(dev);
		do_dev--;
		spin_lock_irq(&lweventlist_lock);
	}

	/* Add the remaining work back to lweventlist */
	list_splice_init(&wrk, &lweventlist);

	if (!list_empty(&lweventlist))
		linkwatch_schedule_work(0);
	spin_unlock_irq(&lweventlist_lock);
}

/*
 * Remove @dev from the pending list if it is queued and release its
 * tracker. Returns true when an event was removed; the caller then still
 * owns the untracked reference and must drop it with __dev_put().
 */
static bool linkwatch_clean_dev(struct net_device *dev)
{
	unsigned long flags;
	bool clean = false;

	spin_lock_irqsave(&lweventlist_lock, flags);
	if (!list_empty(&dev->link_watch_list)) {
		list_del_init(&dev->link_watch_list);
		clean = true;
		/* We must release netdev tracker under
		 * the spinlock protection.
		 */
		netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
	}
	spin_unlock_irqrestore(&lweventlist_lock, flags);

	return clean;
}

/*
 * Process a pending event for @dev right away. Variant for callers that
 * already hold the device's ops lock (asserted on entry).
 */
void __linkwatch_sync_dev(struct net_device *dev)
{
	netdev_ops_assert_locked(dev);

	if (linkwatch_clean_dev(dev)) {
		linkwatch_do_dev(dev);
		/* Use __dev_put() because netdev_tracker_free() was already
		 * called inside linkwatch_clean_dev().
		 */
		__dev_put(dev);
	}
}

/*
 * Process a pending event for @dev right away, taking the device's ops
 * lock around the handling.
 */
void linkwatch_sync_dev(struct net_device *dev)
{
	if (linkwatch_clean_dev(dev)) {
		netdev_lock_ops(dev);
		linkwatch_do_dev(dev);
		netdev_unlock_ops(dev);
		/* Use __dev_put() because netdev_tracker_free() was already
		 * called inside linkwatch_clean_dev().
		 */
		__dev_put(dev);
	}
}

/* Must be called with the rtnl semaphore held */
void linkwatch_run_queue(void)
{
	__linkwatch_run_queue(0);
}

/*
 * Work handler: run the queue under RTNL. Only urgent events are handled
 * while linkwatch_nextevent still lies in the future.
 */
static void linkwatch_event(struct work_struct *dummy)
{
	rtnl_lock();
	__linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies));
	rtnl_unlock();
}

/*
 * Report a link state change on @dev: queue the device unless an event
 * is already pending, then schedule the work. If an event was already
 * pending and this one is not urgent, nothing more is done.
 */
void linkwatch_fire_event(struct net_device *dev)
{
	bool urgent = linkwatch_urgent_event(dev);

	if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
		linkwatch_add_event(dev);
	} else if (!urgent)
		return;

	linkwatch_schedule_work(urgent);
}
EXPORT_SYMBOL(linkwatch_fire_event);
6 29 1 28 28 28 17 13 24 71 71 22 24 17 6 7 6 1 6 6 125 125 125 16 4 9 4 7 20 518 519 21 20 125 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 // SPDX-License-Identifier: GPL-2.0-only /* Event cache for netfilter. 
*/ /* * (C) 2005 Harald Welte <laforge@gnumonks.org> * (C) 2005 Patrick McHardy <kaber@trash.net> * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/export.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> static DEFINE_MUTEX(nf_ct_ecache_mutex); #define DYING_NULLS_VAL ((1 << 30) + 1) #define ECACHE_MAX_JIFFIES msecs_to_jiffies(10) #define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10) enum retry_state { STATE_CONGESTED, STATE_RESTART, STATE_DONE, }; struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); return &cnet->ecache; } #if IS_MODULE(CONFIG_NF_CT_NETLINK) EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache); #endif static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet) { unsigned long stop = jiffies + ECACHE_MAX_JIFFIES; struct hlist_nulls_head evicted_list; enum retry_state ret = STATE_DONE; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; unsigned int sent; INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL); next: sent = 0; spin_lock_bh(&cnet->ecache.dying_lock); hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); /* The worker owns all entries, ct remains valid until nf_ct_put * in the loop below. 
*/ if (nf_conntrack_event(IPCT_DESTROY, ct)) { ret = STATE_CONGESTED; break; } hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list); if (time_after(stop, jiffies)) { ret = STATE_RESTART; break; } if (sent++ > 16) { spin_unlock_bh(&cnet->ecache.dying_lock); cond_resched(); goto next; } } spin_unlock_bh(&cnet->ecache.dying_lock); hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); nf_ct_put(ct); cond_resched(); } return ret; } static void ecache_work(struct work_struct *work) { struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work); int ret, delay = -1; ret = ecache_work_evict_list(cnet); switch (ret) { case STATE_CONGESTED: delay = ECACHE_RETRY_JIFFIES; break; case STATE_RESTART: delay = 0; break; case STATE_DONE: break; } if (delay >= 0) schedule_delayed_work(&cnet->ecache.dwork, delay); } static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e, const u32 events, const u32 missed, const struct nf_ct_event *item) { struct net *net = nf_ct_net(item->ct); struct nf_ct_event_notifier *notify; u32 old, want; int ret; if (!((events | missed) & e->ctmask)) return 0; rcu_read_lock(); notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (!notify) { rcu_read_unlock(); return 0; } ret = notify->ct_event(events | missed, item); rcu_read_unlock(); if (likely(ret >= 0 && missed == 0)) return 0; do { old = READ_ONCE(e->missed); if (ret < 0) want = old | events; else want = old & ~missed; } while (cmpxchg(&e->missed, old, want) != old); return ret; } static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP if (local64_read(&e->timestamp)) local64_set(&e->timestamp, ktime_get_real_ns()); #endif } int nf_conntrack_eventmask_report(unsigned int events, struct 
nf_conn *ct, u32 portid, int report) { struct nf_conntrack_ecache *e; struct nf_ct_event item; unsigned int missed; int ret; if (!nf_ct_is_confirmed(ct)) return 0; e = nf_ct_ecache_find(ct); if (!e) return 0; memset(&item, 0, sizeof(item)); item.ct = ct; item.portid = e->portid ? e->portid : portid; item.report = report; /* This is a resent of a destroy event? If so, skip missed */ missed = e->portid ? 0 : e->missed; nf_ct_ecache_tstamp_refresh(e); ret = __nf_conntrack_eventmask_report(e, events, missed, &item); if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) { /* This is a destroy event that has been triggered by a process, * we store the PORTID to include it in the retransmission. */ if (e->portid == 0 && portid != 0) e->portid = portid; } return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) { struct nf_conntrack_ecache *e; struct nf_ct_event item; unsigned int events; if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) return; e = nf_ct_ecache_find(ct); if (e == NULL) return; events = xchg(&e->cache, 0); item.ct = ct; item.portid = 0; item.report = 0; /* We make a copy of the missed event cache without taking * the lock, thus we may send missed events twice. However, * this does not harm and it happens very rarely. 
*/ __nf_conntrack_eventmask_report(e, events, e->missed, &item); } EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, struct nf_conntrack_expect *exp, u32 portid, int report) { struct net *net = nf_ct_exp_net(exp); struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; rcu_read_lock(); notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (!notify) goto out_unlock; e = nf_ct_ecache_find(exp->master); if (!e) goto out_unlock; if (e->expmask & (1 << event)) { struct nf_exp_event item = { .exp = exp, .portid = portid, .report = report }; notify->exp_event(1 << event, &item); } out_unlock: rcu_read_unlock(); } void nf_conntrack_register_notifier(struct net *net, const struct nf_ct_event_notifier *new) { struct nf_ct_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, lockdep_is_held(&nf_ct_ecache_mutex)); WARN_ON_ONCE(notify); rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); mutex_unlock(&nf_ct_ecache_mutex); } EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); void nf_conntrack_unregister_notifier(struct net *net) { mutex_lock(&nf_ct_ecache_mutex); RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); /* synchronize_rcu() is called after netns pre_exit */ } EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); if (state == NFCT_ECACHE_DESTROY_FAIL && !delayed_work_pending(&cnet->ecache.dwork)) { schedule_delayed_work(&cnet->ecache.dwork, HZ); net->ct.ecache_dwork_pending = true; } else if (state == NFCT_ECACHE_DESTROY_SENT) { if (!hlist_nulls_empty(&cnet->ecache.dying_list)) mod_delayed_work(system_percpu_wq, &cnet->ecache.dwork, 0); else net->ct.ecache_dwork_pending = false; } } static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, 
struct nf_conntrack_ecache *e) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP u64 ts = 0; if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) ts = ktime_get_real_ns(); local64_set(&e->timestamp, ts); #endif } bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *e; switch (net->ct.sysctl_events) { case 0: /* assignment via template / ruleset? ignore sysctl. */ if (ctmask || expmask) break; return true; case 2: /* autodetect: no event listener, don't allocate extension. */ if (!READ_ONCE(nf_ctnetlink_has_listener)) return true; fallthrough; case 1: /* always allocate an extension. */ if (!ctmask && !expmask) { ctmask = ~0; expmask = ~0; } break; default: WARN_ON_ONCE(1); return true; } e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); if (e) { nf_ct_ecache_tstamp_new(ct, e); e->ctmask = ctmask; e->expmask = expmask; } return e != NULL; } EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add); #define NF_CT_EVENTS_DEFAULT 2 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; void nf_conntrack_ecache_pernet_init(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); net->ct.sysctl_events = nf_ct_events; INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work); INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL); spin_lock_init(&cnet->ecache.dying_lock); BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */ } void nf_conntrack_ecache_pernet_fini(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); cancel_delayed_work_sync(&cnet->ecache.dwork); }
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/atmmpc.h> #include <linux/slab.h> #include <linux/time.h> #include "mpoa_caches.h" #include "mpc.h" /* * mpoa_caches.c: Implementation of ingress and egress cache * handling functions */ #if 0 #define dprintk(format, args...) \ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */ #else #define dprintk(format, args...) \ do { if (0) \ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\ } while (0) #endif #if 0 #define ddprintk(format, args...) \ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */ #else #define ddprintk(format, args...) \ do { if (0) \ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\ } while (0) #endif static in_cache_entry *in_cache_get(__be32 dst_ip, struct mpoa_client *client) { in_cache_entry *entry; read_lock_bh(&client->ingress_lock); entry = client->in_cache; while (entry != NULL) { if (entry->ctrl_info.in_dst_ip == dst_ip) { refcount_inc(&entry->use); read_unlock_bh(&client->ingress_lock); return entry; } entry = entry->next; } read_unlock_bh(&client->ingress_lock); return NULL; } static in_cache_entry *in_cache_get_with_mask(__be32 dst_ip, struct mpoa_client *client, __be32 mask) { in_cache_entry *entry; read_lock_bh(&client->ingress_lock); entry = client->in_cache; while (entry != NULL) { if ((entry->ctrl_info.in_dst_ip & mask) == (dst_ip & mask)) { refcount_inc(&entry->use); read_unlock_bh(&client->ingress_lock); return entry; } entry = entry->next; } read_unlock_bh(&client->ingress_lock); return NULL; } static in_cache_entry *in_cache_get_by_vcc(struct atm_vcc *vcc, struct mpoa_client *client) { in_cache_entry *entry; read_lock_bh(&client->ingress_lock); entry = client->in_cache; while (entry != NULL) { if (entry->shortcut == vcc) { refcount_inc(&entry->use); 
read_unlock_bh(&client->ingress_lock); return entry; } entry = entry->next; } read_unlock_bh(&client->ingress_lock); return NULL; } static in_cache_entry *in_cache_add_entry(__be32 dst_ip, struct mpoa_client *client) { in_cache_entry *entry = kzalloc_obj(in_cache_entry); if (entry == NULL) { pr_info("mpoa: mpoa_caches.c: new_in_cache_entry: out of memory\n"); return NULL; } dprintk("adding an ingress entry, ip = %pI4\n", &dst_ip); refcount_set(&entry->use, 1); dprintk("new_in_cache_entry: about to lock\n"); write_lock_bh(&client->ingress_lock); entry->next = client->in_cache; entry->prev = NULL; if (client->in_cache != NULL) client->in_cache->prev = entry; client->in_cache = entry; memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN); entry->ctrl_info.in_dst_ip = dst_ip; entry->time = ktime_get_seconds(); entry->retry_time = client->parameters.mpc_p4; entry->count = 1; entry->entry_state = INGRESS_INVALID; entry->ctrl_info.holding_time = HOLDING_TIME_DEFAULT; refcount_inc(&entry->use); write_unlock_bh(&client->ingress_lock); dprintk("new_in_cache_entry: unlocked\n"); return entry; } static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc) { struct atm_mpoa_qos *qos; struct k_message msg; entry->count++; if (entry->entry_state == INGRESS_RESOLVED && entry->shortcut != NULL) return OPEN; if (entry->entry_state == INGRESS_REFRESHING) { if (entry->count > mpc->parameters.mpc_p1) { msg.type = SND_MPOA_RES_RQST; msg.content.in_info = entry->ctrl_info; memcpy(msg.MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN); qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip); if (qos != NULL) msg.qos = qos->qos; msg_to_mpoad(&msg, mpc); entry->reply_wait = ktime_get_seconds(); entry->entry_state = INGRESS_RESOLVING; } if (entry->shortcut != NULL) return OPEN; return CLOSED; } if (entry->entry_state == INGRESS_RESOLVING && entry->shortcut != NULL) return OPEN; if (entry->count > mpc->parameters.mpc_p1 && entry->entry_state == INGRESS_INVALID) { dprintk("(%s) 
threshold exceeded for ip %pI4, sending MPOA res req\n", mpc->dev->name, &entry->ctrl_info.in_dst_ip); entry->entry_state = INGRESS_RESOLVING; msg.type = SND_MPOA_RES_RQST; memcpy(msg.MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN); msg.content.in_info = entry->ctrl_info; qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip); if (qos != NULL) msg.qos = qos->qos; msg_to_mpoad(&msg, mpc); entry->reply_wait = ktime_get_seconds(); } return CLOSED; } static void in_cache_put(in_cache_entry *entry) { if (refcount_dec_and_test(&entry->use)) { kfree_sensitive(entry); } } /* * This should be called with write lock on */ static void in_cache_remove_entry(in_cache_entry *entry, struct mpoa_client *client) { struct atm_vcc *vcc; struct k_message msg; vcc = entry->shortcut; dprintk("removing an ingress entry, ip = %pI4\n", &entry->ctrl_info.in_dst_ip); if (entry->prev != NULL) entry->prev->next = entry->next; else client->in_cache = entry->next; if (entry->next != NULL) entry->next->prev = entry->prev; client->in_ops->put(entry); if (client->in_cache == NULL && client->eg_cache == NULL) { msg.type = STOP_KEEP_ALIVE_SM; msg_to_mpoad(&msg, client); } /* Check if the egress side still uses this VCC */ if (vcc != NULL) { eg_cache_entry *eg_entry = client->eg_ops->get_by_vcc(vcc, client); if (eg_entry != NULL) { client->eg_ops->put(eg_entry); return; } vcc_release_async(vcc, -EPIPE); } } /* Call this every MPC-p2 seconds... Not exactly correct solution, but an easy one... 
*/ static void clear_count_and_expired(struct mpoa_client *client) { in_cache_entry *entry, *next_entry; time64_t now; now = ktime_get_seconds(); write_lock_bh(&client->ingress_lock); entry = client->in_cache; while (entry != NULL) { entry->count = 0; next_entry = entry->next; if ((now - entry->time) > entry->ctrl_info.holding_time) { dprintk("holding time expired, ip = %pI4\n", &entry->ctrl_info.in_dst_ip); client->in_ops->remove_entry(entry, client); } entry = next_entry; } write_unlock_bh(&client->ingress_lock); } /* Call this every MPC-p4 seconds. */ static void check_resolving_entries(struct mpoa_client *client) { struct atm_mpoa_qos *qos; in_cache_entry *entry; time64_t now; struct k_message msg; now = ktime_get_seconds(); read_lock_bh(&client->ingress_lock); entry = client->in_cache; while (entry != NULL) { if (entry->entry_state == INGRESS_RESOLVING) { if ((now - entry->hold_down) < client->parameters.mpc_p6) { entry = entry->next; /* Entry in hold down */ continue; } if ((now - entry->reply_wait) > entry->retry_time) { entry->retry_time = MPC_C1 * (entry->retry_time); /* * Retry time maximum exceeded, * put entry in hold down. */ if (entry->retry_time > client->parameters.mpc_p5) { entry->hold_down = ktime_get_seconds(); entry->retry_time = client->parameters.mpc_p4; entry = entry->next; continue; } /* Ask daemon to send a resolution request. */ memset(&entry->hold_down, 0, sizeof(time64_t)); msg.type = SND_MPOA_RES_RTRY; memcpy(msg.MPS_ctrl, client->mps_ctrl_addr, ATM_ESA_LEN); msg.content.in_info = entry->ctrl_info; qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip); if (qos != NULL) msg.qos = qos->qos; msg_to_mpoad(&msg, client); entry->reply_wait = ktime_get_seconds(); } } entry = entry->next; } read_unlock_bh(&client->ingress_lock); } /* Call this every MPC-p5 seconds. 
*/ static void refresh_entries(struct mpoa_client *client) { time64_t now; struct in_cache_entry *entry = client->in_cache; ddprintk("refresh_entries\n"); now = ktime_get_seconds(); read_lock_bh(&client->ingress_lock); while (entry != NULL) { if (entry->entry_state == INGRESS_RESOLVED) { if (!(entry->refresh_time)) entry->refresh_time = (2 * (entry->ctrl_info.holding_time))/3; if ((now - entry->reply_wait) > entry->refresh_time) { dprintk("refreshing an entry.\n"); entry->entry_state = INGRESS_REFRESHING; } } entry = entry->next; } read_unlock_bh(&client->ingress_lock); } static void in_destroy_cache(struct mpoa_client *mpc) { write_lock_irq(&mpc->ingress_lock); while (mpc->in_cache != NULL) mpc->in_ops->remove_entry(mpc->in_cache, mpc); write_unlock_irq(&mpc->ingress_lock); } static eg_cache_entry *eg_cache_get_by_cache_id(__be32 cache_id, struct mpoa_client *mpc) { eg_cache_entry *entry; read_lock_irq(&mpc->egress_lock); entry = mpc->eg_cache; while (entry != NULL) { if (entry->ctrl_info.cache_id == cache_id) { refcount_inc(&entry->use); read_unlock_irq(&mpc->egress_lock); return entry; } entry = entry->next; } read_unlock_irq(&mpc->egress_lock); return NULL; } /* This can be called from any context since it saves CPU flags */ static eg_cache_entry *eg_cache_get_by_tag(__be32 tag, struct mpoa_client *mpc) { unsigned long flags; eg_cache_entry *entry; read_lock_irqsave(&mpc->egress_lock, flags); entry = mpc->eg_cache; while (entry != NULL) { if (entry->ctrl_info.tag == tag) { refcount_inc(&entry->use); read_unlock_irqrestore(&mpc->egress_lock, flags); return entry; } entry = entry->next; } read_unlock_irqrestore(&mpc->egress_lock, flags); return NULL; } /* This can be called from any context since it saves CPU flags */ static eg_cache_entry *eg_cache_get_by_vcc(struct atm_vcc *vcc, struct mpoa_client *mpc) { unsigned long flags; eg_cache_entry *entry; read_lock_irqsave(&mpc->egress_lock, flags); entry = mpc->eg_cache; while (entry != NULL) { if (entry->shortcut == 
vcc) { refcount_inc(&entry->use); read_unlock_irqrestore(&mpc->egress_lock, flags); return entry; } entry = entry->next; } read_unlock_irqrestore(&mpc->egress_lock, flags); return NULL; } static eg_cache_entry *eg_cache_get_by_src_ip(__be32 ipaddr, struct mpoa_client *mpc) { eg_cache_entry *entry; read_lock_irq(&mpc->egress_lock); entry = mpc->eg_cache; while (entry != NULL) { if (entry->latest_ip_addr == ipaddr) { refcount_inc(&entry->use); read_unlock_irq(&mpc->egress_lock); return entry; } entry = entry->next; } read_unlock_irq(&mpc->egress_lock); return NULL; } static void eg_cache_put(eg_cache_entry *entry) { if (refcount_dec_and_test(&entry->use)) { kfree_sensitive(entry); } } /* * This should be called with write lock on */ static void eg_cache_remove_entry(eg_cache_entry *entry, struct mpoa_client *client) { struct atm_vcc *vcc; struct k_message msg; vcc = entry->shortcut; dprintk("removing an egress entry.\n"); if (entry->prev != NULL) entry->prev->next = entry->next; else client->eg_cache = entry->next; if (entry->next != NULL) entry->next->prev = entry->prev; client->eg_ops->put(entry); if (client->in_cache == NULL && client->eg_cache == NULL) { msg.type = STOP_KEEP_ALIVE_SM; msg_to_mpoad(&msg, client); } /* Check if the ingress side still uses this VCC */ if (vcc != NULL) { in_cache_entry *in_entry = client->in_ops->get_by_vcc(vcc, client); if (in_entry != NULL) { client->in_ops->put(in_entry); return; } vcc_release_async(vcc, -EPIPE); } } static eg_cache_entry *eg_cache_add_entry(struct k_message *msg, struct mpoa_client *client) { eg_cache_entry *entry = kzalloc_obj(eg_cache_entry); if (entry == NULL) { pr_info("out of memory\n"); return NULL; } dprintk("adding an egress entry, ip = %pI4, this should be our IP\n", &msg->content.eg_info.eg_dst_ip); refcount_set(&entry->use, 1); dprintk("new_eg_cache_entry: about to lock\n"); write_lock_irq(&client->egress_lock); entry->next = client->eg_cache; entry->prev = NULL; if (client->eg_cache != NULL) 
client->eg_cache->prev = entry; client->eg_cache = entry; memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN); entry->ctrl_info = msg->content.eg_info; entry->time = ktime_get_seconds(); entry->entry_state = EGRESS_RESOLVED; dprintk("new_eg_cache_entry cache_id %u\n", ntohl(entry->ctrl_info.cache_id)); dprintk("mps_ip = %pI4\n", &entry->ctrl_info.mps_ip); refcount_inc(&entry->use); write_unlock_irq(&client->egress_lock); dprintk("new_eg_cache_entry: unlocked\n"); return entry; } static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time) { entry->time = ktime_get_seconds(); entry->entry_state = EGRESS_RESOLVED; entry->ctrl_info.holding_time = holding_time; } static void clear_expired(struct mpoa_client *client) { eg_cache_entry *entry, *next_entry; time64_t now; struct k_message msg; now = ktime_get_seconds(); write_lock_irq(&client->egress_lock); entry = client->eg_cache; while (entry != NULL) { next_entry = entry->next; if ((now - entry->time) > entry->ctrl_info.holding_time) { msg.type = SND_EGRESS_PURGE; msg.content.eg_info = entry->ctrl_info; dprintk("egress_cache: holding time expired, cache_id = %u.\n", ntohl(entry->ctrl_info.cache_id)); msg_to_mpoad(&msg, client); client->eg_ops->remove_entry(entry, client); } entry = next_entry; } write_unlock_irq(&client->egress_lock); } static void eg_destroy_cache(struct mpoa_client *mpc) { write_lock_irq(&mpc->egress_lock); while (mpc->eg_cache != NULL) mpc->eg_ops->remove_entry(mpc->eg_cache, mpc); write_unlock_irq(&mpc->egress_lock); } static const struct in_cache_ops ingress_ops = { .add_entry = in_cache_add_entry, .get = in_cache_get, .get_with_mask = in_cache_get_with_mask, .get_by_vcc = in_cache_get_by_vcc, .put = in_cache_put, .remove_entry = in_cache_remove_entry, .cache_hit = cache_hit, .clear_count = clear_count_and_expired, .check_resolving = check_resolving_entries, .refresh = refresh_entries, .destroy_cache = in_destroy_cache }; static const struct eg_cache_ops 
egress_ops = { .add_entry = eg_cache_add_entry, .get_by_cache_id = eg_cache_get_by_cache_id, .get_by_tag = eg_cache_get_by_tag, .get_by_vcc = eg_cache_get_by_vcc, .get_by_src_ip = eg_cache_get_by_src_ip, .put = eg_cache_put, .remove_entry = eg_cache_remove_entry, .update = update_eg_cache_entry, .clear_expired = clear_expired, .destroy_cache = eg_destroy_cache }; void atm_mpoa_init_cache(struct mpoa_client *mpc) { mpc->in_ops = &ingress_ops; mpc->eg_ops = &egress_ops; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 /* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Public Key Signature Algorithm
 *
 * Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au>
 */
#ifndef _CRYPTO_SIG_H
#define _CRYPTO_SIG_H

#include <linux/crypto.h>

/**
 * struct crypto_sig - user-instantiated objects which encapsulate
 * algorithms and core processing logic
 *
 * @base:	Common crypto API algorithm data structure
 *
 * The generic transform is the only member; crypto_sig_tfm() and
 * __crypto_sig_tfm() convert between the two views.
 */
struct crypto_sig {
	struct crypto_tfm base;
};

/**
 * struct sig_alg - generic public key signature algorithm
 *
 * @sign:	Function performs a sign operation as defined by public key
 *		algorithm. On success, the signature size is returned.
 *		Optional.
 * @verify:	Function performs a complete verify operation as defined by
 *		public key algorithm, returning verification status. Optional.
 * @set_pub_key: Function invokes the algorithm specific set public key
 *		function, which knows how to decode and interpret
 *		the BER encoded public key and parameters. Mandatory.
 * @set_priv_key: Function invokes the algorithm specific set private key
 *		function, which knows how to decode and interpret
 *		the BER encoded private key and parameters. Optional.
 * @key_size:	Function returns key size. Mandatory.
 * @digest_size: Function returns maximum digest size. Optional.
 * @max_size:	Function returns maximum signature size. Optional.
 * @init:	Initialize the cryptographic transformation object.
 *		This function is used to initialize the cryptographic
 *		transformation object. This function is called only once at
 *		the instantiation time, right after the transformation context
 *		was allocated. In case the cryptographic hardware has some
 *		special requirements which need to be handled by software, this
 *		function shall check for the precise requirement of the
 *		transformation and put any software fallbacks in place.
 * @exit:	Deinitialize the cryptographic transformation object. This is a
 *		counterpart to @init, used to remove various changes set in
 *		@init.
 *
 * @base:	Common crypto API algorithm data structure
 *
 * Every callback receives the owning &struct crypto_sig as its first
 * argument. The inline crypto_sig_*() wrappers in this header call the
 * callbacks without a NULL check.
 */
struct sig_alg {
	/* Signature operations: buffers are passed with explicit lengths. */
	int (*sign)(struct crypto_sig *tfm,
		    const void *src, unsigned int slen,
		    void *dst, unsigned int dlen);
	int (*verify)(struct crypto_sig *tfm,
		      const void *src, unsigned int slen,
		      const void *digest, unsigned int dlen);

	/* Key installation. */
	int (*set_pub_key)(struct crypto_sig *tfm,
			   const void *key, unsigned int keylen);
	int (*set_priv_key)(struct crypto_sig *tfm,
			    const void *key, unsigned int keylen);

	/* Size queries. */
	unsigned int (*key_size)(struct crypto_sig *tfm);
	unsigned int (*digest_size)(struct crypto_sig *tfm);
	unsigned int (*max_size)(struct crypto_sig *tfm);

	/* Transform set-up and tear-down. */
	int (*init)(struct crypto_sig *tfm);
	void (*exit)(struct crypto_sig *tfm);

	/* Generic algorithm descriptor; __crypto_sig_alg() maps it back. */
	struct crypto_alg base;
};

/**
 * DOC: Generic Public Key Signature API
 *
 * The Public Key Signature API is used with the algorithms of type
 * CRYPTO_ALG_TYPE_SIG (listed as type "sig" in /proc/crypto)
 */

/**
 * crypto_alloc_sig() - allocate signature tfm handle
 * @alg_name: is the cra_name / name or cra_driver_name / driver
name of the * signing algorithm e.g. "ecdsa" * @type: specifies the type of the algorithm * @mask: specifies the mask for the algorithm * * Allocate a handle for public key signature algorithm. The returned struct * crypto_sig is the handle that is required for any subsequent * API invocation for signature operations. * * Return: allocated handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_sig *crypto_alloc_sig(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_sig_tfm(struct crypto_sig *tfm) { return &tfm->base; } static inline struct crypto_sig *__crypto_sig_tfm(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_sig, base); } static inline struct sig_alg *__crypto_sig_alg(struct crypto_alg *alg) { return container_of(alg, struct sig_alg, base); } static inline struct sig_alg *crypto_sig_alg(struct crypto_sig *tfm) { return __crypto_sig_alg(crypto_sig_tfm(tfm)->__crt_alg); } /** * crypto_free_sig() - free signature tfm handle * * @tfm: signature tfm handle allocated with crypto_alloc_sig() * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_sig(struct crypto_sig *tfm) { crypto_destroy_tfm(tfm, crypto_sig_tfm(tfm)); } /** * crypto_sig_keysize() - Get key size * * Function returns the key size in bits. * Function assumes that the key is already set in the transformation. If this * function is called without a setkey or with a failed setkey, you may end up * in a NULL dereference. * * @tfm: signature tfm handle allocated with crypto_alloc_sig() */ static inline unsigned int crypto_sig_keysize(struct crypto_sig *tfm) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->key_size(tfm); } /** * crypto_sig_digestsize() - Get maximum digest size * * Function returns the maximum digest size in bytes. * Function assumes that the key is already set in the transformation. 
If this * function is called without a setkey or with a failed setkey, you may end up * in a NULL dereference. * * @tfm: signature tfm handle allocated with crypto_alloc_sig() */ static inline unsigned int crypto_sig_digestsize(struct crypto_sig *tfm) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->digest_size(tfm); } /** * crypto_sig_maxsize() - Get maximum signature size * * Function returns the maximum signature size in bytes. * Function assumes that the key is already set in the transformation. If this * function is called without a setkey or with a failed setkey, you may end up * in a NULL dereference. * * @tfm: signature tfm handle allocated with crypto_alloc_sig() */ static inline unsigned int crypto_sig_maxsize(struct crypto_sig *tfm) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->max_size(tfm); } /** * crypto_sig_sign() - Invoke signing operation * * Function invokes the specific signing operation for a given algorithm * * @tfm: signature tfm handle allocated with crypto_alloc_sig() * @src: source buffer * @slen: source length * @dst: destination obuffer * @dlen: destination length * * Return: signature size on success; error code in case of error */ static inline int crypto_sig_sign(struct crypto_sig *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->sign(tfm, src, slen, dst, dlen); } /** * crypto_sig_verify() - Invoke signature verification * * Function invokes the specific signature verification operation * for a given algorithm. * * @tfm: signature tfm handle allocated with crypto_alloc_sig() * @src: source buffer * @slen: source length * @digest: digest * @dlen: digest length * * Return: zero on verification success; error code in case of error. 
*/ static inline int crypto_sig_verify(struct crypto_sig *tfm, const void *src, unsigned int slen, const void *digest, unsigned int dlen) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->verify(tfm, src, slen, digest, dlen); } /** * crypto_sig_set_pubkey() - Invoke set public key operation * * Function invokes the algorithm specific set key function, which knows * how to decode and interpret the encoded key and parameters * * @tfm: tfm handle * @key: BER encoded public key, algo OID, paramlen, BER encoded * parameters * @keylen: length of the key (not including other data) * * Return: zero on success; error code in case of error */ static inline int crypto_sig_set_pubkey(struct crypto_sig *tfm, const void *key, unsigned int keylen) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->set_pub_key(tfm, key, keylen); } /** * crypto_sig_set_privkey() - Invoke set private key operation * * Function invokes the algorithm specific set key function, which knows * how to decode and interpret the encoded key and parameters * * @tfm: tfm handle * @key: BER encoded private key, algo OID, paramlen, BER encoded * parameters * @keylen: length of the key (not including other data) * * Return: zero on success; error code in case of error */ static inline int crypto_sig_set_privkey(struct crypto_sig *tfm, const void *key, unsigned int keylen) { struct sig_alg *alg = crypto_sig_alg(tfm); return alg->set_priv_key(tfm, key, keylen); } #endif
28 28 28 19 28 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 // 
SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/syscalls.h>
#include <linux/time_namespace.h>

#include "futex.h"

/*
 * Support for robust futexes: the kernel cleans up held futexes at
 * thread exit time.
 *
 * Implementation: user-space maintains a per-thread list of locks it
 * is holding. Upon do_exit(), the kernel carefully walks this list,
 * and marks all locks that are owned by this thread with the
 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
 * always manipulated with the lock held, so the list is private and
 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
 * field, to allow the kernel to clean up if the thread dies after
 * acquiring the lock, but just before it could have added itself to
 * the list. There can only be one such pending lock.
 */

/**
 * sys_set_robust_list() - Set the robust-futex list head of a task
 * @head:	pointer to the list-head
 * @len:	length of the list-head, as userspace expects
 *
 * Return: 0 on success, -EINVAL if @len is not sizeof(*head).
 */
SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
		size_t, len)
{
	/*
	 * The kernel knows only one size for now:
	 */
	if (unlikely(len != sizeof(*head)))
		return -EINVAL;

	current->robust_list = head;

	return 0;
}

/*
 * Return the robust list head pointer recorded in @p: the compat one when
 * @compat is true (and CONFIG_COMPAT is enabled), the native one otherwise.
 */
static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat)
{
#ifdef CONFIG_COMPAT
	if (compat)
		return p->compat_robust_list;
#endif
	return p->robust_list;
}

/*
 * Common backend of the native and compat get_robust_list() syscalls.
 *
 * @pid:	task to query, looked up with find_task_by_vpid(); zero means
 *		the current task
 * @compat:	select the compat list head instead of the native one
 *
 * Return: the user-space list head pointer of the task, or an ERR_PTR():
 * -ESRCH if no task matches @pid, -EPERM if ptrace_may_access() denies
 * read access, or the error returned by down_read_killable().
 */
static void __user *futex_get_robust_list_common(int pid, bool compat)
{
	struct task_struct *p = current;
	void __user *head;
	int ret;

	/*
	 * The task lookup and the reference grab happen inside the RCU
	 * guard; the reference is what keeps @p usable after the guard ends.
	 */
	scoped_guard(rcu) {
		if (pid) {
			p = find_task_by_vpid(pid);
			if (!p)
				return (void __user *)ERR_PTR(-ESRCH);
		}
		get_task_struct(p);
	}

	/*
	 * Hold exec_update_lock to serialize with concurrent exec()
	 * so ptrace_may_access() is checked against stable credentials
	 */
	ret = down_read_killable(&p->signal->exec_update_lock);
	if (ret)
		goto err_put;

	ret = -EPERM;
	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
		goto err_unlock;

	head = futex_task_robust_list(p, compat);

	up_read(&p->signal->exec_update_lock);
	put_task_struct(p);

	return head;

	/* Error unwinding mirrors the acquisition order above. */
err_unlock:
	up_read(&p->signal->exec_update_lock);
err_put:
	put_task_struct(p);

	return (void __user *)ERR_PTR(ret);
}

/**
 * sys_get_robust_list() - Get the robust-futex list head of a task
 * @pid:	pid of the process [zero for current task]
 * @head_ptr:	pointer to a list-head pointer, the kernel fills it in
 * @len_ptr:	pointer to a length field, the kernel fills in the header size
 *
 * Return: 0 on success, the error from the task lookup/permission check,
 * or -EFAULT if writing to @len_ptr or @head_ptr fails.
 */
SYSCALL_DEFINE3(get_robust_list, int, pid,
		struct robust_list_head __user * __user *, head_ptr,
		size_t __user *, len_ptr)
{
	struct robust_list_head __user *head = futex_get_robust_list_common(pid, false);

	if (IS_ERR(head))
		return PTR_ERR(head);

	if (put_user(sizeof(*head), len_ptr))
		return -EFAULT;
	return put_user(head, head_ptr);
}

/*
 * do_futex - dispatch a futex operation to its implementation
 * @uaddr:	first futex address
 * @op:		full op word; the command is op & FUTEX_CMD_MASK and the
 *		remaining bits are turned into flags by futex_to_flags()
 * @val:	command specific value
 * @timeout:	already converted timeout, or NULL
 * @uaddr2:	second futex address for the requeue/wake-op commands
 * @val2:	second value for the requeue/wake-op commands
 * @val3:	third value (bitset or compare value, depending on the command)
 *
 * Return: the result of the selected operation, or -ENOSYS for an unknown
 * command or for FLAGS_CLOCKRT combined with a command other than
 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI or FUTEX_LOCK_PI2.
 */
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
		u32 __user *uaddr2, u32 val2, u32 val3)
{
	unsigned int flags = futex_to_flags(op);
	int cmd = op & FUTEX_CMD_MASK;

	if (flags & FLAGS_CLOCKRT) {
		if (cmd != FUTEX_WAIT_BITSET &&
		    cmd != FUTEX_WAIT_REQUEUE_PI &&
		    cmd != FUTEX_LOCK_PI2)
			return -ENOSYS;
	}

	switch (cmd) {
	case FUTEX_WAIT:
		/* Plain wait is a bitset wait matching any bit. */
		val3 = FUTEX_BITSET_MATCH_ANY;
		fallthrough;
	case FUTEX_WAIT_BITSET:
		return futex_wait(uaddr, flags, val, timeout, val3);
	case FUTEX_WAKE:
		/* Plain wake is a bitset wake matching any bit. */
		val3 = FUTEX_BITSET_MATCH_ANY;
		fallthrough;
	case FUTEX_WAKE_BITSET:
		return futex_wake(uaddr, flags, val, val3);
	case FUTEX_REQUEUE:
		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
	case FUTEX_CMP_REQUEUE:
		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
	case FUTEX_WAKE_OP:
		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
	case FUTEX_LOCK_PI:
		/* FUTEX_LOCK_PI always runs with FLAGS_CLOCKRT set. */
		flags |= FLAGS_CLOCKRT;
		fallthrough;
	case FUTEX_LOCK_PI2:
		return futex_lock_pi(uaddr, flags, timeout, 0);
	case FUTEX_UNLOCK_PI:
		return futex_unlock_pi(uaddr, flags);
	case FUTEX_TRYLOCK_PI:
		return futex_lock_pi(uaddr, flags, NULL, 1);
	case FUTEX_WAIT_REQUEUE_PI:
		val3 = FUTEX_BITSET_MATCH_ANY;
		return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
					     uaddr2);
	case FUTEX_CMP_REQUEUE_PI:
		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
	}
	return -ENOSYS;
}

/*
 * Tell whether @cmd is one of the commands for which the syscall entry
 * points interpret the utime argument as a timeout.
 */
static __always_inline bool futex_cmd_has_timeout(u32 cmd)
{
	switch (cmd) {
	case FUTEX_WAIT:
	case FUTEX_LOCK_PI:
	case FUTEX_LOCK_PI2:
	case FUTEX_WAIT_BITSET:
	case FUTEX_WAIT_REQUEUE_PI:
		return true;
	}
	return false;
}

/*
 * futex_init_timeout - validate a timespec and convert it to a ktime_t
 * @cmd:	futex command (op & FUTEX_CMD_MASK)
 * @op:		full op word, checked for FUTEX_CLOCK_REALTIME
 * @ts:		timespec already copied into kernel memory
 * @t:		resulting timeout
 *
 * For FUTEX_WAIT the value is added to ktime_get(). For every other command
 * except FUTEX_LOCK_PI, and only when FUTEX_CLOCK_REALTIME is not set in
 * @op, the value is passed through timens_ktime_to_host(CLOCK_MONOTONIC).
 *
 * Return: 0 on success, -EINVAL if timespec64_valid() rejects @ts.
 */
static __always_inline int
futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
{
	if (!timespec64_valid(ts))
		return -EINVAL;

	*t = timespec64_to_ktime(*ts);
	if (cmd == FUTEX_WAIT)
		*t = ktime_add_safe(ktime_get(), *t);
	else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
		*t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
	return 0;
}

/*
 * sys_futex - the classic multiplexed futex syscall
 *
 * @utime is only read as a timeout when it is non-NULL and the command is
 * one accepted by futex_cmd_has_timeout(). Its raw value is additionally
 * handed to do_futex() as val2, which the requeue and wake-op commands use.
 *
 * Return: the do_futex() result, -EFAULT if the timeout cannot be copied
 * (or fault injection triggers), or the futex_init_timeout() error.
 */
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
		const struct __kernel_timespec __user *, utime,
		u32 __user *, uaddr2, u32, val3)
{
	int ret, cmd = op & FUTEX_CMD_MASK;
	ktime_t t, *tp = NULL;
	struct timespec64 ts;

	if (utime && futex_cmd_has_timeout(cmd)) {
		if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
			return -EFAULT;
		if (get_timespec64(&ts, utime))
			return -EFAULT;
		ret = futex_init_timeout(cmd, op, &ts, &t);
		if (ret)
			return ret;
		tp = &t;
	}

	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}

/**
 * futex_parse_waitv - Parse a waitv array from userspace
 * @futexv:	Kernel side list of waiters to be filled
 * @uwaitv:     Userspace list to be parsed
 * @nr_futexes: Length of futexv
 * @wake:	Wake to call when futex is woken
 * @wake_data:	Data for the wake handler
 *
 * Return: Error code on failure, 0 on success
 */
int futex_parse_waitv(struct futex_vector *futexv,
		      struct futex_waitv __user *uwaitv,
		      unsigned int nr_futexes, futex_wake_fn *wake,
		      void *wake_data)
{
	struct futex_waitv aux;
	unsigned int i;

	for (i = 0; i < nr_futexes; i++) {
		unsigned int flags;

		if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
			return -EFAULT;

		/* Unknown flag bits or a non-zero reserved field are rejected. */
		if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
			return -EINVAL;

		flags = futex2_to_flags(aux.flags);
		if (!futex_flags_valid(flags))
			return -EINVAL;

		if (!futex_validate_input(flags, aux.val))
			return -EINVAL;

		futexv[i].w.flags = flags;
		futexv[i].w.val = aux.val;
		futexv[i].w.uaddr = aux.uaddr;
		/* Start from the template, then install the wake handler. */
		futexv[i].q = futex_q_init;
		futexv[i].q.wake = wake;
		futexv[i].q.wake_data = wake_data;
	}

	return 0;
}

/*
 * futex2_setup_timeout - read a futex2 timeout and arm the sleeper for it
 * @timeout:	user pointer to the timeout, may be NULL
 * @clockid:	CLOCK_REALTIME or CLOCK_MONOTONIC
 * @to:		sleeper to set up
 *
 * With a NULL @timeout nothing is done and @to is left untouched. The
 * callers in this file pair a successful setup with
 * futex2_destroy_timeout().
 *
 * Return: 0 on success, -EINVAL for any other @clockid or an invalid
 * timespec, -EFAULT if the timeout cannot be copied from userspace.
 */
static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
				clockid_t clockid, struct hrtimer_sleeper *to)
{
	int flag_clkid = 0, flag_init = 0;
	struct timespec64 ts;
	ktime_t time;
	int ret;

	if (!timeout)
		return 0;

	if (clockid == CLOCK_REALTIME) {
		flag_clkid = FLAGS_CLOCKRT;
		flag_init = FUTEX_CLOCK_REALTIME;
	}

	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
		return -EINVAL;

	if (get_timespec64(&ts, timeout))
		return -EFAULT;

	/*
	 * Since there's no opcode for futex_waitv, use
	 * FUTEX_WAIT_BITSET that uses absolute timeout as well
	 */
	ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
	if (ret)
		return ret;

	futex_setup_timer(&time, to, flag_clkid, 0);
	return 0;
}

/* Cancel the timer of @to and release its on-stack hrtimer. */
static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
{
	hrtimer_cancel(&to->timer);
	destroy_hrtimer_on_stack(&to->timer);
}

/**
 * sys_futex_waitv - Wait on a list of futexes
 * @waiters:    List of futexes to wait on
 * @nr_futexes: Length of @waiters
 * @flags:      Must be zero; this syscall supports no flags for now
 * @timeout:    Optional absolute timeout.
 * @clockid:    Clock to be used for the timeout, realtime or monotonic.
 *
 * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
 * if a futex_wake() is performed at any uaddr. The syscall returns immediately
 * if any waiter has *uaddr != val. *timeout is an optional timeout value for
 * the operation. Each waiter has individual flags. The `flags` argument of
 * the syscall itself is reserved and must be zero; the clock used for the
 * timeout is selected with @clockid. Flags for private futexes, sizes, etc.
 * should be used on the individual flags of each waiter.
 *
 * Returns the array index of one of the woken futexes. No further information
 * is provided: any number of other futexes may also have been woken by the
 * same event, and if more than one futex was woken, the returned index may
 * refer to any one of them. (It is not necessarily the futex with the
 * smallest index, nor the one most recently woken, nor...)
 */
SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
		unsigned int, nr_futexes, unsigned int, flags,
		struct __kernel_timespec __user *, timeout, clockid_t, clockid)
{
	struct hrtimer_sleeper to;
	struct futex_vector *futexv;
	int ret;

	/* This syscall supports no flags for now */
	if (flags)
		return -EINVAL;

	if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
		return -EINVAL;

	/* From here on a non-NULL timeout means @to must be torn down. */
	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
		return ret;

	futexv = kzalloc_objs(*futexv, nr_futexes);
	if (!futexv) {
		ret = -ENOMEM;
		goto destroy_timer;
	}

	ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
				NULL);
	if (!ret)
		ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);

	kfree(futexv);

destroy_timer:
	if (timeout)
		futex2_destroy_timeout(&to);
	return ret;
}

/*
 * sys_futex_wake - Wake a number of futexes
 * @uaddr:	Address of the futex(es) to wake
 * @mask:	bitmask
 * @nr:		Number of the futexes to wake
 * @flags:	FUTEX2 flags
 *
 * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
 * futex2 family of calls.
 *
 * Returns -EINVAL for flag bits outside FUTEX2_VALID_MASK, for flags rejected
 * by futex_flags_valid(), or for a @mask rejected by futex_validate_input();
 * otherwise the futex_wake() result.
 */

SYSCALL_DEFINE4(futex_wake,
		void __user *, uaddr,
		unsigned long, mask,
		int, nr,
		unsigned int, flags)
{
	if (flags & ~FUTEX2_VALID_MASK)
		return -EINVAL;

	flags = futex2_to_flags(flags);
	if (!futex_flags_valid(flags))
		return -EINVAL;

	if (!futex_validate_input(flags, mask))
		return -EINVAL;

	return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
}

/*
 * sys_futex_wait - Wait on a futex
 * @uaddr:	Address of the futex to wait on
 * @val:	Value of @uaddr
 * @mask:	bitmask
 * @flags:	FUTEX2 flags
 * @timeout:	Optional absolute timeout
 * @clockid:	Clock to be used for the timeout, realtime or monotonic
 *
 * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
 * futex2 family of calls.
 */

SYSCALL_DEFINE6(futex_wait,
		void __user *, uaddr,
		unsigned long, val,
		unsigned long, mask,
		unsigned int, flags,
		struct __kernel_timespec __user *, timeout,
		clockid_t, clockid)
{
	struct hrtimer_sleeper to;
	int ret;

	if (flags & ~FUTEX2_VALID_MASK)
		return -EINVAL;

	flags = futex2_to_flags(flags);
	if (!futex_flags_valid(flags))
		return -EINVAL;

	/* Both the expected value and the mask are validated against flags. */
	if (!futex_validate_input(flags, val) ||
	    !futex_validate_input(flags, mask))
		return -EINVAL;

	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
		return ret;

	ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);

	if (timeout)
		futex2_destroy_timeout(&to);

	return ret;
}

/*
 * sys_futex_requeue - Requeue a waiter from one futex to another
 * @waiters:	array describing the source and destination futex
 * @flags:	unused, must be zero
 * @nr_wake:	number of futexes to wake
 * @nr_requeue:	number of futexes to requeue
 *
 * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
 * futex2 family of calls.
 *
 * Exactly two entries of @waiters are parsed: entry 0 is passed to
 * futex_requeue() as the first futex and supplies the compare value, entry 1
 * is passed as the second futex.
 */

SYSCALL_DEFINE4(futex_requeue,
		struct futex_waitv __user *, waiters,
		unsigned int, flags,
		int, nr_wake,
		int, nr_requeue)
{
	struct futex_vector futexes[2];
	u32 cmpval;
	int ret;

	if (flags)
		return -EINVAL;

	if (!waiters)
		return -EINVAL;

	ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
	if (ret)
		return ret;

	cmpval = futexes[0].w.val;

	return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
			     u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
			     nr_wake, nr_requeue, &cmpval, 0);
}

#ifdef CONFIG_COMPAT
/*
 * Compat variant of set_robust_list(): records @head in
 * current->compat_robust_list after checking @len against the compat
 * list-head size.
 */
COMPAT_SYSCALL_DEFINE2(set_robust_list,
		struct compat_robust_list_head __user *, head,
		compat_size_t, len)
{
	if (unlikely(len != sizeof(*head)))
		return -EINVAL;

	current->compat_robust_list = head;

	return 0;
}

/*
 * Compat variant of get_robust_list(): same lookup and permission checks as
 * the native call, but reports the compat list head, converted with
 * ptr_to_compat(), and the compat header size.
 */
COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
			compat_uptr_t __user *, head_ptr,
			compat_size_t __user *, len_ptr)
{
	struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true);

	if (IS_ERR(head))
		return PTR_ERR(head);

	if (put_user(sizeof(*head), len_ptr))
		return -EFAULT;
	return put_user(ptr_to_compat(head), head_ptr);
}
#endif /* CONFIG_COMPAT */

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Variant of sys_futex() taking the timeout as a struct old_timespec32;
 * after conversion with get_old_timespec32() the handling is the same as in
 * sys_futex(), without the should_fail_futex() check.
 */
SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
		const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
		u32, val3)
{
	int ret, cmd = op & FUTEX_CMD_MASK;
	ktime_t t, *tp = NULL;
	struct timespec64 ts;

	if (utime && futex_cmd_has_timeout(cmd)) {
		if (get_old_timespec32(&ts, utime))
			return -EFAULT;
		ret = futex_init_timeout(cmd, op, &ts, &t);
		if (ret)
			return ret;
		tp = &t;
	}

	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
#endif /* CONFIG_COMPAT_32BIT_TIME */
8 5 9 7 7 9 7 5 5 60 3 57 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 // SPDX-License-Identifier: GPL-2.0-only /* * test/set flag bits stored in conntrack extension area. * * (C) 2013 Astaro GmbH & Co KG */ #include <linux/export.h> #include <linux/types.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> static int replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; do { old = *address; tmp = (old & mask) ^ new; if (old == tmp) return 0; } while (cmpxchg(address, old, tmp) != old); return 1; } int nf_connlabels_replace(struct nf_conn *ct, const u32 *data, const u32 *mask, unsigned int words32) { struct nf_conn_labels *labels; unsigned int size, i; int changed = 0; u32 *dst; labels = nf_ct_labels_find(ct); if (!labels) return -ENOSPC; size = sizeof(labels->bits); if (size < (words32 * sizeof(u32))) words32 = size / sizeof(u32); dst = (u32 *) labels->bits; for (i = 0; i < words32; i++) changed |= replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]); size /= sizeof(u32); for (i = words32; i < size; i++) /* pad */ replace_u32(&dst[i], 0, 0); if (changed) nf_conntrack_event_cache(IPCT_LABEL, ct); return 0; } EXPORT_SYMBOL_GPL(nf_connlabels_replace); int nf_connlabels_get(struct net *net, unsigned int bits) { int v; if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long)) return -ERANGE; BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX); v = atomic_inc_return_relaxed(&net->ct.labels_used); WARN_ON_ONCE(v <= 0); return 0; } EXPORT_SYMBOL_GPL(nf_connlabels_get); void nf_connlabels_put(struct net *net) { int v = atomic_dec_return_relaxed(&net->ct.labels_used); WARN_ON_ONCE(v < 0); } EXPORT_SYMBOL_GPL(nf_connlabels_put);
4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 
2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 
2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 
3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 
3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 
3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847
/*
   BlueZ - Bluetooth protocol stack for Linux
   Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2 as
   published by the Free Software Foundation;

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
   SOFTWARE IS DISCLAIMED.
*/

#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <crypto/aes.h>
#include <crypto/hash.h>
#include <crypto/kpp.h>
#include <crypto/utils.h>

#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
#include <net/bluetooth/mgmt.h>

#include "ecdh_helper.h"
#include "smp.h"

/* Reach the struct smp_dev through hdev->smp_data, an l2cap_chan whose
 * ->data member holds it.
 */
#define SMP_DEV(hdev) \
	((struct smp_dev *)((struct l2cap_chan *)((hdev)->smp_data))->data)

/* Low-level debug macros to be used for stuff that we don't want
 * accidentally in dmesg, i.e. the values of the various crypto keys
 * and the inputs & outputs of crypto functions.
 */
#ifdef DEBUG
#define SMP_DBG(fmt, ...) printk(KERN_DEBUG "%s: " fmt, __func__, \
				 ##__VA_ARGS__)
#else
#define SMP_DBG(fmt, ...) no_printk(KERN_DEBUG "%s: " fmt, __func__, \
				    ##__VA_ARGS__)
#endif

/* Set bit @code in the allow_cmd bitmask of @smp */
#define SMP_ALLOW_CMD(smp, code)	set_bit(code, &smp->allow_cmd)

/* Keys which are not distributed with Secure Connections */
#define SMP_SC_NO_DIST (SMP_DIST_ENC_KEY | SMP_DIST_LINK_KEY)

#define SMP_TIMEOUT	secs_to_jiffies(30)

#define ID_ADDR_TIMEOUT	msecs_to_jiffies(200)

/* 0x3f when the device has HCI_SC_ENABLED set, 0x07 otherwise */
#define AUTH_REQ_MASK(dev)	(hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \
				 0x3f : 0x07)
#define KEY_DIST_MASK		0x07

/* Maximum message length that can be passed to aes_cmac */
#define CMAC_MSG_MAX	80

/* NOTE(review): presumably bit numbers for smp_chan.flags; the users are
 * outside this part of the file - confirm.
 */
enum {
	SMP_FLAG_TK_VALID,
	SMP_FLAG_CFM_PENDING,
	SMP_FLAG_MITM_AUTH,
	SMP_FLAG_COMPLETE,
	SMP_FLAG_INITIATOR,
	SMP_FLAG_SC,
	SMP_FLAG_REMOTE_PK,
	SMP_FLAG_DEBUG_KEY,
	SMP_FLAG_WAIT_USER,
	SMP_FLAG_DHKEY_PENDING,
	SMP_FLAG_REMOTE_OOB,
	SMP_FLAG_LOCAL_OOB,
	SMP_FLAG_CT2,
};

struct smp_dev {
	/* Secure Connections OOB data */
	bool			local_oob;
	u8			local_pk[64];
	u8			local_rand[16];
	bool			debug_key;

	struct crypto_shash	*tfm_cmac;
	struct crypto_kpp	*tfm_ecdh;
};

struct smp_chan {
	struct l2cap_conn	*conn;
	struct delayed_work	security_timer;
	unsigned long           allow_cmd; /* Bitmask of allowed commands */

	u8		preq[7]; /* SMP Pairing Request */
	u8		prsp[7]; /* SMP Pairing Response */
	u8		prnd[16]; /* SMP Pairing Random (local) */
	u8		rrnd[16]; /* SMP Pairing Random (remote) */
	u8		pcnf[16]; /* SMP Pairing Confirm */
	u8		tk[16]; /* SMP Temporary Key */
	u8		rr[16]; /* Remote OOB ra/rb value */
	u8		lr[16]; /* Local OOB ra/rb value */
	u8		enc_key_size;
	u8		remote_key_dist;
	bdaddr_t	id_addr;
	u8		id_addr_type;
	u8		irk[16];
	struct smp_csrk	*csrk;
	struct smp_csrk	*responder_csrk;
	struct smp_ltk	*ltk;
	struct smp_ltk	*responder_ltk;
	struct smp_irk	*remote_irk;
	u8		*link_key;
	unsigned long	flags;
	u8		method;
	u8		passkey_round;

	/* Secure Connections variables */
	u8			local_pk[64];
	u8			remote_pk[64];
	u8			dhkey[32];
	u8			mackey[16];

	struct crypto_shash	*tfm_cmac;
	struct crypto_kpp	*tfm_ecdh;
};

/* These debug key values are defined in the SMP section of the core
 * specification. debug_pk is the public debug key and debug_sk the
 * private debug key.
 */
static const u8 debug_pk[64] = {
		0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
		0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef,
		0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e,
		0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20,

		0x8b, 0xd2, 0x89, 0x15, 0xd0, 0x8e, 0x1c, 0x74,
		0x24, 0x30, 0xed, 0x8f, 0xc2, 0x45, 0x63, 0x76,
		0x5c, 0x15, 0x52, 0x5a, 0xbf, 0x9a, 0x32, 0x63,
		0x6d, 0xeb, 0x2a, 0x65, 0x49, 0x9c, 0x80, 0xdc,
};

static const u8 debug_sk[32] = {
		0xbd, 0x1a, 0x3c, 0xcd, 0xa6, 0xb8, 0x99, 0x58,
		0x99, 0xb7, 0x40, 0xeb, 0x7b, 0x60, 0xff, 0x4a,
		0x50, 0x3f, 0x10, 0xd2, 0xe3, 0xb3, 0xc9, 0x74,
		0x38, 0x5f, 0xc5, 0xa3, 0xd4, 0xf6, 0x49, 0x3f,
};

/* Copy @len bytes from @src to @dst in reverse byte order. Not usable in
 * place: with src == dst bytes would be overwritten before being read.
 */
static inline void swap_buf(const u8 *src, u8 *dst, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		dst[len - 1 - i] = src[i];
}

/* The following functions map to the LE SC SMP crypto functions
 * AES-CMAC, f4, f5, f6, g2 and h6.
 */

/* Compute the digest of message @m (@len bytes) under the 16 byte key @k
 * using @tfm and store the 16 byte result in @mac.
 *
 * Key and message are byte-reversed before they are handed to the crypto
 * API and the digest is byte-reversed again on the way out.
 *
 * Returns 0 on success, -EFBIG if @len exceeds CMAC_MSG_MAX, -EINVAL if
 * @tfm is NULL, or the error reported by crypto_shash_setkey() or
 * crypto_shash_tfm_digest().
 */
static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m,
		    size_t len, u8 mac[16])
{
	uint8_t tmp[16], mac_msb[16], msg_msb[CMAC_MSG_MAX];
	int err;

	/* msg_msb is a fixed-size stack buffer, so bound the input first */
	if (len > CMAC_MSG_MAX)
		return -EFBIG;

	if (!tfm) {
		BT_ERR("tfm %p", tfm);
		return -EINVAL;
	}

	/* Swap key and message from LSB to MSB */
	swap_buf(k, tmp, 16);
	swap_buf(m, msg_msb, len);

	SMP_DBG("msg (len %zu) %*phN", len, (int) len, m);
	SMP_DBG("key %16phN", k);

	err = crypto_shash_setkey(tfm, tmp, 16);
	if (err) {
		BT_ERR("cipher setkey failed: %d", err);
		return err;
	}

	err = crypto_shash_tfm_digest(tfm, msg_msb, len, mac_msb);
	if (err) {
		BT_ERR("Hash computation error %d", err);
		return err;
	}

	swap_buf(mac_msb, mac, 16);

	SMP_DBG("mac %16phN", mac);

	return 0;
}

/* f4: @res = aes_cmac() keyed with @x over the 65 byte message built as
 * m[0] = @z, m[1..32] = @v, m[33..64] = @u.
 *
 * Returns 0 on success or the aes_cmac() error.
 */
static int smp_f4(struct crypto_shash *tfm_cmac, const u8 u[32],
		  const u8 v[32], const u8 x[16], u8 z, u8 res[16])
{
	u8 m[65];
	int err;

	SMP_DBG("u %32phN", u);
	SMP_DBG("v %32phN", v);
	SMP_DBG("x %16phN z %02x", x, z);

	m[0] = z;
	memcpy(m + 1, v, 32);
	memcpy(m + 33, u, 32);

	err = aes_cmac(tfm_cmac, x, m, sizeof(m), res);
	if (err)
		return err;

	SMP_DBG("res %16phN", res);

	return err;
}

/* f5: derive @mackey and @ltk.
 *
 * First an intermediate key t is computed as aes_cmac() of @w keyed with
 * the fixed salt. Then the same 53 byte message (length, @a2, @a1, @n2,
 * @n1, btle, counter) is run through aes_cmac() keyed with t twice: with
 * counter 0 to produce @mackey and with counter 1 to produce @ltk.
 *
 * Returns 0 on success or the first aes_cmac() error.
 */
static int smp_f5(struct crypto_shash *tfm_cmac, const u8 w[32],
		  const u8 n1[16], const u8 n2[16], const u8 a1[7],
		  const u8 a2[7], u8 mackey[16], u8 ltk[16])
{
	/* The btle, salt and length "magic" values are as defined in
	 * the SMP section of the Bluetooth core specification. In ASCII
	 * the btle value ends up being 'btle'. The salt is just a
	 * random number whereas length is the value 256 in little
	 * endian format.
	 */
	const u8 btle[4] = { 0x65, 0x6c, 0x74, 0x62 };
	const u8 salt[16] = { 0xbe, 0x83, 0x60, 0x5a, 0xdb, 0x0b, 0x37, 0x60,
			      0x38, 0xa5, 0xf5, 0xaa, 0x91, 0x83, 0x88, 0x6c };
	const u8 length[2] = { 0x00, 0x01 };
	u8 m[53], t[16];
	int err;

	SMP_DBG("w %32phN", w);
	SMP_DBG("n1 %16phN n2 %16phN", n1, n2);
	SMP_DBG("a1 %7phN a2 %7phN", a1, a2);

	err = aes_cmac(tfm_cmac, salt, w, 32, t);
	if (err)
		return err;

	SMP_DBG("t %16phN", t);

	memcpy(m, length, 2);
	memcpy(m + 2, a2, 7);
	memcpy(m + 9, a1, 7);
	memcpy(m + 16, n2, 16);
	memcpy(m + 32, n1, 16);
	memcpy(m + 48, btle, 4);

	m[52] = 0; /* Counter */

	err = aes_cmac(tfm_cmac, t, m, sizeof(m), mackey);
	if (err)
		return err;

	SMP_DBG("mackey %16phN", mackey);

	/* Only the counter byte differs between the two derivations */
	m[52] = 1; /* Counter */

	err = aes_cmac(tfm_cmac, t, m, sizeof(m), ltk);
	if (err)
		return err;

	SMP_DBG("ltk %16phN", ltk);

	return 0;
}

/* f6: @res = aes_cmac() keyed with @w over the 65 byte message built as
 * @a2 (7) | @a1 (7) | @io_cap (3) | @r (16) | @n2 (16) | @n1 (16), in
 * increasing offset order.
 *
 * Returns 0 on success or the aes_cmac() error.
 */
static int smp_f6(struct crypto_shash *tfm_cmac, const u8 w[16],
		  const u8 n1[16], const u8 n2[16], const u8 r[16],
		  const u8 io_cap[3], const u8 a1[7], const u8 a2[7],
		  u8 res[16])
{
	u8 m[65];
	int err;

	SMP_DBG("w %16phN", w);
	SMP_DBG("n1 %16phN n2 %16phN", n1, n2);
	SMP_DBG("r %16phN io_cap %3phN a1 %7phN a2 %7phN", r, io_cap, a1, a2);

	memcpy(m, a2, 7);
	memcpy(m + 7, a1, 7);
	memcpy(m + 14, io_cap, 3);
	memcpy(m + 17, r, 16);
	memcpy(m + 33, n2, 16);
	memcpy(m + 49, n1, 16);

	err = aes_cmac(tfm_cmac, w, m, sizeof(m), res);
	if (err)
		return err;

	SMP_DBG("res %16phN", res);

	return err;
}

static int smp_g2(struct crypto_shash *tfm_cmac, const u8 u[32], const u8 v[32], const u8 x[16], 
const u8 y[16], u32 *val) { u8 m[80], tmp[16]; int err; SMP_DBG("u %32phN", u); SMP_DBG("v %32phN", v); SMP_DBG("x %16phN y %16phN", x, y); memcpy(m, y, 16); memcpy(m + 16, v, 32); memcpy(m + 48, u, 32); err = aes_cmac(tfm_cmac, x, m, sizeof(m), tmp); if (err) return err; *val = get_unaligned_le32(tmp); *val %= 1000000; SMP_DBG("val %06u", *val); return 0; } static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16], const u8 key_id[4], u8 res[16]) { int err; SMP_DBG("w %16phN key_id %4phN", w, key_id); err = aes_cmac(tfm_cmac, w, key_id, 4, res); if (err) return err; SMP_DBG("res %16phN", res); return err; } static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16], const u8 salt[16], u8 res[16]) { int err; SMP_DBG("w %16phN salt %16phN", w, salt); err = aes_cmac(tfm_cmac, salt, w, 16, res); if (err) return err; SMP_DBG("res %16phN", res); return err; } /* The following functions map to the legacy SMP crypto functions e, c1, * s1 and ah. */ static int smp_e(const u8 *k, u8 *r) { struct aes_enckey aes; uint8_t tmp[16], data[16]; int err; SMP_DBG("k %16phN r %16phN", k, r); /* The most significant octet of key corresponds to k[0] */ swap_buf(k, tmp, 16); err = aes_prepareenckey(&aes, tmp, 16); if (err) { BT_ERR("cipher setkey failed: %d", err); return err; } /* Most significant octet of plaintextData corresponds to data[0] */ swap_buf(r, data, 16); aes_encrypt(&aes, data, data); /* Most significant octet of encryptedData corresponds to data[0] */ swap_buf(data, r, 16); SMP_DBG("r %16phN", r); memzero_explicit(&aes, sizeof(aes)); return err; } static int smp_c1(const u8 k[16], const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat, const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16]) { u8 p1[16], p2[16]; int err; SMP_DBG("k %16phN r %16phN", k, r); SMP_DBG("iat %u ia %6phN rat %u ra %6phN", _iat, ia, _rat, ra); SMP_DBG("preq %7phN pres %7phN", preq, pres); memset(p1, 0, 16); /* p1 = pres || preq || _rat || _iat */ p1[0] = _iat; p1[1] = _rat; 
memcpy(p1 + 2, preq, 7);
	memcpy(p1 + 9, pres, 7);

	SMP_DBG("p1 %16phN", p1);

	/* res = r XOR p1 */
	crypto_xor_cpy(res, r, p1, sizeof(p1));

	/* res = e(k, res) */
	err = smp_e(k, res);
	if (err) {
		BT_ERR("Encrypt data error");
		return err;
	}

	/* p2 = padding || ia || ra */
	memcpy(p2, ra, 6);
	memcpy(p2 + 6, ia, 6);
	memset(p2 + 12, 0, 4);

	SMP_DBG("p2 %16phN", p2);

	/* res = res XOR p2 */
	crypto_xor(res, p2, sizeof(p2));

	/* res = e(k, res) */
	err = smp_e(k, res);
	if (err)
		BT_ERR("Encrypt data error");

	return err;
}

/* s1: build a 16-byte block from the first 8 bytes of @r2 followed by
 * the first 8 bytes of @r1, then encrypt it in place under @k with
 * smp_e(). The result is left in @_r.
 *
 * Returns 0 on success or the error from smp_e().
 */
static int smp_s1(const u8 k[16],
		  const u8 r1[16], const u8 r2[16], u8 _r[16])
{
	int err;

	/* Just least significant octets from r1 and r2 are considered */
	memcpy(_r, r2, 8);
	memcpy(_r + 8, r1, 8);

	err = smp_e(k, _r);
	if (err)
		BT_ERR("Encrypt data error");

	return err;
}

/* ah: encrypt the zero-padded 3-byte value @r under @irk with smp_e()
 * and hand back part of the result in @res.
 */
static int smp_ah(const u8 irk[16], const u8 r[3], u8 res[3])
{
	u8 _res[16];
	int err;

	/* r' = padding || r */
	memcpy(_res, r, 3);
	memset(_res + 3, 0, 13);

	err = smp_e(irk, _res);
	if (err) {
		BT_ERR("Encrypt error");
		return err;
	}

	/* The output of the random address function ah is:
	 *	ah(k, r) = e(k, r') mod 2^24
	 * The output of the security function e is then truncated to 24 bits
	 * by taking the least significant 24 bits of the output of e as the
	 * result of ah.
*/ memcpy(res, _res, 3); return 0; } bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], const bdaddr_t *bdaddr) { struct l2cap_chan *chan = hdev->smp_data; u8 hash[3]; int err; if (!chan || !chan->data) return false; bt_dev_dbg(hdev, "RPA %pMR IRK %*phN", bdaddr, 16, irk); err = smp_ah(irk, &bdaddr->b[3], hash); if (err) return false; return !crypto_memneq(bdaddr->b, hash, 3); } int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa) { struct l2cap_chan *chan = hdev->smp_data; int err; if (!chan || !chan->data) return -EOPNOTSUPP; get_random_bytes(&rpa->b[3], 3); rpa->b[5] &= 0x3f; /* Clear two most significant bits */ rpa->b[5] |= 0x40; /* Set second most significant bit */ err = smp_ah(irk, &rpa->b[3], rpa->b); if (err < 0) return err; bt_dev_dbg(hdev, "RPA %pMR", rpa); return 0; } int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) { struct l2cap_chan *chan = hdev->smp_data; struct smp_dev *smp; int err; if (!chan || !chan->data) return -EOPNOTSUPP; smp = chan->data; if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { bt_dev_dbg(hdev, "Using debug keys"); err = set_ecdh_privkey(smp->tfm_ecdh, debug_sk); if (err) return err; memcpy(smp->local_pk, debug_pk, 64); smp->debug_key = true; } else { while (true) { /* Generate key pair for Secure Connections */ err = generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk); if (err) return err; /* This is unlikely, but we need to check that * we didn't accidentally generate a debug key. 
*/ if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; } smp->debug_key = false; } SMP_DBG("OOB Public Key X: %32phN", smp->local_pk); SMP_DBG("OOB Public Key Y: %32phN", smp->local_pk + 32); get_random_bytes(smp->local_rand, 16); err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->local_pk, smp->local_rand, 0, hash); if (err < 0) return err; memcpy(rand, smp->local_rand, 16); smp->local_oob = true; return 0; } static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; struct kvec iv[2]; struct msghdr msg; if (!chan) return; bt_dev_dbg(conn->hcon->hdev, "code 0x%2.2x", code); iv[0].iov_base = &code; iv[0].iov_len = 1; iv[1].iov_base = data; iv[1].iov_len = len; memset(&msg, 0, sizeof(msg)); iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iv, 2, 1 + len); l2cap_chan_send(chan, &msg, 1 + len, NULL); if (!chan->data) return; smp = chan->data; cancel_delayed_work_sync(&smp->security_timer); schedule_delayed_work(&smp->security_timer, SMP_TIMEOUT); } static u8 authreq_to_seclevel(u8 authreq) { if (authreq & SMP_AUTH_MITM) { if (authreq & SMP_AUTH_SC) return BT_SECURITY_FIPS; else return BT_SECURITY_HIGH; } else { return BT_SECURITY_MEDIUM; } } static __u8 seclevel_to_authreq(__u8 sec_level) { switch (sec_level) { case BT_SECURITY_FIPS: case BT_SECURITY_HIGH: return SMP_AUTH_MITM | SMP_AUTH_BONDING; case BT_SECURITY_MEDIUM: return SMP_AUTH_BONDING; default: return SMP_AUTH_NONE; } } static void build_pairing_cmd(struct l2cap_conn *conn, struct smp_cmd_pairing *req, struct smp_cmd_pairing *rsp, __u8 authreq) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; u8 local_dist = 0, remote_dist = 0, oob_flag = SMP_OOB_NOT_PRESENT; if (hci_dev_test_flag(hdev, HCI_BONDABLE)) { local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; authreq |= SMP_AUTH_BONDING; } else { 
authreq &= ~SMP_AUTH_BONDING;
	}

	if (hci_dev_test_flag(hdev, HCI_RPA_RESOLVING))
		remote_dist |= SMP_DIST_ID_KEY;

	if (hci_dev_test_flag(hdev, HCI_PRIVACY))
		local_dist |= SMP_DIST_ID_KEY;

	if (hci_dev_test_flag(hdev, HCI_SC_ENABLED) &&
	    (authreq & SMP_AUTH_SC)) {
		struct oob_data *oob_data;
		u8 bdaddr_type;

		if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
			local_dist |= SMP_DIST_LINK_KEY;
			remote_dist |= SMP_DIST_LINK_KEY;
		}

		if (hcon->dst_type == ADDR_LE_DEV_PUBLIC)
			bdaddr_type = BDADDR_LE_PUBLIC;
		else
			bdaddr_type = BDADDR_LE_RANDOM;

		/* Pick up remote OOB data stored for this peer, if any */
		oob_data = hci_find_remote_oob_data(hdev, &hcon->dst,
						    bdaddr_type);
		if (oob_data && oob_data->present) {
			set_bit(SMP_FLAG_REMOTE_OOB, &smp->flags);
			oob_flag = SMP_OOB_PRESENT;
			memcpy(smp->rr, oob_data->rand256, 16);
			memcpy(smp->pcnf, oob_data->hash256, 16);
			SMP_DBG("OOB Remote Confirmation: %16phN", smp->pcnf);
			SMP_DBG("OOB Remote Random: %16phN", smp->rr);
		}

	} else {
		authreq &= ~SMP_AUTH_SC;
	}

	/* No response buffer: fill in the request with our own values */
	if (rsp == NULL) {
		req->io_capability = conn->hcon->io_capability;
		req->oob_flag = oob_flag;
		req->max_key_size = hdev->le_max_key_size;
		req->init_key_dist = local_dist;
		req->resp_key_dist = remote_dist;
		req->auth_req = (authreq & AUTH_REQ_MASK(hdev));

		smp->remote_key_dist = remote_dist;
		return;
	}

	/* Otherwise fill in the response, limiting each key distribution
	 * field to what the request asked for.
	 */
	rsp->io_capability = conn->hcon->io_capability;
	rsp->oob_flag = oob_flag;
	rsp->max_key_size = hdev->le_max_key_size;
	rsp->init_key_dist = req->init_key_dist & remote_dist;
	rsp->resp_key_dist = req->resp_key_dist & local_dist;
	rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev));

	smp->remote_key_dist = rsp->init_key_dist;
}

/* Validate @max_key_size against the pending security level and the
 * device limits. SMP_ENC_KEY_SIZE is returned when the size is rejected.
 */
static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size)
{
	struct l2cap_chan *chan = conn->smp;
	struct hci_dev *hdev = conn->hcon->hdev;
	struct smp_chan *smp = chan->data;

	/* A pending FIPS level only accepts SMP_MAX_ENC_KEY_SIZE */
	if (conn->hcon->pending_sec_level == BT_SECURITY_FIPS &&
	    max_key_size != SMP_MAX_ENC_KEY_SIZE)
		return SMP_ENC_KEY_SIZE;

	if (max_key_size > hdev->le_max_key_size ||
	    max_key_size < SMP_MIN_ENC_KEY_SIZE)
		return SMP_ENC_KEY_SIZE;

	smp->enc_key_size = max_key_size;
return 0;
}

/* Tear down the pairing context attached to the connection's SMP
 * channel: stop the security timer, report the outcome through
 * mgmt_smp_complete(), free the keys and crypto transforms owned by the
 * context, remove keys that must not be kept, detach and free the
 * context, and drop the hci_conn reference.
 *
 * The context must exist (BUG_ON otherwise).
 */
static void smp_chan_destroy(struct l2cap_conn *conn)
{
	struct l2cap_chan *chan = conn->smp;
	struct smp_chan *smp = chan->data;
	struct hci_conn *hcon = conn->hcon;
	bool complete;

	BUG_ON(!smp);

	cancel_delayed_work_sync(&smp->security_timer);

	complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags);
	mgmt_smp_complete(hcon, complete);

	kfree_sensitive(smp->csrk);
	kfree_sensitive(smp->responder_csrk);
	kfree_sensitive(smp->link_key);

	crypto_free_shash(smp->tfm_cmac);
	crypto_free_kpp(smp->tfm_ecdh);

	/* Ensure that we don't leave any debug key around if debug key
	 * support hasn't been explicitly enabled.
	 */
	if (smp->ltk && smp->ltk->type == SMP_LTK_P256_DEBUG &&
	    !hci_dev_test_flag(hcon->hdev, HCI_KEEP_DEBUG_KEYS)) {
		list_del_rcu(&smp->ltk->list);
		kfree_rcu(smp->ltk, rcu);
		/* Cleared so the !complete branch below does not free it again */
		smp->ltk = NULL;
	}

	/* If pairing failed clean up any keys we might have */
	if (!complete) {
		if (smp->ltk) {
			list_del_rcu(&smp->ltk->list);
			kfree_rcu(smp->ltk, rcu);
		}

		if (smp->responder_ltk) {
			list_del_rcu(&smp->responder_ltk->list);
			kfree_rcu(smp->responder_ltk, rcu);
		}

		if (smp->remote_irk) {
			list_del_rcu(&smp->remote_irk->list);
			kfree_rcu(smp->remote_irk, rcu);
		}
	}

	chan->data = NULL;
	kfree_sensitive(smp);
	hci_conn_drop(hcon);
}

/* Abort pairing: send Pairing Failed with @reason (skipped when @reason
 * is 0), report an authentication failure through mgmt and destroy the
 * pairing context if one is attached.
 */
static void smp_failure(struct l2cap_conn *conn, u8 reason)
{
	struct hci_conn *hcon = conn->hcon;
	struct l2cap_chan *chan = conn->smp;

	if (reason)
		smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason),
			     &reason);

	mgmt_auth_failed(hcon, HCI_ERROR_AUTH_FAILURE);

	if (chan->data)
		smp_chan_destroy(conn);
}

/* Values stored in smp_chan.method and in the method tables below */
#define JUST_WORKS	0x00
#define JUST_CFM	0x01
#define REQ_PASSKEY	0x02
#define CFM_PASSKEY	0x03
#define REQ_OOB		0x04
#define DSP_PASSKEY	0x05
#define OVERLAP		0xFF

/* Method table used when SMP_FLAG_SC is not set; get_auth_method()
 * indexes it as [remote_io][local_io].
 */
static const u8 gen_method[5][5] = {
	{ JUST_WORKS,  JUST_CFM,    REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY },
	{ JUST_WORKS,  JUST_CFM,    REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY },
	{ CFM_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY },
	{ JUST_WORKS,  JUST_CFM,    JUST_WORKS,  JUST_WORKS, JUST_CFM    },
	{ CFM_PASSKEY,
CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, OVERLAP     },
};

/* Method table used when SMP_FLAG_SC is set; get_auth_method() indexes
 * it as [remote_io][local_io].
 */
static const u8 sc_method[5][5] = {
	{ JUST_WORKS,  JUST_CFM,    REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY },
	{ JUST_WORKS,  CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY },
	{ DSP_PASSKEY, DSP_PASSKEY, REQ_PASSKEY, JUST_WORKS, DSP_PASSKEY },
	{ JUST_WORKS,  JUST_CFM,    JUST_WORKS,  JUST_WORKS, JUST_CFM    },
	{ DSP_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY },
};

/* Look up the pairing method for the given IO capabilities in
 * sc_method (when SMP_FLAG_SC is set) or gen_method.
 */
static u8 get_auth_method(struct smp_chan *smp, u8 local_io, u8 remote_io)
{
	/* If either side has unknown io_caps, use JUST_CFM (which gets
	 * converted later to JUST_WORKS if we're initiators).
	 */
	if (local_io > SMP_IO_KEYBOARD_DISPLAY ||
	    remote_io > SMP_IO_KEYBOARD_DISPLAY)
		return JUST_CFM;

	if (test_bit(SMP_FLAG_SC, &smp->flags))
		return sc_method[remote_io][local_io];

	return gen_method[remote_io][local_io];
}

static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth,
						u8 local_io, u8 remote_io)
{
	struct hci_conn *hcon = conn->hcon;
	struct l2cap_chan *chan = conn->smp;
	struct smp_chan *smp = chan->data;
	u32 passkey = 0;
	int ret;

	/* Initialize key for JUST WORKS */
	memset(smp->tk, 0, sizeof(smp->tk));
	clear_bit(SMP_FLAG_TK_VALID, &smp->flags);

	bt_dev_dbg(hcon->hdev, "auth:%u lcl:%u rem:%u", auth, local_io,
		   remote_io);

	/* If neither side wants MITM, either "just" confirm an incoming
	 * request or use just-works for outgoing ones. The JUST_CFM
	 * will be converted to JUST_WORKS if necessary later in this
	 * function. If either side has MITM look up the method from the
	 * table.
*/
	if (!(auth & SMP_AUTH_MITM))
		smp->method = JUST_CFM;
	else
		smp->method = get_auth_method(smp, local_io, remote_io);

	/* Don't confirm locally initiated pairing attempts */
	if (smp->method == JUST_CFM && test_bit(SMP_FLAG_INITIATOR,
						&smp->flags))
		smp->method = JUST_WORKS;

	/* Don't bother user space with no IO capabilities */
	if (smp->method == JUST_CFM &&
	    hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT)
		smp->method = JUST_WORKS;

	/* If Just Works, Continue with Zero TK and ask user-space for
	 * confirmation */
	if (smp->method == JUST_WORKS) {
		ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst,
						hcon->type,
						hcon->dst_type,
						passkey, 1);
		if (ret)
			return ret;
		/* Remember that a user reply is outstanding */
		set_bit(SMP_FLAG_WAIT_USER, &smp->flags);
		return 0;
	}

	/* If this function is used for SC -> legacy fallback we
	 * can only recover the just-works case.
	 */
	if (test_bit(SMP_FLAG_SC, &smp->flags))
		return -EINVAL;

	/* Not Just Works/Confirm results in MITM Authentication */
	if (smp->method != JUST_CFM) {
		set_bit(SMP_FLAG_MITM_AUTH, &smp->flags);
		if (hcon->pending_sec_level < BT_SECURITY_HIGH)
			hcon->pending_sec_level = BT_SECURITY_HIGH;
	}

	/* If both devices have Keyboard-Display I/O, the initiator
	 * Confirms and the responder Enters the passkey.
	 */
	if (smp->method == OVERLAP) {
		if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
			smp->method = CFM_PASSKEY;
		else
			smp->method = REQ_PASSKEY;
	}

	/* Generate random passkey.
*/ if (smp->method == CFM_PASSKEY) { memset(smp->tk, 0, sizeof(smp->tk)); get_random_bytes(&passkey, sizeof(passkey)); passkey %= 1000000; put_unaligned_le32(passkey, smp->tk); bt_dev_dbg(hcon->hdev, "PassKey: %u", passkey); set_bit(SMP_FLAG_TK_VALID, &smp->flags); } if (smp->method == REQ_PASSKEY) ret = mgmt_user_passkey_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type); else if (smp->method == JUST_CFM) ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, 1); else ret = mgmt_user_passkey_notify(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, 0); return ret; } static u8 smp_confirm(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct smp_cmd_pairing_confirm cp; int ret; bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); ret = smp_c1(smp->tk, smp->prnd, smp->preq, smp->prsp, conn->hcon->init_addr_type, &conn->hcon->init_addr, conn->hcon->resp_addr_type, &conn->hcon->resp_addr, cp.confirm_val); if (ret) return SMP_UNSPECIFIED; clear_bit(SMP_FLAG_CFM_PENDING, &smp->flags); smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); else SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); return 0; } static u8 smp_random(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; u8 confirm[16]; int ret; bt_dev_dbg(conn->hcon->hdev, "conn %p %s", conn, test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? 
"initiator" : "responder"); ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp, hcon->init_addr_type, &hcon->init_addr, hcon->resp_addr_type, &hcon->resp_addr, confirm); if (ret) return SMP_UNSPECIFIED; if (crypto_memneq(smp->pcnf, confirm, sizeof(smp->pcnf))) { bt_dev_err(hcon->hdev, "pairing failed " "(confirmation values mismatch)"); return SMP_CONFIRM_FAILED; } if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { u8 stk[16]; __le64 rand = 0; __le16 ediv = 0; smp_s1(smp->tk, smp->rrnd, smp->prnd, stk); if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return SMP_UNSPECIFIED; hci_le_start_enc(hcon, ediv, rand, stk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; set_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); } else { u8 stk[16], auth; __le64 rand = 0; __le16 ediv = 0; smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); smp_s1(smp->tk, smp->prnd, smp->rrnd, stk); if (hcon->pending_sec_level == BT_SECURITY_HIGH) auth = 1; else auth = 0; /* Even though there's no _RESPONDER suffix this is the * responder STK we're adding for later lookup (the initiator * STK never needs to be stored). */ hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, SMP_STK, auth, stk, smp->enc_key_size, ediv, rand); } return 0; } static void smp_notify_keys(struct l2cap_conn *conn) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_cmd_pairing *req = (void *) &smp->preq[1]; struct smp_cmd_pairing *rsp = (void *) &smp->prsp[1]; bool persistent; if (hcon->type == ACL_LINK) { if (hcon->key_type == HCI_LK_DEBUG_COMBINATION) persistent = false; else persistent = !test_bit(HCI_CONN_FLUSH_KEY, &hcon->flags); } else { /* The LTKs, IRKs and CSRKs should be persistent only if * both sides had the bonding bit set in their * authentication requests. 
*/
		persistent = !!((req->auth_req & rsp->auth_req) &
				SMP_AUTH_BONDING);
	}

	if (smp->remote_irk) {
		mgmt_new_irk(hdev, smp->remote_irk, persistent);

		/* Now that user space can be considered to know the
		 * identity address track the connection based on it
		 * from now on (assuming this is an LE link).
		 */
		if (hcon->type == LE_LINK) {
			bacpy(&hcon->dst, &smp->remote_irk->bdaddr);
			hcon->dst_type = smp->remote_irk->addr_type;
			/* Use a short delay to make sure the new address is
			 * propagated _before_ the channels.
			 */
			queue_delayed_work(hdev->workqueue,
					   &conn->id_addr_timer,
					   ID_ADDR_TIMEOUT);
		}
	}

	/* Each key below is stamped with the connection's current
	 * destination address before being reported through mgmt.
	 */
	if (smp->csrk) {
		smp->csrk->bdaddr_type = hcon->dst_type;
		bacpy(&smp->csrk->bdaddr, &hcon->dst);
		mgmt_new_csrk(hdev, smp->csrk, persistent);
	}

	if (smp->responder_csrk) {
		smp->responder_csrk->bdaddr_type = hcon->dst_type;
		bacpy(&smp->responder_csrk->bdaddr, &hcon->dst);
		mgmt_new_csrk(hdev, smp->responder_csrk, persistent);
	}

	if (smp->ltk) {
		smp->ltk->bdaddr_type = hcon->dst_type;
		bacpy(&smp->ltk->bdaddr, &hcon->dst);
		mgmt_new_ltk(hdev, smp->ltk, persistent);
	}

	if (smp->responder_ltk) {
		smp->responder_ltk->bdaddr_type = hcon->dst_type;
		bacpy(&smp->responder_ltk->bdaddr, &hcon->dst);
		mgmt_new_ltk(hdev, smp->responder_ltk, persistent);
	}

	if (smp->link_key) {
		struct link_key *key;
		u8 type;

		/* Link key type: debug, authenticated P-256 (FIPS level)
		 * or unauthenticated P-256.
		 */
		if (test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags))
			type = HCI_LK_DEBUG_COMBINATION;
		else if (hcon->sec_level == BT_SECURITY_FIPS)
			type = HCI_LK_AUTH_COMBINATION_P256;
		else
			type = HCI_LK_UNAUTH_COMBINATION_P256;

		key = hci_add_link_key(hdev, smp->conn->hcon, &hcon->dst,
				       smp->link_key, type, 0, &persistent);
		if (key) {
			mgmt_new_link_key(hdev, key, persistent);

			/* Don't keep debug keys around if the relevant
			 * flag is not set.
*/ if (!hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS) && key->type == HCI_LK_DEBUG_COMBINATION) { list_del_rcu(&key->list); kfree_rcu(key, rcu); } } } } static void sc_add_ltk(struct smp_chan *smp) { struct hci_conn *hcon = smp->conn->hcon; u8 key_type, auth; if (test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags)) key_type = SMP_LTK_P256_DEBUG; else key_type = SMP_LTK_P256; if (hcon->pending_sec_level == BT_SECURITY_FIPS) auth = 1; else auth = 0; smp->ltk = hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, key_type, auth, smp->tk, smp->enc_key_size, 0, 0); } static void sc_generate_link_key(struct smp_chan *smp) { /* From core spec. Spells out in ASCII as 'lebr'. */ const u8 lebr[4] = { 0x72, 0x62, 0x65, 0x6c }; smp->link_key = kzalloc(16, GFP_KERNEL); if (!smp->link_key) return; if (test_bit(SMP_FLAG_CT2, &smp->flags)) { /* SALT = 0x000000000000000000000000746D7031 */ const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 }; if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) { kfree_sensitive(smp->link_key); smp->link_key = NULL; return; } } else { /* From core spec. Spells out in ASCII as 'tmp1'. */ const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 }; if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) { kfree_sensitive(smp->link_key); smp->link_key = NULL; return; } } if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) { kfree_sensitive(smp->link_key); smp->link_key = NULL; return; } } static void smp_allow_key_dist(struct smp_chan *smp) { /* Allow the first expected phase 3 PDU. The rest of the PDUs * will be allowed in each PDU handler to ensure we receive * them in the correct order. */ if (smp->remote_key_dist & SMP_DIST_ENC_KEY) SMP_ALLOW_CMD(smp, SMP_CMD_ENCRYPT_INFO); else if (smp->remote_key_dist & SMP_DIST_ID_KEY) SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO); else if (smp->remote_key_dist & SMP_DIST_SIGN) SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); } static void sc_generate_ltk(struct smp_chan *smp) { /* From core spec. Spells out in ASCII as 'brle'. 
*/
	const u8 brle[4] = { 0x65, 0x6c, 0x72, 0x62 };
	struct hci_conn *hcon = smp->conn->hcon;
	struct hci_dev *hdev = hcon->hdev;
	struct link_key *key;

	key = hci_find_link_key(hdev, &hcon->dst);
	if (!key) {
		bt_dev_err(hdev, "no Link Key found to generate LTK");
		return;
	}

	/* An LTK derived from a debug link key is flagged as debug too */
	if (key->type == HCI_LK_DEBUG_COMBINATION)
		set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);

	if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
		/* SALT = 0x000000000000000000000000746D7032 */
		const u8 salt[16] = { 0x32, 0x70, 0x6d, 0x74 };

		if (smp_h7(smp->tfm_cmac, key->val, salt, smp->tk))
			return;
	} else {
		/* From core spec. Spells out in ASCII as 'tmp2'. */
		const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 };

		if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk))
			return;
	}

	if (smp_h6(smp->tfm_cmac, smp->tk, brle, smp->tk))
		return;

	sc_add_ltk(smp);
}

static void smp_distribute_keys(struct smp_chan *smp)
{
	struct smp_cmd_pairing *req, *rsp;
	struct l2cap_conn *conn = smp->conn;
	struct hci_conn *hcon = conn->hcon;
	struct hci_dev *hdev = hcon->hdev;
	__u8 *keydist;

	bt_dev_dbg(hdev, "conn %p", conn);

	rsp = (void *) &smp->prsp[1];

	/* The responder sends its keys first */
	if (test_bit(SMP_FLAG_INITIATOR, &smp->flags) &&
	    (smp->remote_key_dist & KEY_DIST_MASK)) {
		smp_allow_key_dist(smp);
		return;
	}

	req = (void *) &smp->preq[1];

	/* Work on the key distribution field for our role in the stored
	 * response, limited to what the stored request contains.
	 */
	if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
		keydist = &rsp->init_key_dist;
		*keydist &= req->init_key_dist;
	} else {
		keydist = &rsp->resp_key_dist;
		*keydist &= req->resp_key_dist;
	}

	if (test_bit(SMP_FLAG_SC, &smp->flags)) {
		if (hcon->type == LE_LINK && (*keydist & SMP_DIST_LINK_KEY))
			sc_generate_link_key(smp);
		if (hcon->type == ACL_LINK && (*keydist & SMP_DIST_ENC_KEY))
			sc_generate_ltk(smp);

		/* Clear the keys which are generated but not distributed */
		*keydist &= ~SMP_SC_NO_DIST;
	}

	bt_dev_dbg(hdev, "keydist 0x%x", *keydist);

	if (*keydist & SMP_DIST_ENC_KEY) {
		struct smp_cmd_encrypt_info enc;
		struct smp_cmd_initiator_ident ident;
		struct smp_ltk *ltk;
		u8 authenticated;
		__le16 ediv;
		__le64 rand;

		/* Make sure we
generate only the significant amount of
		 * bytes based on the encryption key size, and set the rest
		 * of the value to zeroes.
		 */
		get_random_bytes(enc.ltk, smp->enc_key_size);
		memset(enc.ltk + smp->enc_key_size, 0,
		       sizeof(enc.ltk) - smp->enc_key_size);

		get_random_bytes(&ediv, sizeof(ediv));
		get_random_bytes(&rand, sizeof(rand));

		smp_send_cmd(conn, SMP_CMD_ENCRYPT_INFO, sizeof(enc), &enc);

		/* Store the key we just sent, together with its ediv/rand */
		authenticated = hcon->sec_level == BT_SECURITY_HIGH;
		ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type,
				  SMP_LTK_RESPONDER, authenticated, enc.ltk,
				  smp->enc_key_size, ediv, rand);
		smp->responder_ltk = ltk;

		ident.ediv = ediv;
		ident.rand = rand;

		smp_send_cmd(conn, SMP_CMD_INITIATOR_IDENT, sizeof(ident),
			     &ident);

		*keydist &= ~SMP_DIST_ENC_KEY;
	}

	if (*keydist & SMP_DIST_ID_KEY) {
		struct smp_cmd_ident_addr_info addrinfo;
		struct smp_cmd_ident_info idinfo;

		memcpy(idinfo.irk, hdev->irk, sizeof(idinfo.irk));

		smp_send_cmd(conn, SMP_CMD_IDENT_INFO, sizeof(idinfo), &idinfo);

		/* The hci_conn contains the local identity address
		 * after the connection has been established.
		 *
		 * This is true even when the connection has been
		 * established using a resolvable random address.
*/
		bacpy(&addrinfo.bdaddr, &hcon->src);
		addrinfo.addr_type = hcon->src_type;

		smp_send_cmd(conn, SMP_CMD_IDENT_ADDR_INFO, sizeof(addrinfo),
			     &addrinfo);

		*keydist &= ~SMP_DIST_ID_KEY;
	}

	if (*keydist & SMP_DIST_SIGN) {
		struct smp_cmd_sign_info sign;
		struct smp_csrk *csrk;

		/* Generate a new random key */
		get_random_bytes(sign.csrk, sizeof(sign.csrk));

		/* A failed allocation is tolerated: the key is still sent,
		 * only the local record (smp->responder_csrk) stays NULL.
		 */
		csrk = kzalloc_obj(*csrk);
		if (csrk) {
			if (hcon->sec_level > BT_SECURITY_MEDIUM)
				csrk->type = MGMT_CSRK_LOCAL_AUTHENTICATED;
			else
				csrk->type = MGMT_CSRK_LOCAL_UNAUTHENTICATED;
			memcpy(csrk->val, sign.csrk, sizeof(csrk->val));
		}
		smp->responder_csrk = csrk;

		smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign);

		*keydist &= ~SMP_DIST_SIGN;
	}

	/* If there are still keys to be received wait for them */
	if (smp->remote_key_dist & KEY_DIST_MASK) {
		smp_allow_key_dist(smp);
		return;
	}

	/* Nothing left to send or receive: report keys and tear down */
	set_bit(SMP_FLAG_COMPLETE, &smp->flags);
	smp_notify_keys(conn);

	smp_chan_destroy(conn);
}

/* Work handler of smp_chan.security_timer: disconnect the link with
 * HCI_ERROR_AUTH_FAILURE.
 */
static void smp_timeout(struct work_struct *work)
{
	struct smp_chan *smp = container_of(work, struct smp_chan,
					    security_timer.work);
	struct l2cap_conn *conn = smp->conn;

	bt_dev_dbg(conn->hcon->hdev, "conn %p", conn);

	hci_disconnect(conn->hcon, HCI_ERROR_AUTH_FAILURE);
}

/* Allocate a pairing context for @conn together with its CMAC and ECDH
 * transforms, attach it to the connection's SMP channel, allow Pairing
 * Failed, initialise the security timer and take a reference on the
 * hci_conn.
 *
 * Returns the new context, or NULL if an allocation fails.
 */
static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
{
	struct hci_conn *hcon = conn->hcon;
	struct l2cap_chan *chan = conn->smp;
	struct smp_chan *smp;

	smp = kzalloc_obj(*smp, GFP_ATOMIC);
	if (!smp)
		return NULL;

	smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
	if (IS_ERR(smp->tfm_cmac)) {
		bt_dev_err(hcon->hdev, "Unable to create CMAC crypto context");
		goto zfree_smp;
	}

	smp->tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0);
	if (IS_ERR(smp->tfm_ecdh)) {
		bt_dev_err(hcon->hdev, "Unable to create ECDH crypto context");
		goto free_shash;
	}

	smp->conn = conn;
	chan->data = smp;

	SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_FAIL);

	INIT_DELAYED_WORK(&smp->security_timer, smp_timeout);

	hci_conn_hold(hcon);

	return smp;

free_shash:
	crypto_free_shash(smp->tfm_cmac);
zfree_smp:
	kfree_sensitive(smp);
return NULL; } static int sc_mackey_and_ltk(struct smp_chan *smp, u8 mackey[16], u8 ltk[16]) { struct hci_conn *hcon = smp->conn->hcon; u8 *na, *nb, a[7], b[7]; if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { na = smp->prnd; nb = smp->rrnd; } else { na = smp->rrnd; nb = smp->prnd; } memcpy(a, &hcon->init_addr, 6); memcpy(b, &hcon->resp_addr, 6); a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; return smp_f5(smp->tfm_cmac, smp->dhkey, na, nb, a, b, mackey, ltk); } static void sc_dhkey_check(struct smp_chan *smp) { struct hci_conn *hcon = smp->conn->hcon; struct smp_cmd_dhkey_check check; u8 a[7], b[7], *local_addr, *remote_addr; u8 io_cap[3], r[16]; memcpy(a, &hcon->init_addr, 6); memcpy(b, &hcon->resp_addr, 6); a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local_addr = a; remote_addr = b; memcpy(io_cap, &smp->preq[1], 3); } else { local_addr = b; remote_addr = a; memcpy(io_cap, &smp->prsp[1], 3); } memset(r, 0, sizeof(r)); if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) put_unaligned_le32(hcon->passkey_notify, r); if (smp->method == REQ_OOB) memcpy(r, smp->rr, 16); smp_f6(smp->tfm_cmac, smp->mackey, smp->prnd, smp->rrnd, r, io_cap, local_addr, remote_addr, check.e); smp_send_cmd(smp->conn, SMP_CMD_DHKEY_CHECK, sizeof(check), &check); } static u8 sc_passkey_send_confirm(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; struct smp_cmd_pairing_confirm cfm; u8 r; r = ((hcon->passkey_notify >> smp->passkey_round) & 0x01); r |= 0x80; get_random_bytes(smp->prnd, sizeof(smp->prnd)); if (smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd, r, cfm.confirm_val)) return SMP_UNSPECIFIED; smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cfm), &cfm); return 0; } static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; u8 
cfm[16], r;

	/* Ignore the PDU if we've already done 20 rounds (0 - 19) */
	if (smp->passkey_round >= 20)
		return 0;

	switch (smp_op) {
	case SMP_CMD_PAIRING_RANDOM:
		/* Recompute the remote confirm value for this round's
		 * passkey bit and compare it with the one received.
		 */
		r = ((hcon->passkey_notify >> smp->passkey_round) & 0x01);
		r |= 0x80;

		if (smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk,
			   smp->rrnd, r, cfm))
			return SMP_UNSPECIFIED;

		if (crypto_memneq(smp->pcnf, cfm, 16))
			return SMP_CONFIRM_FAILED;

		smp->passkey_round++;

		if (smp->passkey_round == 20) {
			/* Generate MacKey and LTK */
			if (sc_mackey_and_ltk(smp, smp->mackey, smp->tk))
				return SMP_UNSPECIFIED;
		}

		/* The round is only complete when the initiator
		 * receives pairing random.
		 */
		if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
			smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
				     sizeof(smp->prnd), smp->prnd);
			if (smp->passkey_round == 20)
				SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
			else
				SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
			return 0;
		}

		/* Start the next round */
		if (smp->passkey_round != 20)
			return sc_passkey_round(smp, 0);

		/* Passkey rounds are complete - start DHKey Check */
		sc_dhkey_check(smp);
		SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);

		break;

	case SMP_CMD_PAIRING_CONFIRM:
		/* Still waiting for the user: remember the pending confirm */
		if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) {
			set_bit(SMP_FLAG_CFM_PENDING, &smp->flags);
			return 0;
		}

		SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);

		if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
			smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
				     sizeof(smp->prnd), smp->prnd);
			return 0;
		}

		return sc_passkey_send_confirm(smp);

	case SMP_CMD_PUBLIC_KEY:
	default:
		/* Initiating device starts the round */
		if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags))
			return 0;

		bt_dev_dbg(hdev, "Starting passkey round %u",
			   smp->passkey_round + 1);

		SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);

		return sc_passkey_send_confirm(smp);
	}

	return 0;
}

static int sc_user_reply(struct smp_chan *smp, u16 mgmt_op, __le32 passkey)
{
	struct l2cap_conn *conn = smp->conn;
	struct hci_conn *hcon = conn->hcon;
	u8 smp_op;

	clear_bit(SMP_FLAG_WAIT_USER, &smp->flags);

	switch (mgmt_op) {
	case
MGMT_OP_USER_PASSKEY_NEG_REPLY: smp_failure(smp->conn, SMP_PASSKEY_ENTRY_FAILED); return 0; case MGMT_OP_USER_CONFIRM_NEG_REPLY: smp_failure(smp->conn, SMP_NUMERIC_COMP_FAILED); return 0; case MGMT_OP_USER_PASSKEY_REPLY: hcon->passkey_notify = le32_to_cpu(passkey); smp->passkey_round = 0; if (test_and_clear_bit(SMP_FLAG_CFM_PENDING, &smp->flags)) smp_op = SMP_CMD_PAIRING_CONFIRM; else smp_op = 0; if (sc_passkey_round(smp, smp_op)) return -EIO; return 0; } /* Initiator sends DHKey check first */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); } else if (test_and_clear_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags)) { sc_dhkey_check(smp); sc_add_ltk(smp); } return 0; } int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) { struct l2cap_conn *conn = hcon->l2cap_data; struct l2cap_chan *chan; struct smp_chan *smp; u32 value; int err; if (!conn) return -ENOTCONN; bt_dev_dbg(conn->hcon->hdev, ""); chan = conn->smp; if (!chan) return -ENOTCONN; l2cap_chan_lock(chan); if (!chan->data) { err = -ENOTCONN; goto unlock; } smp = chan->data; if (test_bit(SMP_FLAG_SC, &smp->flags)) { err = sc_user_reply(smp, mgmt_op, passkey); goto unlock; } switch (mgmt_op) { case MGMT_OP_USER_PASSKEY_REPLY: value = le32_to_cpu(passkey); memset(smp->tk, 0, sizeof(smp->tk)); bt_dev_dbg(conn->hcon->hdev, "PassKey: %u", value); put_unaligned_le32(value, smp->tk); fallthrough; case MGMT_OP_USER_CONFIRM_REPLY: set_bit(SMP_FLAG_TK_VALID, &smp->flags); break; case MGMT_OP_USER_PASSKEY_NEG_REPLY: case MGMT_OP_USER_CONFIRM_NEG_REPLY: smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); err = 0; goto unlock; default: smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); err = -EOPNOTSUPP; goto unlock; } err = 0; /* If it is our turn to send Pairing Confirm, do so now */ if (test_bit(SMP_FLAG_CFM_PENDING, &smp->flags)) { u8 rsp = smp_confirm(smp); if (rsp) smp_failure(conn, rsp); } unlock: l2cap_chan_unlock(chan); return err; } static void 
build_bredr_pairing_cmd(struct smp_chan *smp, struct smp_cmd_pairing *req, struct smp_cmd_pairing *rsp) { struct l2cap_conn *conn = smp->conn; struct hci_dev *hdev = conn->hcon->hdev; u8 local_dist = 0, remote_dist = 0; if (hci_dev_test_flag(hdev, HCI_BONDABLE)) { local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; } if (hci_dev_test_flag(hdev, HCI_RPA_RESOLVING)) remote_dist |= SMP_DIST_ID_KEY; if (hci_dev_test_flag(hdev, HCI_PRIVACY)) local_dist |= SMP_DIST_ID_KEY; if (!rsp) { memset(req, 0, sizeof(*req)); req->auth_req = SMP_AUTH_CT2; req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; req->max_key_size = conn->hcon->enc_key_size; smp->remote_key_dist = remote_dist; return; } memset(rsp, 0, sizeof(*rsp)); rsp->auth_req = SMP_AUTH_CT2; rsp->max_key_size = conn->hcon->enc_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; smp->remote_key_dist = rsp->init_key_dist; } static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing rsp, *req = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; u8 key_size, auth, sec_level; int ret; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*req)) return SMP_INVALID_PARAMS; if (smp && test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_CMD_NOTSUPP; if (!smp) { smp = smp_chan_create(conn); if (!smp) return SMP_UNSPECIFIED; } /* We didn't start the pairing, so match remote */ auth = req->auth_req & AUTH_REQ_MASK(hdev); if (!hci_dev_test_flag(hdev, HCI_BONDABLE) && (auth & SMP_AUTH_BONDING)) return SMP_PAIRING_NOTSUPP; if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC)) return SMP_AUTH_REQUIREMENTS; smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], req, sizeof(*req)); skb_pull(skb, sizeof(*req)); /* If the remote side's OOB flag is set it means it has * 
successfully received our local OOB data - therefore set the * flag to indicate that local OOB is in use. */ if (req->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob) set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags); /* SMP over BR/EDR requires special treatment */ if (conn->hcon->type == ACL_LINK) { /* We must have a BR/EDR SC link */ if (!test_bit(HCI_CONN_AES_CCM, &conn->hcon->flags) && !hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return SMP_CROSS_TRANSP_NOT_ALLOWED; set_bit(SMP_FLAG_SC, &smp->flags); build_bredr_pairing_cmd(smp, req, &rsp); if (req->auth_req & SMP_AUTH_CT2) set_bit(SMP_FLAG_CT2, &smp->flags); key_size = min(req->max_key_size, rsp.max_key_size); if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], &rsp, sizeof(rsp)); smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp); smp_distribute_keys(smp); return 0; } build_pairing_cmd(conn, req, &rsp, auth); if (rsp.auth_req & SMP_AUTH_SC) { set_bit(SMP_FLAG_SC, &smp->flags); if (rsp.auth_req & SMP_AUTH_CT2) set_bit(SMP_FLAG_CT2, &smp->flags); } if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) sec_level = BT_SECURITY_MEDIUM; else sec_level = authreq_to_seclevel(auth); if (sec_level > conn->hcon->pending_sec_level) conn->hcon->pending_sec_level = sec_level; /* If we need MITM check that it can be achieved */ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { u8 method; method = get_auth_method(smp, conn->hcon->io_capability, req->io_capability); if (method == JUST_WORKS || method == JUST_CFM) return SMP_AUTH_REQUIREMENTS; } key_size = min(req->max_key_size, rsp.max_key_size); if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; get_random_bytes(smp->prnd, sizeof(smp->prnd)); smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], &rsp, sizeof(rsp)); smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), 
&rsp); clear_bit(SMP_FLAG_INITIATOR, &smp->flags); /* Strictly speaking we shouldn't allow Pairing Confirm for the * SC case, however some implementations incorrectly copy RFU auth * req bits from our security request, which may create a false * positive SC enablement. */ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); if (test_bit(SMP_FLAG_SC, &smp->flags)) { SMP_ALLOW_CMD(smp, SMP_CMD_PUBLIC_KEY); /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; /* Wait for Public Key from Initiating Device */ return 0; } /* Request setup of TK */ ret = tk_request(conn, 0, auth, rsp.io_capability, req->io_capability); if (ret) return SMP_UNSPECIFIED; return 0; } static u8 sc_send_public_key(struct smp_chan *smp) { struct hci_dev *hdev = smp->conn->hcon->hdev; bt_dev_dbg(hdev, ""); if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { struct l2cap_chan *chan = hdev->smp_data; struct smp_dev *smp_dev; if (!chan || !chan->data) return SMP_UNSPECIFIED; smp_dev = chan->data; memcpy(smp->local_pk, smp_dev->local_pk, 64); memcpy(smp->lr, smp_dev->local_rand, 16); if (smp_dev->debug_key) set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); goto done; } if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { bt_dev_dbg(hdev, "Using debug keys"); if (set_ecdh_privkey(smp->tfm_ecdh, debug_sk)) return SMP_UNSPECIFIED; memcpy(smp->local_pk, debug_pk, 64); set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); } else { while (true) { /* Generate key pair for Secure Connections */ if (generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk)) return SMP_UNSPECIFIED; /* This is unlikely, but we need to check that * we didn't accidentally generate a debug key. 
*/ if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; } } done: SMP_DBG("Local Public Key X: %32phN", smp->local_pk); SMP_DBG("Local Public Key Y: %32phN", smp->local_pk + 32); smp_send_cmd(smp->conn, SMP_CMD_PUBLIC_KEY, 64, smp->local_pk); return 0; } static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing *req, *rsp = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; u8 key_size, auth; int ret; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*rsp)) return SMP_INVALID_PARAMS; if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_CMD_NOTSUPP; skb_pull(skb, sizeof(*rsp)); req = (void *) &smp->preq[1]; key_size = min(req->max_key_size, rsp->max_key_size); if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; auth = rsp->auth_req & AUTH_REQ_MASK(hdev); if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC)) return SMP_AUTH_REQUIREMENTS; /* If the remote side's OOB flag is set it means it has * successfully received our local OOB data - therefore set the * flag to indicate that local OOB is in use. */ if (rsp->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob) set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags); smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], rsp, sizeof(*rsp)); /* Update remote key distribution in case the remote cleared * some bits that we had enabled in our request. 
*/ smp->remote_key_dist &= rsp->resp_key_dist; if ((req->auth_req & SMP_AUTH_CT2) && (auth & SMP_AUTH_CT2)) set_bit(SMP_FLAG_CT2, &smp->flags); /* For BR/EDR this means we're done and can start phase 3 */ if (conn->hcon->type == ACL_LINK) { /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; smp_distribute_keys(smp); return 0; } if ((req->auth_req & SMP_AUTH_SC) && (auth & SMP_AUTH_SC)) set_bit(SMP_FLAG_SC, &smp->flags); else if (conn->hcon->pending_sec_level > BT_SECURITY_HIGH) conn->hcon->pending_sec_level = BT_SECURITY_HIGH; /* If we need MITM check that it can be achieved */ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { u8 method; method = get_auth_method(smp, req->io_capability, rsp->io_capability); if (method == JUST_WORKS || method == JUST_CFM) return SMP_AUTH_REQUIREMENTS; } get_random_bytes(smp->prnd, sizeof(smp->prnd)); /* Update remote key distribution in case the remote cleared * some bits that we had enabled in our request. 
*/ smp->remote_key_dist &= rsp->resp_key_dist; if (test_bit(SMP_FLAG_SC, &smp->flags)) { /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; SMP_ALLOW_CMD(smp, SMP_CMD_PUBLIC_KEY); return sc_send_public_key(smp); } auth |= req->auth_req; ret = tk_request(conn, 0, auth, req->io_capability, rsp->io_capability); if (ret) return SMP_UNSPECIFIED; set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); /* Can't compose response until we have been confirmed */ if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) return smp_confirm(smp); return 0; } static u8 sc_check_confirm(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; bt_dev_dbg(conn->hcon->hdev, ""); if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) return sc_passkey_round(smp, SMP_CMD_PAIRING_CONFIRM); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); } return 0; } /* Work-around for some implementations that incorrectly copy RFU bits * from our security request and thereby create the impression that * we're doing SC when in fact the remote doesn't support it. 
*/ static int fixup_sc_false_positive(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_cmd_pairing *req, *rsp; u8 auth; /* The issue is only observed when we're in responder role */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_UNSPECIFIED; if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) { bt_dev_err(hdev, "refusing legacy fallback in SC-only mode"); return SMP_UNSPECIFIED; } bt_dev_err(hdev, "trying to fall back to legacy SMP"); req = (void *) &smp->preq[1]; rsp = (void *) &smp->prsp[1]; /* Rebuild key dist flags which may have been cleared for SC */ smp->remote_key_dist = (req->init_key_dist & rsp->resp_key_dist); auth = req->auth_req & AUTH_REQ_MASK(hdev); if (tk_request(conn, 0, auth, rsp->io_capability, req->io_capability)) { bt_dev_err(hdev, "failed to fall back to legacy SMP"); return SMP_UNSPECIFIED; } clear_bit(SMP_FLAG_SC, &smp->flags); return 0; } static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; bt_dev_dbg(hdev, "conn %p %s", conn, test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? 
"initiator" : "responder"); if (skb->len < sizeof(smp->pcnf)) return SMP_INVALID_PARAMS; memcpy(smp->pcnf, skb->data, sizeof(smp->pcnf)); skb_pull(skb, sizeof(smp->pcnf)); if (test_bit(SMP_FLAG_SC, &smp->flags)) { int ret; /* Public Key exchange must happen before any other steps */ if (test_bit(SMP_FLAG_REMOTE_PK, &smp->flags)) return sc_check_confirm(smp); bt_dev_err(hdev, "Unexpected SMP Pairing Confirm"); ret = fixup_sc_false_positive(smp); if (ret) return ret; } if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); return 0; } if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) return smp_confirm(smp); set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); return 0; } static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; u8 *pkax, *pkbx, *na, *nb, confirm_hint; u32 passkey = 0; int err; bt_dev_dbg(hcon->hdev, "conn %p", conn); if (skb->len < sizeof(smp->rrnd)) return SMP_INVALID_PARAMS; memcpy(smp->rrnd, skb->data, sizeof(smp->rrnd)); skb_pull(skb, sizeof(smp->rrnd)); if (!test_bit(SMP_FLAG_SC, &smp->flags)) return smp_random(smp); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { pkax = smp->local_pk; pkbx = smp->remote_pk; na = smp->prnd; nb = smp->rrnd; } else { pkax = smp->remote_pk; pkbx = smp->local_pk; na = smp->rrnd; nb = smp->prnd; } if (smp->method == REQ_OOB) { if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); goto mackey_and_ltk; } /* Passkey entry has special treatment */ if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) return sc_passkey_round(smp, SMP_CMD_PAIRING_RANDOM); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { u8 cfm[16]; err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk, 
smp->rrnd, 0, cfm); if (err) return SMP_UNSPECIFIED; if (crypto_memneq(smp->pcnf, cfm, 16)) return SMP_CONFIRM_FAILED; } else { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); } mackey_and_ltk: /* Generate MacKey and LTK */ err = sc_mackey_and_ltk(smp, smp->mackey, smp->tk); if (err) return SMP_UNSPECIFIED; if (smp->method == REQ_OOB) { if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); } return 0; } err = smp_g2(smp->tfm_cmac, pkax, pkbx, na, nb, &passkey); if (err) return SMP_UNSPECIFIED; /* Always require user confirmation for Just-Works pairing to prevent * impersonation attacks, or in case of a legitimate device that is * repairing use the confirmation as acknowledgment to proceed with the * creation of new keys. */ confirm_hint = smp->method == JUST_WORKS ? 1 : 0; err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, confirm_hint); if (err) return SMP_UNSPECIFIED; set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) { struct smp_ltk *key; struct hci_conn *hcon = conn->hcon; key = hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, hcon->role); if (!key) return false; if (smp_ltk_sec_level(key) < sec_level) return false; if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return true; hci_le_start_enc(hcon, key->ediv, key->rand, key->val, key->enc_size); hcon->enc_key_size = key->enc_size; /* We never store STKs for initiator role, so clear this flag */ clear_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); return true; } bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level, enum smp_key_pref key_pref) { if (sec_level == BT_SECURITY_LOW) return true; /* If we're encrypted with an STK but the caller prefers using * LTK claim insufficient security. 
This way we allow the * connection to be re-encrypted with an LTK, even if the LTK * provides the same level of security. Only exception is if we * don't have an LTK (e.g. because of key distribution bits). */ if (key_pref == SMP_USE_LTK && test_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags) && hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, hcon->role)) return false; if (hcon->sec_level >= sec_level) return true; return false; } static void smp_send_pairing_req(struct smp_chan *smp, __u8 auth) { struct smp_cmd_pairing cp; if (smp->conn->hcon->type == ACL_LINK) build_bredr_pairing_cmd(smp, &cp, NULL); else build_pairing_cmd(smp->conn, &cp, NULL, auth); smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], &cp, sizeof(cp)); smp_send_cmd(smp->conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); set_bit(SMP_FLAG_INITIATOR, &smp->flags); } static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_security_req *rp = (void *) skb->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_chan *smp; u8 sec_level, auth; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; if (hcon->role != HCI_ROLE_MASTER) return SMP_CMD_NOTSUPP; auth = rp->auth_req & AUTH_REQ_MASK(hdev); if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC)) return SMP_AUTH_REQUIREMENTS; if (hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) sec_level = BT_SECURITY_MEDIUM; else sec_level = authreq_to_seclevel(auth); if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK)) { /* If link is already encrypted with sufficient security we * still need refresh encryption as per Core Spec 5.0 Vol 3, * Part H 2.4.6 */ smp_ltk_encrypt(conn, hcon->sec_level); return 0; } if (sec_level > hcon->pending_sec_level) hcon->pending_sec_level = sec_level; if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; smp = smp_chan_create(conn); if (!smp) return SMP_UNSPECIFIED; 
if (!hci_dev_test_flag(hdev, HCI_BONDABLE) && (auth & SMP_AUTH_BONDING)) return SMP_PAIRING_NOTSUPP; skb_pull(skb, sizeof(*rp)); smp_send_pairing_req(smp, auth); return 0; } static void smp_send_security_req(struct smp_chan *smp, __u8 auth) { struct smp_cmd_security_req cp; cp.auth_req = auth; smp_send_cmd(smp->conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ); clear_bit(SMP_FLAG_INITIATOR, &smp->flags); } int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) { struct l2cap_conn *conn = hcon->l2cap_data; struct l2cap_chan *chan; struct smp_chan *smp; __u8 authreq; int ret; bt_dev_dbg(hcon->hdev, "conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level); /* This may be NULL if there's an unexpected disconnection */ if (!conn) return 1; if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) return 1; if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK)) return 1; if (sec_level > hcon->pending_sec_level) hcon->pending_sec_level = sec_level; if (hcon->role == HCI_ROLE_MASTER) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; chan = conn->smp; if (!chan) { bt_dev_err(hcon->hdev, "security requested but not available"); return 1; } l2cap_chan_lock(chan); /* If SMP is already in progress ignore this request */ if (chan->data) { ret = 0; goto unlock; } smp = smp_chan_create(conn); if (!smp) { ret = 1; goto unlock; } authreq = seclevel_to_authreq(sec_level); if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED)) { authreq |= SMP_AUTH_SC; if (hci_dev_test_flag(hcon->hdev, HCI_SSP_ENABLED)) authreq |= SMP_AUTH_CT2; } /* Don't attempt to set MITM if setting is overridden by debugfs * Needed to pass certification test SM/MAS/PKE/BV-01-C */ if (!hci_dev_test_flag(hcon->hdev, HCI_FORCE_NO_MITM)) { /* Require MITM if IO Capability allows or the security level * requires it. 
*/ if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || hcon->pending_sec_level > BT_SECURITY_MEDIUM) authreq |= SMP_AUTH_MITM; } if (hcon->role == HCI_ROLE_MASTER) smp_send_pairing_req(smp, authreq); else smp_send_security_req(smp, authreq); ret = 0; unlock: l2cap_chan_unlock(chan); return ret; } int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct hci_conn *hcon; struct l2cap_conn *conn; struct l2cap_chan *chan; struct smp_chan *smp; int err; err = hci_remove_ltk(hdev, bdaddr, addr_type); hci_remove_irk(hdev, bdaddr, addr_type); hcon = hci_conn_hash_lookup_le(hdev, bdaddr, addr_type); if (!hcon) goto done; conn = hcon->l2cap_data; if (!conn) goto done; chan = conn->smp; if (!chan) goto done; l2cap_chan_lock(chan); smp = chan->data; if (smp) { /* Set keys to NULL to make sure smp_failure() does not try to * remove and free already invalidated rcu list entries. */ smp->ltk = NULL; smp->responder_ltk = NULL; smp->remote_irk = NULL; if (test_bit(SMP_FLAG_COMPLETE, &smp->flags)) smp_failure(conn, 0); else smp_failure(conn, SMP_UNSPECIFIED); err = 0; } l2cap_chan_unlock(chan); done: return err; } static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_encrypt_info *rp = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; /* Pairing is aborted if any blocked keys are distributed */ if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_LTK, rp->ltk)) { bt_dev_warn_ratelimited(conn->hcon->hdev, "LTK blocked for %pMR", &conn->hcon->dst); return SMP_INVALID_PARAMS; } SMP_ALLOW_CMD(smp, SMP_CMD_INITIATOR_IDENT); skb_pull(skb, sizeof(*rp)); memcpy(smp->tk, rp->ltk, sizeof(smp->tk)); return 0; } static int smp_cmd_initiator_ident(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_initiator_ident *rp = (void *)skb->data; struct 
l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; struct hci_conn *hcon = conn->hcon; struct smp_ltk *ltk; u8 authenticated; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_ENC_KEY; if (smp->remote_key_dist & SMP_DIST_ID_KEY) SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO); else if (smp->remote_key_dist & SMP_DIST_SIGN) SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); skb_pull(skb, sizeof(*rp)); authenticated = (hcon->sec_level == BT_SECURITY_HIGH); ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, SMP_LTK, authenticated, smp->tk, smp->enc_key_size, rp->ediv, rp->rand); smp->ltk = ltk; if (!(smp->remote_key_dist & KEY_DIST_MASK)) smp_distribute_keys(smp); return 0; } static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_ident_info *info = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; bt_dev_dbg(conn->hcon->hdev, ""); if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; /* Pairing is aborted if any blocked keys are distributed */ if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_IRK, info->irk)) { bt_dev_warn_ratelimited(conn->hcon->hdev, "Identity key blocked for %pMR", &conn->hcon->dst); return SMP_INVALID_PARAMS; } SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO); skb_pull(skb, sizeof(*info)); memcpy(smp->irk, info->irk, 16); return 0; } static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_ident_addr_info *info = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; bdaddr_t rpa; bt_dev_dbg(hcon->hdev, ""); if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_ID_KEY; if (smp->remote_key_dist & SMP_DIST_SIGN) 
SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); skb_pull(skb, sizeof(*info)); /* Strictly speaking the Core Specification (4.1) allows sending * an empty address which would force us to rely on just the IRK * as "identity information". However, since such * implementations are not known of and in order to not over * complicate our implementation, simply pretend that we never * received an IRK for such a device. * * The Identity Address must also be a Static Random or Public * Address, which hci_is_identity_address() checks for. */ if (!bacmp(&info->bdaddr, BDADDR_ANY) || !hci_is_identity_address(&info->bdaddr, info->addr_type)) { bt_dev_err(hcon->hdev, "ignoring IRK with no identity address"); goto distribute; } /* Drop IRK if peer is using identity address during pairing but is * providing different address as identity information. * * Microsoft Surface Precision Mouse is known to have this bug. */ if (hci_is_identity_address(&hcon->dst, hcon->dst_type) && (bacmp(&info->bdaddr, &hcon->dst) || info->addr_type != hcon->dst_type)) { bt_dev_err(hcon->hdev, "ignoring IRK with invalid identity address"); goto distribute; } bacpy(&smp->id_addr, &info->bdaddr); smp->id_addr_type = info->addr_type; if (hci_bdaddr_is_rpa(&hcon->dst, hcon->dst_type)) bacpy(&rpa, &hcon->dst); else bacpy(&rpa, BDADDR_ANY); smp->remote_irk = hci_add_irk(conn->hcon->hdev, &smp->id_addr, smp->id_addr_type, smp->irk, &rpa); distribute: if (!(smp->remote_key_dist & KEY_DIST_MASK)) smp_distribute_keys(smp); return 0; } static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_sign_info *rp = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct smp_csrk *csrk; bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_SIGN; skb_pull(skb, sizeof(*rp)); csrk = kzalloc_obj(*csrk); if (csrk) { if (conn->hcon->sec_level > 
BT_SECURITY_MEDIUM) csrk->type = MGMT_CSRK_REMOTE_AUTHENTICATED; else csrk->type = MGMT_CSRK_REMOTE_UNAUTHENTICATED; memcpy(csrk->val, rp->csrk, sizeof(csrk->val)); } smp->csrk = csrk; smp_distribute_keys(smp); return 0; } static u8 sc_select_method(struct smp_chan *smp) { struct smp_cmd_pairing *local, *remote; u8 local_mitm, remote_mitm, local_io, remote_io, method; if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags) || test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) return REQ_OOB; /* The preq/prsp contain the raw Pairing Request/Response PDUs * which are needed as inputs to some crypto functions. To get * the "struct smp_cmd_pairing" from them we need to skip the * first byte which contains the opcode. */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local = (void *) &smp->preq[1]; remote = (void *) &smp->prsp[1]; } else { local = (void *) &smp->prsp[1]; remote = (void *) &smp->preq[1]; } local_io = local->io_capability; remote_io = remote->io_capability; local_mitm = (local->auth_req & SMP_AUTH_MITM); remote_mitm = (remote->auth_req & SMP_AUTH_MITM); /* If either side wants MITM, look up the method from the table, * otherwise use JUST WORKS. */ if (local_mitm || remote_mitm) method = get_auth_method(smp, local_io, remote_io); else method = JUST_WORKS; /* Don't confirm locally initiated pairing attempts */ if (method == JUST_CFM && test_bit(SMP_FLAG_INITIATOR, &smp->flags)) method = JUST_WORKS; return method; } static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_public_key *key = (void *) skb->data; struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_dev *hdev = hcon->hdev; struct crypto_kpp *tfm_ecdh; struct smp_cmd_pairing_confirm cfm; int err; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*key)) return SMP_INVALID_PARAMS; /* Check if remote and local public keys are the same and debug key is * not in use. 
*/ if (!test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags) && !crypto_memneq(key, smp->local_pk, 64)) { bt_dev_err(hdev, "Remote and local public keys are identical"); return SMP_UNSPECIFIED; } memcpy(smp->remote_pk, key, 64); if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags)) { err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->remote_pk, smp->rr, 0, cfm.confirm_val); if (err) return SMP_UNSPECIFIED; if (crypto_memneq(cfm.confirm_val, smp->pcnf, 16)) return SMP_CONFIRM_FAILED; } /* Non-initiating device sends its public key after receiving * the key from the initiating device. */ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { err = sc_send_public_key(smp); if (err) return err; } SMP_DBG("Remote Public Key X: %32phN", smp->remote_pk); SMP_DBG("Remote Public Key Y: %32phN", smp->remote_pk + 32); /* Compute the shared secret on the same crypto tfm on which the private * key was set/generated. */ if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { struct l2cap_chan *hchan = hdev->smp_data; struct smp_dev *smp_dev; if (!hchan || !hchan->data) return SMP_UNSPECIFIED; smp_dev = hchan->data; tfm_ecdh = smp_dev->tfm_ecdh; } else { tfm_ecdh = smp->tfm_ecdh; } if (compute_ecdh_secret(tfm_ecdh, smp->remote_pk, smp->dhkey)) return SMP_UNSPECIFIED; SMP_DBG("DHKey %32phN", smp->dhkey); set_bit(SMP_FLAG_REMOTE_PK, &smp->flags); smp->method = sc_select_method(smp); bt_dev_dbg(hdev, "selected method 0x%02x", smp->method); /* JUST_WORKS and JUST_CFM result in an unauthenticated key */ if (smp->method == JUST_WORKS || smp->method == JUST_CFM) hcon->pending_sec_level = BT_SECURITY_MEDIUM; else hcon->pending_sec_level = BT_SECURITY_FIPS; if (!crypto_memneq(debug_pk, smp->remote_pk, 64)) set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); if (smp->method == DSP_PASSKEY) { get_random_bytes(&hcon->passkey_notify, sizeof(hcon->passkey_notify)); hcon->passkey_notify %= 1000000; hcon->passkey_entered = 0; smp->passkey_round = 0; if (mgmt_user_passkey_notify(hdev, &hcon->dst, hcon->type, hcon->dst_type, 
hcon->passkey_notify, hcon->passkey_entered)) return SMP_UNSPECIFIED; SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); return sc_passkey_round(smp, SMP_CMD_PUBLIC_KEY); } if (smp->method == REQ_OOB) { if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); return 0; } if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); if (smp->method == REQ_PASSKEY) { if (mgmt_user_passkey_request(hdev, &hcon->dst, hcon->type, hcon->dst_type)) return SMP_UNSPECIFIED; SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } /* The Initiating device waits for the non-initiating device to * send the confirm value. */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return 0; err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd, 0, cfm.confirm_val); if (err) return SMP_UNSPECIFIED; smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cfm), &cfm); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); return 0; } static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_dhkey_check *check = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct hci_conn *hcon = conn->hcon; struct smp_chan *smp = chan->data; u8 a[7], b[7], *local_addr, *remote_addr; u8 io_cap[3], r[16], e[16]; int err; bt_dev_dbg(hcon->hdev, "conn %p", conn); if (skb->len < sizeof(*check)) return SMP_INVALID_PARAMS; memcpy(a, &hcon->init_addr, 6); memcpy(b, &hcon->resp_addr, 6); a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local_addr = a; remote_addr = b; memcpy(io_cap, &smp->prsp[1], 3); } else { local_addr = b; remote_addr = a; memcpy(io_cap, &smp->preq[1], 3); } memset(r, 0, sizeof(r)); if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) put_unaligned_le32(hcon->passkey_notify, r); else if (smp->method == REQ_OOB) 
memcpy(r, smp->lr, 16); err = smp_f6(smp->tfm_cmac, smp->mackey, smp->rrnd, smp->prnd, r, io_cap, remote_addr, local_addr, e); if (err) return SMP_UNSPECIFIED; if (crypto_memneq(check->e, e, 16)) return SMP_DHKEY_CHECK_FAILED; if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) { set_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags); return 0; } /* Responder sends DHKey check as response to initiator */ sc_dhkey_check(smp); } sc_add_ltk(smp); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { hci_le_start_enc(hcon, 0, 0, smp->tk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; } return 0; } static int smp_cmd_keypress_notify(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_keypress_notify *kp = (void *) skb->data; bt_dev_dbg(conn->hcon->hdev, "value 0x%02x", kp->value); return 0; } static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) { struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; struct smp_chan *smp; __u8 code, reason; int err = 0; if (skb->len < 1) return -EILSEQ; if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) { reason = SMP_PAIRING_NOTSUPP; goto done; } code = skb->data[0]; skb_pull(skb, sizeof(code)); smp = chan->data; if (code > SMP_CMD_MAX) goto drop; if (smp && !test_and_clear_bit(code, &smp->allow_cmd)) { /* If there is a context and the command is not allowed consider * it a failure so the session is cleanup properly. */ switch (code) { case SMP_CMD_IDENT_INFO: case SMP_CMD_IDENT_ADDR_INFO: case SMP_CMD_SIGN_INFO: /* 3.6.1. Key distribution and generation * * A device may reject a distributed key by sending the * Pairing Failed command with the reason set to * "Key Rejected". */ smp_failure(conn, SMP_KEY_REJECTED); break; } goto drop; } /* If we don't have a context the only allowed commands are * pairing request and security request. 
*/ if (!smp && code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ) goto drop; switch (code) { case SMP_CMD_PAIRING_REQ: reason = smp_cmd_pairing_req(conn, skb); break; case SMP_CMD_PAIRING_FAIL: smp_failure(conn, 0); err = -EPERM; break; case SMP_CMD_PAIRING_RSP: reason = smp_cmd_pairing_rsp(conn, skb); break; case SMP_CMD_SECURITY_REQ: reason = smp_cmd_security_req(conn, skb); break; case SMP_CMD_PAIRING_CONFIRM: reason = smp_cmd_pairing_confirm(conn, skb); break; case SMP_CMD_PAIRING_RANDOM: reason = smp_cmd_pairing_random(conn, skb); break; case SMP_CMD_ENCRYPT_INFO: reason = smp_cmd_encrypt_info(conn, skb); break; case SMP_CMD_INITIATOR_IDENT: reason = smp_cmd_initiator_ident(conn, skb); break; case SMP_CMD_IDENT_INFO: reason = smp_cmd_ident_info(conn, skb); break; case SMP_CMD_IDENT_ADDR_INFO: reason = smp_cmd_ident_addr_info(conn, skb); break; case SMP_CMD_SIGN_INFO: reason = smp_cmd_sign_info(conn, skb); break; case SMP_CMD_PUBLIC_KEY: reason = smp_cmd_public_key(conn, skb); break; case SMP_CMD_DHKEY_CHECK: reason = smp_cmd_dhkey_check(conn, skb); break; case SMP_CMD_KEYPRESS_NOTIFY: reason = smp_cmd_keypress_notify(conn, skb); break; default: bt_dev_dbg(hcon->hdev, "Unknown command code 0x%2.2x", code); reason = SMP_CMD_NOTSUPP; goto done; } done: if (!err) { if (reason) smp_failure(conn, reason); kfree_skb(skb); } return err; drop: bt_dev_err(hcon->hdev, "unexpected SMP command 0x%02x from %pMR", code, &hcon->dst); kfree_skb(skb); return 0; } static void smp_teardown_cb(struct l2cap_chan *chan, int err) { struct l2cap_conn *conn = chan->conn; bt_dev_dbg(conn->hcon->hdev, "chan %p", chan); if (chan->data) smp_chan_destroy(conn); conn->smp = NULL; l2cap_chan_put(chan); } static void bredr_pairing(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_chan *smp; bt_dev_dbg(hdev, "chan %p", chan); /* Only new pairings are interesting */ if 
(!test_bit(HCI_CONN_NEW_LINK_KEY, &hcon->flags)) return; /* Don't bother if we're not encrypted */ if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) return; /* Only initiator may initiate SMP over BR/EDR */ if (hcon->role != HCI_ROLE_MASTER) return; /* Secure Connections support must be enabled */ if (!hci_dev_test_flag(hdev, HCI_SC_ENABLED)) return; /* BR/EDR must use Secure Connections for SMP */ if (!test_bit(HCI_CONN_AES_CCM, &hcon->flags) && !hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return; /* If our LE support is not enabled don't do anything */ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return; /* Don't bother if remote LE support is not enabled */ if (!lmp_host_le_capable(hcon)) return; /* Remote must support SMP fixed chan for BR/EDR */ if (!(conn->remote_fixed_chan & L2CAP_FC_SMP_BREDR)) return; /* Don't bother if SMP is already ongoing */ if (chan->data) return; smp = smp_chan_create(conn); if (!smp) { bt_dev_err(hdev, "unable to create SMP context for BR/EDR"); return; } set_bit(SMP_FLAG_SC, &smp->flags); bt_dev_dbg(hdev, "starting SMP over BR/EDR"); smp_send_pairing_req(smp, 0x00); } static void smp_resume_cb(struct l2cap_chan *chan) { struct smp_chan *smp = chan->data; struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; bt_dev_dbg(hcon->hdev, "chan %p", chan); if (hcon->type == ACL_LINK) { bredr_pairing(chan); return; } if (!smp) return; if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) return; cancel_delayed_work(&smp->security_timer); smp_distribute_keys(smp); } static void smp_ready_cb(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; bt_dev_dbg(hcon->hdev, "chan %p", chan); /* No need to call l2cap_chan_hold() here since we already own * the reference taken in smp_new_conn_cb(). This is just the * first time that we tie it to a specific pointer. The code in * l2cap_core.c ensures that there's no risk this function won't * get called if smp_new_conn_cb was previously called. 
*/ conn->smp = chan; if (hcon->type == ACL_LINK && test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) bredr_pairing(chan); } static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { int err; bt_dev_dbg(chan->conn->hcon->hdev, "chan %p", chan); err = smp_sig_channel(chan, skb); if (err) { struct smp_chan *smp = chan->data; if (smp) cancel_delayed_work_sync(&smp->security_timer); hci_disconnect(chan->conn->hcon, HCI_ERROR_AUTH_FAILURE); } return err; } static struct sk_buff *smp_alloc_skb_cb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { struct sk_buff *skb; skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); skb->priority = HCI_PRIO_MAX; bt_cb(skb)->l2cap.chan = chan; return skb; } static const struct l2cap_ops smp_chan_ops = { .name = "Security Manager", .ready = smp_ready_cb, .recv = smp_recv_cb, .alloc_skb = smp_alloc_skb_cb, .teardown = smp_teardown_cb, .resume = smp_resume_cb, .new_connection = l2cap_chan_no_new_connection, .state_change = l2cap_chan_no_state_change, .close = l2cap_chan_no_close, .defer = l2cap_chan_no_defer, .suspend = l2cap_chan_no_suspend, .set_shutdown = l2cap_chan_no_set_shutdown, .get_sndtimeo = l2cap_chan_no_get_sndtimeo, }; static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan) { struct l2cap_chan *chan; BT_DBG("pchan %p", pchan); chan = l2cap_chan_create(); if (!chan) return NULL; chan->chan_type = pchan->chan_type; chan->ops = &smp_chan_ops; chan->scid = pchan->scid; chan->dcid = chan->scid; chan->imtu = pchan->imtu; chan->omtu = pchan->omtu; chan->mode = pchan->mode; /* Other L2CAP channels may request SMP routines in order to * change the security level. This means that the SMP channel * lock must be considered in its own category to avoid lockdep * warnings. 
*/ atomic_set(&chan->nesting, L2CAP_NESTING_SMP); BT_DBG("created chan %p", chan); return chan; } static const struct l2cap_ops smp_root_chan_ops = { .name = "Security Manager Root", .new_connection = smp_new_conn_cb, /* None of these are implemented for the root channel */ .close = l2cap_chan_no_close, .alloc_skb = l2cap_chan_no_alloc_skb, .recv = l2cap_chan_no_recv, .state_change = l2cap_chan_no_state_change, .teardown = l2cap_chan_no_teardown, .ready = l2cap_chan_no_ready, .defer = l2cap_chan_no_defer, .suspend = l2cap_chan_no_suspend, .resume = l2cap_chan_no_resume, .set_shutdown = l2cap_chan_no_set_shutdown, .get_sndtimeo = l2cap_chan_no_get_sndtimeo, }; static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) { struct l2cap_chan *chan; struct smp_dev *smp; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; if (cid == L2CAP_CID_SMP_BREDR) { smp = NULL; goto create_chan; } smp = kzalloc_obj(*smp); if (!smp) return ERR_PTR(-ENOMEM); tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { bt_dev_err(hdev, "Unable to create CMAC crypto context"); kfree_sensitive(smp); return ERR_CAST(tfm_cmac); } tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0); if (IS_ERR(tfm_ecdh)) { bt_dev_err(hdev, "Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); kfree_sensitive(smp); return ERR_CAST(tfm_ecdh); } smp->local_oob = false; smp->tfm_cmac = tfm_cmac; smp->tfm_ecdh = tfm_ecdh; create_chan: chan = l2cap_chan_create(); if (!chan) { if (smp) { crypto_free_shash(smp->tfm_cmac); crypto_free_kpp(smp->tfm_ecdh); kfree_sensitive(smp); } return ERR_PTR(-ENOMEM); } chan->data = smp; l2cap_add_scid(chan, cid); l2cap_chan_set_defaults(chan); if (cid == L2CAP_CID_SMP) { u8 bdaddr_type; hci_copy_identity_address(hdev, &chan->src, &bdaddr_type); if (bdaddr_type == ADDR_LE_DEV_PUBLIC) chan->src_type = BDADDR_LE_PUBLIC; else chan->src_type = BDADDR_LE_RANDOM; } else { bacpy(&chan->src, &hdev->bdaddr); chan->src_type = BDADDR_BREDR; } 
chan->state = BT_LISTEN; chan->mode = L2CAP_MODE_BASIC; chan->imtu = L2CAP_DEFAULT_MTU; chan->ops = &smp_root_chan_ops; /* Set correct nesting level for a parent/listening channel */ atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); return chan; } static void smp_del_chan(struct l2cap_chan *chan) { struct smp_dev *smp; BT_DBG("chan %p", chan); smp = chan->data; if (smp) { chan->data = NULL; crypto_free_shash(smp->tfm_cmac); crypto_free_kpp(smp->tfm_ecdh); kfree_sensitive(smp); } l2cap_chan_put(chan); } int smp_force_bredr(struct hci_dev *hdev, bool enable) { if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return -EALREADY; if (enable) { struct l2cap_chan *chan; chan = smp_add_cid(hdev, L2CAP_CID_SMP_BREDR); if (IS_ERR(chan)) return PTR_ERR(chan); hdev->smp_bredr_data = chan; } else { struct l2cap_chan *chan; chan = hdev->smp_bredr_data; hdev->smp_bredr_data = NULL; smp_del_chan(chan); } hci_dev_change_flag(hdev, HCI_FORCE_BREDR_SMP); return 0; } int smp_register(struct hci_dev *hdev) { struct l2cap_chan *chan; bt_dev_dbg(hdev, ""); /* If the controller does not support Low Energy operation, then * there is also no need to register any SMP channel. 
*/ if (!lmp_le_capable(hdev)) return 0; if (WARN_ON(hdev->smp_data)) { chan = hdev->smp_data; hdev->smp_data = NULL; smp_del_chan(chan); } chan = smp_add_cid(hdev, L2CAP_CID_SMP); if (IS_ERR(chan)) return PTR_ERR(chan); hdev->smp_data = chan; if (!lmp_sc_capable(hdev)) { /* Flag can be already set here (due to power toggle) */ if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return 0; } if (WARN_ON(hdev->smp_bredr_data)) { chan = hdev->smp_bredr_data; hdev->smp_bredr_data = NULL; smp_del_chan(chan); } chan = smp_add_cid(hdev, L2CAP_CID_SMP_BREDR); if (IS_ERR(chan)) { int err = PTR_ERR(chan); chan = hdev->smp_data; hdev->smp_data = NULL; smp_del_chan(chan); return err; } hdev->smp_bredr_data = chan; return 0; } void smp_unregister(struct hci_dev *hdev) { struct l2cap_chan *chan; if (hdev->smp_bredr_data) { chan = hdev->smp_bredr_data; hdev->smp_bredr_data = NULL; smp_del_chan(chan); } if (hdev->smp_data) { chan = hdev->smp_data; hdev->smp_data = NULL; smp_del_chan(chan); } } #if IS_ENABLED(CONFIG_BT_SELFTEST_SMP) static int __init test_debug_key(struct crypto_kpp *tfm_ecdh) { u8 pk[64]; int err; err = set_ecdh_privkey(tfm_ecdh, debug_sk); if (err) return err; err = generate_ecdh_public_key(tfm_ecdh, pk); if (err) return err; if (crypto_memneq(pk, debug_pk, 64)) return -EINVAL; return 0; } static int __init test_ah(void) { const u8 irk[16] = { 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec }; const u8 r[3] = { 0x94, 0x81, 0x70 }; const u8 exp[3] = { 0xaa, 0xfb, 0x0d }; u8 res[3]; int err; err = smp_ah(irk, r, res); if (err) return err; if (crypto_memneq(res, exp, 3)) return -EINVAL; return 0; } static int __init test_c1(void) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; const u8 r[16] = { 0xe0, 0x2e, 0x70, 0xc6, 0x4e, 0x27, 0x88, 0x63, 0x0e, 0x6f, 0xad, 0x56, 0x21, 0xd5, 0x83, 0x57 }; const u8 preq[7] = { 0x01, 0x01, 0x00, 0x00, 0x10, 0x07, 0x07 
}; const u8 pres[7] = { 0x02, 0x03, 0x00, 0x00, 0x08, 0x00, 0x05 }; const u8 _iat = 0x01; const u8 _rat = 0x00; const bdaddr_t ra = { { 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1 } }; const bdaddr_t ia = { { 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1 } }; const u8 exp[16] = { 0x86, 0x3b, 0xf1, 0xbe, 0xc5, 0x4d, 0xa7, 0xd2, 0xea, 0x88, 0x89, 0x87, 0xef, 0x3f, 0x1e, 0x1e }; u8 res[16]; int err; err = smp_c1(k, r, preq, pres, _iat, &ia, _rat, &ra, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static int __init test_s1(void) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; const u8 r1[16] = { 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11 }; const u8 r2[16] = { 0x00, 0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99 }; const u8 exp[16] = { 0x62, 0xa0, 0x6d, 0x79, 0xae, 0x16, 0x42, 0x5b, 0x9b, 0xf4, 0xb0, 0xe8, 0xf0, 0xe1, 0x1f, 0x9a }; u8 res[16]; int err; err = smp_s1(k, r1, r2, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static int __init test_f4(struct crypto_shash *tfm_cmac) { const u8 u[32] = { 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc, 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef, 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e, 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20 }; const u8 v[32] = { 0xfd, 0xc5, 0x7f, 0xf4, 0x49, 0xdd, 0x4f, 0x6b, 0xfb, 0x7c, 0x9d, 0xf1, 0xc2, 0x9a, 0xcb, 0x59, 0x2a, 0xe7, 0xd4, 0xee, 0xfb, 0xfc, 0x0a, 0x90, 0x9a, 0xbb, 0xf6, 0x32, 0x3d, 0x8b, 0x18, 0x55 }; const u8 x[16] = { 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff, 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 }; const u8 z = 0x00; const u8 exp[16] = { 0x2d, 0x87, 0x74, 0xa9, 0xbe, 0xa1, 0xed, 0xf1, 0x1c, 0xbd, 0xa9, 0x07, 0xf1, 0x16, 0xc9, 0xf2 }; u8 res[16]; int err; err = smp_f4(tfm_cmac, u, v, x, z, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static int __init test_f5(struct crypto_shash 
*tfm_cmac) { const u8 w[32] = { 0x98, 0xa6, 0xbf, 0x73, 0xf3, 0x34, 0x8d, 0x86, 0xf1, 0x66, 0xf8, 0xb4, 0x13, 0x6b, 0x79, 0x99, 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec }; const u8 n1[16] = { 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff, 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 }; const u8 n2[16] = { 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21, 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 }; const u8 a1[7] = { 0xce, 0xbf, 0x37, 0x37, 0x12, 0x56, 0x00 }; const u8 a2[7] = { 0xc1, 0xcf, 0x2d, 0x70, 0x13, 0xa7, 0x00 }; const u8 exp_ltk[16] = { 0x38, 0x0a, 0x75, 0x94, 0xb5, 0x22, 0x05, 0x98, 0x23, 0xcd, 0xd7, 0x69, 0x11, 0x79, 0x86, 0x69 }; const u8 exp_mackey[16] = { 0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd, 0x02, 0x4a, 0x08, 0xa1, 0x76, 0xf1, 0x65, 0x29 }; u8 mackey[16], ltk[16]; int err; err = smp_f5(tfm_cmac, w, n1, n2, a1, a2, mackey, ltk); if (err) return err; if (crypto_memneq(mackey, exp_mackey, 16)) return -EINVAL; if (crypto_memneq(ltk, exp_ltk, 16)) return -EINVAL; return 0; } static int __init test_f6(struct crypto_shash *tfm_cmac) { const u8 w[16] = { 0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd, 0x02, 0x4a, 0x08, 0xa1, 0x76, 0xf1, 0x65, 0x29 }; const u8 n1[16] = { 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff, 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 }; const u8 n2[16] = { 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21, 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 }; const u8 r[16] = { 0xc8, 0x0f, 0x2d, 0x0c, 0xd2, 0x42, 0xda, 0x08, 0x54, 0xbb, 0x53, 0xb4, 0x3b, 0x34, 0xa3, 0x12 }; const u8 io_cap[3] = { 0x02, 0x01, 0x01 }; const u8 a1[7] = { 0xce, 0xbf, 0x37, 0x37, 0x12, 0x56, 0x00 }; const u8 a2[7] = { 0xc1, 0xcf, 0x2d, 0x70, 0x13, 0xa7, 0x00 }; const u8 exp[16] = { 0x61, 0x8f, 0x95, 0xda, 0x09, 0x0b, 0x6c, 0xd2, 0xc5, 0xe8, 0xd0, 0x9c, 0x98, 0x73, 0xc4, 0xe3 }; u8 res[16]; int err; err = smp_f6(tfm_cmac, w, n1, n2, r, io_cap, a1, a2, res); if (err) return err; if 
(crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static int __init test_g2(struct crypto_shash *tfm_cmac) { const u8 u[32] = { 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc, 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef, 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e, 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20 }; const u8 v[32] = { 0xfd, 0xc5, 0x7f, 0xf4, 0x49, 0xdd, 0x4f, 0x6b, 0xfb, 0x7c, 0x9d, 0xf1, 0xc2, 0x9a, 0xcb, 0x59, 0x2a, 0xe7, 0xd4, 0xee, 0xfb, 0xfc, 0x0a, 0x90, 0x9a, 0xbb, 0xf6, 0x32, 0x3d, 0x8b, 0x18, 0x55 }; const u8 x[16] = { 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff, 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 }; const u8 y[16] = { 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21, 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 }; const u32 exp_val = 0x2f9ed5ba % 1000000; u32 val; int err; err = smp_g2(tfm_cmac, u, v, x, y, &val); if (err) return err; if (val != exp_val) return -EINVAL; return 0; } static int __init test_h6(struct crypto_shash *tfm_cmac) { const u8 w[16] = { 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec }; const u8 key_id[4] = { 0x72, 0x62, 0x65, 0x6c }; const u8 exp[16] = { 0x99, 0x63, 0xb1, 0x80, 0xe2, 0xa9, 0xd3, 0xe8, 0x1c, 0xc9, 0x6d, 0xe7, 0x02, 0xe1, 0x9a, 0x2d }; u8 res[16]; int err; err = smp_h6(tfm_cmac, w, key_id, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static char test_smp_buffer[32]; static ssize_t test_smp_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { return simple_read_from_buffer(user_buf, count, ppos, test_smp_buffer, strlen(test_smp_buffer)); } static const struct file_operations test_smp_fops = { .open = simple_open, .read = test_smp_read, .llseek = default_llseek, }; static int __init run_selftests(struct crypto_shash *tfm_cmac, struct crypto_kpp *tfm_ecdh) { ktime_t calltime, delta, rettime; unsigned long long duration; int err; calltime = ktime_get(); err = 
test_debug_key(tfm_ecdh); if (err) { BT_ERR("debug_key test failed"); goto done; } err = test_ah(); if (err) { BT_ERR("smp_ah test failed"); goto done; } err = test_c1(); if (err) { BT_ERR("smp_c1 test failed"); goto done; } err = test_s1(); if (err) { BT_ERR("smp_s1 test failed"); goto done; } err = test_f4(tfm_cmac); if (err) { BT_ERR("smp_f4 test failed"); goto done; } err = test_f5(tfm_cmac); if (err) { BT_ERR("smp_f5 test failed"); goto done; } err = test_f6(tfm_cmac); if (err) { BT_ERR("smp_f6 test failed"); goto done; } err = test_g2(tfm_cmac); if (err) { BT_ERR("smp_g2 test failed"); goto done; } err = test_h6(tfm_cmac); if (err) { BT_ERR("smp_h6 test failed"); goto done; } rettime = ktime_get(); delta = ktime_sub(rettime, calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; BT_INFO("SMP test passed in %llu usecs", duration); done: if (!err) snprintf(test_smp_buffer, sizeof(test_smp_buffer), "PASS (%llu usecs)\n", duration); else snprintf(test_smp_buffer, sizeof(test_smp_buffer), "FAIL\n"); debugfs_create_file("selftest_smp", 0444, bt_debugfs, NULL, &test_smp_fops); return err; } int __init bt_selftest_smp(void) { struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; int err; tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); return PTR_ERR(tfm_cmac); } tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0); if (IS_ERR(tfm_ecdh)) { BT_ERR("Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); return PTR_ERR(tfm_ecdh); } err = run_selftests(tfm_cmac, tfm_ecdh); crypto_free_shash(tfm_cmac); crypto_free_kpp(tfm_ecdh); return err; } #endif
7 7 7 7 7 7 7 5 1 7 7 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 
521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 // SPDX-License-Identifier: GPL-2.0-or-later /* * Resilient Queued Spin Lock * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2014,2018 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 
* * Authors: Waiman Long <longman@redhat.com> * Peter Zijlstra <peterz@infradead.org> * Kumar Kartikeya Dwivedi <memxor@gmail.com> */ #include <linux/smp.h> #include <linux/bug.h> #include <linux/bpf.h> #include <linux/err.h> #include <linux/cpumask.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/mutex.h> #include <linux/prefetch.h> #include <asm/byteorder.h> #ifdef CONFIG_QUEUED_SPINLOCKS #include <asm/qspinlock.h> #endif #include <trace/events/lock.h> #include <asm/rqspinlock.h> #include <linux/timekeeping.h> /* * Include queued spinlock definitions and statistics code */ #ifdef CONFIG_QUEUED_SPINLOCKS #include "../locking/qspinlock.h" #include "../locking/lock_events.h" #include "rqspinlock.h" #include "../locking/mcs_spinlock.h" #endif /* * The basic principle of a queue-based spinlock can best be understood * by studying a classic queue-based spinlock implementation called the * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and * Scott") is available at * * https://bugzilla.kernel.org/show_bug.cgi?id=206115 * * This queued spinlock implementation is based on the MCS lock, however to * make it fit the 4 bytes we assume spinlock_t to be, and preserve its * existing API, we must modify it somehow. * * In particular; where the traditional MCS lock consists of a tail pointer * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to * unlock the next pending (next->locked), we compress both these: {tail, * next->locked} into a single u32 value. * * Since a spinlock disables recursion of its own context and there is a limit * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now * we can encode the tail by combining the 2-bit nesting level with the cpu * number. 
With one byte for the lock value and 3 bytes for the tail, only a * 32-bit word is now needed. Even though we only need 1 bit for the lock, * we extend it to a full byte to achieve better performance for architectures * that support atomic byte write. * * We also change the first spinner to spin on the lock bit instead of its * node; whereby avoiding the need to carry a node from lock to unlock, and * preserving existing lock API. This also makes the unlock code simpler and * faster. * * N.B. The current implementation only supports architectures that allow * atomic operations on smaller 8-bit and 16-bit data types. * */ struct rqspinlock_timeout { u64 timeout_end; u64 duration; u64 cur; u16 spin; }; #define RES_TIMEOUT_VAL 2 DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); EXPORT_SYMBOL_GPL(rqspinlock_held_locks); static bool is_lock_released(rqspinlock_t *lock, u32 mask) { if (!(atomic_read_acquire(&lock->val) & (mask))) return true; return false; } static noinline int check_deadlock_AA(rqspinlock_t *lock) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); int cnt = min(RES_NR_HELD, rqh->cnt); /* * Return an error if we hold the lock we are attempting to acquire. * We'll iterate over max 32 locks; no need to do is_lock_released. */ for (int i = 0; i < cnt - 1; i++) { if (rqh->locks[i] == lock) return -EDEADLK; } return 0; } /* * This focuses on the most common case of ABBA deadlocks (or ABBA involving * more locks, which reduce to ABBA). This is not exhaustive, and we rely on * timeouts as the final line of defense. */ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); int rqh_cnt = min(RES_NR_HELD, rqh->cnt); void *remote_lock; int cpu; /* * Find the CPU holding the lock that we want to acquire. If there is a * deadlock scenario, we will read a stable set on the remote CPU and * find the target. 
This would be a constant time operation instead of * O(NR_CPUS) if we could determine the owning CPU from a lock value, but * that requires increasing the size of the lock word. */ for_each_possible_cpu(cpu) { struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu); int real_cnt = READ_ONCE(rqh_cpu->cnt); int cnt = min(RES_NR_HELD, real_cnt); /* * Let's ensure to break out of this loop if the lock is available for * us to potentially acquire. */ if (is_lock_released(lock, mask)) return 0; /* * Skip ourselves, and CPUs whose count is less than 2, as they need at * least one held lock and one acquisition attempt (reflected as top * most entry) to participate in an ABBA deadlock. * * If cnt is more than RES_NR_HELD, it means the current lock being * acquired won't appear in the table, and other locks in the table are * already held, so we can't determine ABBA. */ if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD) continue; /* * Obtain the entry at the top, this corresponds to the lock the * remote CPU is attempting to acquire in a deadlock situation, * and would be one of the locks we hold on the current CPU. */ remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]); /* * If it is NULL, we've raced and cannot determine a deadlock * conclusively, skip this CPU. */ if (!remote_lock) continue; /* * Find if the lock we're attempting to acquire is held by this CPU. * Don't consider the topmost entry, as that must be the latest lock * being held or acquired. For a deadlock, the target CPU must also * attempt to acquire a lock we hold, so for this search only 'cnt - 1' * entries are important. */ for (int i = 0; i < cnt - 1; i++) { if (READ_ONCE(rqh_cpu->locks[i]) != lock) continue; /* * We found our lock as held on the remote CPU. Is the * acquisition attempt on the remote CPU for a lock held * by us? If so, we have a deadlock situation, and need * to recover. 
*/ for (int i = 0; i < rqh_cnt - 1; i++) { if (rqh->locks[i] == remote_lock) return -EDEADLK; } /* * Inconclusive; retry again later. */ return 0; } } return 0; } static noinline int check_timeout(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) { u64 prev = ts->cur; u64 time; if (!ts->timeout_end) { if (check_deadlock_AA(lock)) return -EDEADLK; ts->cur = ktime_get_mono_fast_ns(); ts->timeout_end = ts->cur + ts->duration; return 0; } time = ktime_get_mono_fast_ns(); if (time > ts->timeout_end) return -ETIMEDOUT; /* * A millisecond interval passed from last time? Trigger deadlock * checks. */ if (prev + NSEC_PER_MSEC < time) { ts->cur = time; return check_deadlock_ABBA(lock, mask); } return 0; } /* * Do not amortize with spins when res_smp_cond_load_acquire is defined, * as the macro does internal amortization for us. */ #ifndef res_smp_cond_load_acquire #define RES_CHECK_TIMEOUT(ts, ret, mask) \ ({ \ if (!(ts).spin++) \ (ret) = check_timeout((lock), (mask), &(ts)); \ (ret); \ }) #else #define RES_CHECK_TIMEOUT(ts, ret, mask) \ ({ (ret) = check_timeout((lock), (mask), &(ts)); }) #endif /* * Initialize the 'spin' member. * Set spin member to 0 to trigger AA/ABBA checks immediately. */ #define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; }) /* * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary. * Duration is defined for each spin attempt, so set it here. */ #define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; }) /* * Provide a test-and-set fallback for cases when queued spin lock support is * absent from the architecture. */ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) { struct rqspinlock_timeout ts; int val, ret = 0; RES_INIT_TIMEOUT(ts); /* * We are either called directly from res_spin_lock after grabbing the * deadlock detection entry when queued spinlocks are disabled, or from * resilient_queued_spin_lock_slowpath after grabbing the deadlock * detection entry. 
No need to obtain it here. */ /* * Since the waiting loop's time is dependent on the amount of * contention, a short timeout unlike rqspinlock waiting loops * isn't enough. Choose a second as the timeout value. */ RES_RESET_TIMEOUT(ts, NSEC_PER_SEC); retry: val = atomic_read(&lock->val); if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) { if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) goto out; cpu_relax(); goto retry; } return 0; out: release_held_lock_entry(); return ret; } EXPORT_SYMBOL_GPL(resilient_tas_spin_lock); #ifdef CONFIG_QUEUED_SPINLOCKS /* * Per-CPU queue node structures; we can never have more than 4 nested * contexts: task, softirq, hardirq, nmi. * * Exactly fits one 64-byte cacheline on a 64-bit architecture. */ static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]); #ifndef res_smp_cond_load_acquire #define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c) #endif #define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c)) /** * resilient_queued_spin_lock_slowpath - acquire the queued spinlock * @lock: Pointer to queued spinlock structure * @val: Current value of the queued spinlock 32-bit word * * Return: * * 0 - Lock was acquired successfully. * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. * * -ETIMEDOUT - Lock acquisition failed because of timeout. * * (queue tail, pending bit, lock value) * * fast : slow : unlock * : : * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0) * : | ^--------.------. 
/ : * : v \ \ | : * pending : (0,1,1) +--> (0,1,0) \ | : * : | ^--' | | : * : v | | : * uncontended : (n,x,y) +--> (n,0,0) --' | : * queue : | ^--' | : * : v | : * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : * queue : ^--' : */ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) { struct mcs_spinlock *prev, *next, *node; struct rqspinlock_timeout ts; int idx, ret = 0; u32 old, tail; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); if (resilient_virt_spin_lock_enabled()) return resilient_virt_spin_lock(lock); RES_INIT_TIMEOUT(ts); /* * Wait for in-progress pending->locked hand-overs with a bounded * number of spins so that we guarantee forward progress. * * 0,1,0 -> 0,0,1 */ if (val == _Q_PENDING_VAL) { int cnt = _Q_PENDING_LOOPS; val = atomic_cond_read_relaxed(&lock->val, (VAL != _Q_PENDING_VAL) || !cnt--); } /* * If we observe any contention; queue. */ if (val & ~_Q_LOCKED_MASK) goto queue; /* * trylock || pending * * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock */ val = queued_fetch_set_pending_acquire(lock); /* * If we observe contention, there is a concurrent locker. * * Undo and queue; our setting of PENDING might have made the * n,0,0 -> 0,0,0 transition fail and it will now be waiting * on @next to become !NULL. */ if (unlikely(val & ~_Q_LOCKED_MASK)) { /* Undo PENDING if we set it. */ if (!(val & _Q_PENDING_MASK)) clear_pending(lock); goto queue; } /* Deadlock detection entry already held after failing fast path. */ /* * We're pending, wait for the owner to go away. * * 0,1,1 -> *,1,0 * * this wait loop must be a load-acquire such that we match the * store-release that clears the locked bit and create lock * sequentiality; this is because not all * clear_pending_set_locked() implementations imply full * barriers. 
*/ if (val & _Q_LOCKED_MASK) { RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK)); } if (ret) { /* * We waited for the locked bit to go back to 0, as the pending * waiter, but timed out. We need to clear the pending bit since * we own it. Once a stuck owner has been recovered, the lock * must be restored to a valid state, hence removing the pending * bit is necessary. * * *,1,* -> *,0,* */ clear_pending(lock); lockevent_inc(rqspinlock_lock_timeout); goto err_release_entry; } /* * take ownership and clear the pending bit. * * 0,1,0 -> 0,0,1 */ clear_pending_set_locked(lock); lockevent_inc(lock_pending); return 0; /* * End of pending bit optimistic spinning and beginning of MCS * queuing. */ queue: /* * Do not queue if we're a waiter and someone is attempting this lock on * the same CPU. In case of NMIs, this prevents long timeouts where we * interrupt the pending waiter, and the owner, that will eventually * signal the head of our queue, both of which are logically but not * physically part of the queue, hence outside the scope of the idx > 0 * check above for the trylock fallback. */ if (check_deadlock_AA(lock)) { ret = -EDEADLK; goto err_release_entry; } lockevent_inc(lock_slowpath); /* Deadlock detection entry already held after failing fast path. */ node = this_cpu_ptr(&rqnodes[0].mcs); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); trace_contention_begin(lock, LCB_F_SPIN); /* * 4 nodes are allocated based on the assumption that there will * not be nested NMIs taking spinlocks. That may not be true in * some architectures even though the chance of needing more than * 4 nodes will still be extremely unlikely. When that happens, * we fall back to attempting a trylock operation without using * any MCS node. 
Unlike qspinlock which cannot fail, we have the * option of failing the slow path, and under contention, such a * trylock spinning will likely be treated unfairly due to lack of * queueing, hence do not spin. */ if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) { lockevent_inc(lock_no_node); if (!queued_spin_trylock(lock)) { ret = -EDEADLK; goto err_release_node; } goto release; } node = grab_mcs_node(node, idx); /* * Keep counts of non-zero index values: */ lockevent_cond_inc(lock_use_node2 + idx - 1, idx); /* * Ensure that we increment the head node->count before initialising * the actual node. If the compiler is kind enough to reorder these * stores, then an IRQ could overwrite our assignments. */ barrier(); node->locked = 0; node->next = NULL; /* * We touched a (possibly) cold cacheline in the per-cpu queue node; * attempt the trylock once more in the hope someone let go while we * weren't watching. */ if (queued_spin_trylock(lock)) goto release; /* * Ensure that the initialisation of @node is complete before we * publish the updated tail via xchg_tail() and potentially link * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. */ smp_wmb(); /* * Publish the updated tail. * We have already touched the queueing cacheline; don't bother with * pending stuff. * * p,*,* -> n,*,* */ old = xchg_tail(lock, tail); next = NULL; /* * if there was a previous node; link it and wait until reaching the * head of the waitqueue. */ if (old & _Q_TAIL_MASK) { int val; prev = decode_tail(old, rqnodes); /* Link @node into the waitqueue. */ WRITE_ONCE(prev->next, node); val = arch_mcs_spin_lock_contended(&node->locked); if (val == RES_TIMEOUT_VAL) { ret = -ETIMEDOUT; goto waitq_timeout; } /* * While waiting for the MCS lock, the next pointer may have * been set by another lock waiter. We optimistically load * the next pointer & prefetch the cacheline for writing * to reduce latency in the upcoming MCS unlock operation. 
*/ next = READ_ONCE(node->next); if (next) prefetchw(next); } /* * we're at the head of the waitqueue, wait for the owner & pending to * go away. * * *,x,y -> *,0,0 * * this wait loop must use a load-acquire such that we match the * store-release that clears the locked bit and create lock * sequentiality; this is because the set_locked() function below * does not imply a full barrier. * * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is * meant to span maximum allowed time per critical section, and we may * have both the owner of the lock and the pending bit waiter ahead of * us. */ RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2); val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK)); /* Disable queue destruction when we detect deadlocks. */ if (ret == -EDEADLK) { if (!next) next = smp_cond_load_relaxed(&node->next, (VAL)); arch_mcs_spin_unlock_contended(&next->locked); goto err_release_node; } waitq_timeout: if (ret) { /* * If the tail is still pointing to us, then we are the final waiter, * and are responsible for resetting the tail back to 0. Otherwise, if * the cmpxchg operation fails, we signal the next waiter to take exit * and try the same. For a waiter with tail node 'n': * * n,*,* -> 0,*,* * * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is * possible locked/pending bits keep changing and we see failures even * when we remain the head of wait queue. However, eventually, * pending bit owner will unset the pending bit, and new waiters * will queue behind us. This will leave the lock owner in * charge, and it will eventually either set locked bit to 0, or * leave it as 1, allowing us to make progress. * * We terminate the whole wait queue for two reasons. Firstly, * we eschew per-waiter timeouts with one applied at the head of * the wait queue. 
This allows everyone to break out faster * once we've seen the owner / pending waiter not responding for * the timeout duration from the head. Secondly, it avoids * complicated synchronization, because when not leaving in FIFO * order, prev's next pointer needs to be fixed up etc. */ if (!try_cmpxchg_tail(lock, tail, 0)) { next = smp_cond_load_relaxed(&node->next, VAL); WRITE_ONCE(next->locked, RES_TIMEOUT_VAL); } lockevent_inc(rqspinlock_lock_timeout); goto err_release_node; } /* * claim the lock: * * n,0,0 -> 0,0,1 : lock, uncontended * *,*,0 -> *,*,1 : lock, contended * * If the queue head is the only one in the queue (lock value == tail) * and nobody is pending, clear the tail code and grab the lock. * Otherwise, we only need to grab the lock. */ /* * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the * above wait condition, therefore any concurrent setting of * PENDING will make the uncontended transition fail. */ if ((val & _Q_TAIL_MASK) == tail) { if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) goto release; /* No contention */ } /* * Either somebody is queued behind us or _Q_PENDING_VAL got set * which will then detect the remaining tail and queue behind us * ensuring we'll see a @next. */ set_locked(lock); /* * contended path; wait for next if not observed yet, release. 
*/ if (!next) next = smp_cond_load_relaxed(&node->next, (VAL)); arch_mcs_spin_unlock_contended(&next->locked); release: trace_contention_end(lock, 0); /* * release the node */ __this_cpu_dec(rqnodes[0].mcs.count); return ret; err_release_node: trace_contention_end(lock, ret); __this_cpu_dec(rqnodes[0].mcs.count); err_release_entry: release_held_lock_entry(); return ret; } EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath); #endif /* CONFIG_QUEUED_SPINLOCKS */ __bpf_kfunc_start_defs(); static void bpf_prog_report_rqspinlock_violation(const char *str, void *lock, bool irqsave) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); struct bpf_stream_stage ss; struct bpf_prog *prog; prog = bpf_prog_find_from_stack(); if (!prog) return; bpf_stream_stage(ss, prog, BPF_STDERR, ({ bpf_stream_printk(ss, "ERROR: %s for bpf_res_spin_lock%s\n", str, irqsave ? "_irqsave" : ""); bpf_stream_printk(ss, "Attempted lock = 0x%px\n", lock); bpf_stream_printk(ss, "Total held locks = %d\n", rqh->cnt); for (int i = 0; i < min(RES_NR_HELD, rqh->cnt); i++) bpf_stream_printk(ss, "Held lock[%2d] = 0x%px\n", i, rqh->locks[i]); bpf_stream_dump_stack(ss); })); } #define REPORT_STR(ret) ({ (ret) == -ETIMEDOUT ? 
"Timeout detected" : "AA or ABBA deadlock detected"; })

/*
 * bpf_res_spin_lock - kfunc wrapper around res_spin_lock().
 *
 * Disables preemption before attempting the lock. On failure the violation
 * is reported, preemption is re-enabled and the error from res_spin_lock()
 * is returned. On success (0) preemption stays disabled; it is re-enabled
 * by bpf_res_spin_unlock().
 */
__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
{
	int ret;

	/* The cast below relies on both types having the same size. */
	BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock));

	preempt_disable();
	ret = res_spin_lock((rqspinlock_t *)lock);
	if (unlikely(ret)) {
		bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, false);
		preempt_enable();
		return ret;
	}
	return 0;
}

/* Release @lock, then undo the preempt_disable() done by bpf_res_spin_lock(). */
__bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock)
{
	res_spin_unlock((rqspinlock_t *)lock);
	preempt_enable();
}

/*
 * bpf_res_spin_lock_irqsave - as bpf_res_spin_lock(), with local IRQs saved
 * and disabled as well.
 *
 * On success the saved IRQ flags are stored through @flags__irq_flag, which
 * is written as a u64. On failure IRQ state and preemption are restored in
 * reverse order of acquisition and nothing is written to @flags__irq_flag.
 */
__bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
{
	u64 *ptr = (u64 *)flags__irq_flag;
	unsigned long flags;
	int ret;

	preempt_disable();
	local_irq_save(flags);
	ret = res_spin_lock((rqspinlock_t *)lock);
	if (unlikely(ret)) {
		bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, true);
		local_irq_restore(flags);
		preempt_enable();
		return ret;
	}
	*ptr = flags;
	return 0;
}

/*
 * Counterpart of bpf_res_spin_lock_irqsave(): release @lock, restore the IRQ
 * flags read from @flags__irq_flag, then re-enable preemption.
 */
__bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
{
	u64 *ptr = (u64 *)flags__irq_flag;
	/* Read the flags before dropping the lock. */
	unsigned long flags = *ptr;

	res_spin_unlock((rqspinlock_t *)lock);
	local_irq_restore(flags);
	preempt_enable();
}

__bpf_kfunc_end_defs();

/* BTF id set listing the four kfuncs defined above. */
BTF_KFUNCS_START(rqspinlock_kfunc_ids)
BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_res_spin_unlock)
BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore)
BTF_KFUNCS_END(rqspinlock_kfunc_ids)

static const struct btf_kfunc_id_set rqspinlock_kfunc_set = {
	.owner = THIS_MODULE,
	.set = &rqspinlock_kfunc_ids,
};

/* Register the kfunc set for BPF_PROG_TYPE_UNSPEC at late_initcall time. */
static __init int rqspinlock_register_kfuncs(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set);
}
late_initcall(rqspinlock_register_kfuncs);
4 4 4 4 4 4 4 16 16 16 16 16 16 19 19 19 19 19 3 17 3 16 16 19 7 3 6 1 3 4 6 6 6 2 2 2 2 2 3 4 1 3 3 1 2 1 2 1 2 6 6 2 2 2 2 2 2 8 2 1 2 1 2 7 2 2 2 2 1 2 7 2 3 2 3 3 5 5 5 5 2 2 5 3 3 3 3 5 5 5 5 2 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 
476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 
976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 // SPDX-License-Identifier: GPL-2.0-only /* * HWSIM IEEE 802.15.4 interface * * (C) 2018 Mojatau, Alexander Aring <aring@mojatau.com> * Copyright 2007-2012 Siemens AG * * Based on fakelb, original Written by: * Sergey Lapin <slapin@ossfans.org> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #include <linux/module.h> #include <linux/timer.h> #include <linux/platform_device.h> #include <linux/rtnetlink.h> #include <linux/netdevice.h> #include <linux/device.h> #include <linux/spinlock.h> #include <net/ieee802154_netdev.h> #include <net/mac802154.h> #include <net/cfg802154.h> #include <net/genetlink.h> #include "mac802154_hwsim.h" MODULE_DESCRIPTION("Software simulator of IEEE 802.15.4 radio(s) for mac802154"); MODULE_LICENSE("GPL"); static LIST_HEAD(hwsim_phys); static DEFINE_MUTEX(hwsim_phys_lock); static struct platform_device *mac802154hwsim_dev; /* MAC802154_HWSIM netlink family */ static struct genl_family hwsim_genl_family; static int hwsim_radio_idx; enum hwsim_multicast_groups { HWSIM_MCGRP_CONFIG, }; static const struct genl_multicast_group hwsim_mcgrps[] = { [HWSIM_MCGRP_CONFIG] = { .name = "config", }, }; struct hwsim_pib { u8 page; u8 channel; struct ieee802154_hw_addr_filt filt; enum ieee802154_filtering_level filt_level; struct rcu_head rcu; }; struct hwsim_edge_info { u8 lqi; struct rcu_head rcu; }; struct hwsim_edge { struct hwsim_phy 
*endpoint; struct hwsim_edge_info __rcu *info; struct list_head list; struct rcu_head rcu; }; struct hwsim_phy { struct ieee802154_hw *hw; u32 idx; struct hwsim_pib __rcu *pib; bool suspended; struct list_head edges; struct list_head list; }; static int hwsim_add_one(struct genl_info *info, struct device *dev, bool init); static void hwsim_del(struct hwsim_phy *phy); static int hwsim_hw_ed(struct ieee802154_hw *hw, u8 *level) { *level = 0xbe; return 0; } static int hwsim_update_pib(struct ieee802154_hw *hw, u8 page, u8 channel, struct ieee802154_hw_addr_filt *filt, enum ieee802154_filtering_level filt_level) { struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib, *pib_old; pib = kzalloc_obj(*pib, GFP_ATOMIC); if (!pib) return -ENOMEM; pib_old = rtnl_dereference(phy->pib); pib->page = page; pib->channel = channel; pib->filt.short_addr = filt->short_addr; pib->filt.pan_id = filt->pan_id; pib->filt.ieee_addr = filt->ieee_addr; pib->filt.pan_coord = filt->pan_coord; pib->filt_level = filt_level; rcu_assign_pointer(phy->pib, pib); kfree_rcu(pib_old, rcu); return 0; } static int hwsim_hw_channel(struct ieee802154_hw *hw, u8 page, u8 channel) { struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib; int ret; rcu_read_lock(); pib = rcu_dereference(phy->pib); ret = hwsim_update_pib(hw, page, channel, &pib->filt, pib->filt_level); rcu_read_unlock(); return ret; } static int hwsim_hw_addr_filt(struct ieee802154_hw *hw, struct ieee802154_hw_addr_filt *filt, unsigned long changed) { struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib; int ret; rcu_read_lock(); pib = rcu_dereference(phy->pib); ret = hwsim_update_pib(hw, pib->page, pib->channel, filt, pib->filt_level); rcu_read_unlock(); return ret; } static void hwsim_hw_receive(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi) { struct ieee802154_hdr hdr; struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib; rcu_read_lock(); pib = rcu_dereference(phy->pib); if (!pskb_may_pull(skb, 3)) { dev_dbg(hw->parent, 
"invalid frame\n"); goto drop; } memcpy(&hdr, skb->data, 3); /* Level 4 filtering: Frame fields validity */ if (pib->filt_level == IEEE802154_FILTERING_4_FRAME_FIELDS) { /* a) Drop reserved frame types */ switch (mac_cb(skb)->type) { case IEEE802154_FC_TYPE_BEACON: case IEEE802154_FC_TYPE_DATA: case IEEE802154_FC_TYPE_ACK: case IEEE802154_FC_TYPE_MAC_CMD: break; default: dev_dbg(hw->parent, "unrecognized frame type 0x%x\n", mac_cb(skb)->type); goto drop; } /* b) Drop reserved frame versions */ switch (hdr.fc.version) { case IEEE802154_2003_STD: case IEEE802154_2006_STD: case IEEE802154_STD: break; default: dev_dbg(hw->parent, "unrecognized frame version 0x%x\n", hdr.fc.version); goto drop; } /* c) PAN ID constraints */ if ((mac_cb(skb)->dest.mode == IEEE802154_ADDR_LONG || mac_cb(skb)->dest.mode == IEEE802154_ADDR_SHORT) && mac_cb(skb)->dest.pan_id != pib->filt.pan_id && mac_cb(skb)->dest.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST)) { dev_dbg(hw->parent, "unrecognized PAN ID %04x\n", le16_to_cpu(mac_cb(skb)->dest.pan_id)); goto drop; } /* d1) Short address constraints */ if (mac_cb(skb)->dest.mode == IEEE802154_ADDR_SHORT && mac_cb(skb)->dest.short_addr != pib->filt.short_addr && mac_cb(skb)->dest.short_addr != cpu_to_le16(IEEE802154_ADDR_BROADCAST)) { dev_dbg(hw->parent, "unrecognized short address %04x\n", le16_to_cpu(mac_cb(skb)->dest.short_addr)); goto drop; } /* d2) Extended address constraints */ if (mac_cb(skb)->dest.mode == IEEE802154_ADDR_LONG && mac_cb(skb)->dest.extended_addr != pib->filt.ieee_addr) { dev_dbg(hw->parent, "unrecognized long address 0x%016llx\n", mac_cb(skb)->dest.extended_addr); goto drop; } /* d4) Specific PAN coordinator case (no parent) */ if ((mac_cb(skb)->type == IEEE802154_FC_TYPE_DATA || mac_cb(skb)->type == IEEE802154_FC_TYPE_MAC_CMD) && mac_cb(skb)->dest.mode == IEEE802154_ADDR_NONE) { dev_dbg(hw->parent, "relaying is not supported\n"); goto drop; } /* e) Beacon frames follow specific PAN ID rules */ if (mac_cb(skb)->type 
== IEEE802154_FC_TYPE_BEACON && pib->filt.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST) && mac_cb(skb)->dest.pan_id != pib->filt.pan_id) { dev_dbg(hw->parent, "invalid beacon PAN ID %04x\n", le16_to_cpu(mac_cb(skb)->dest.pan_id)); goto drop; } } rcu_read_unlock(); ieee802154_rx_irqsafe(hw, skb, lqi); return; drop: rcu_read_unlock(); kfree_skb(skb); } static int hwsim_hw_xmit(struct ieee802154_hw *hw, struct sk_buff *skb) { struct hwsim_phy *current_phy = hw->priv; struct hwsim_pib *current_pib, *endpoint_pib; struct hwsim_edge_info *einfo; struct hwsim_edge *e; WARN_ON(current_phy->suspended); rcu_read_lock(); current_pib = rcu_dereference(current_phy->pib); list_for_each_entry_rcu(e, &current_phy->edges, list) { /* Can be changed later in rx_irqsafe, but this is only a * performance tweak. Received radio should drop the frame * in mac802154 stack anyway... so we don't need to be * 100% of locking here to check on suspended */ if (e->endpoint->suspended) continue; endpoint_pib = rcu_dereference(e->endpoint->pib); if (current_pib->page == endpoint_pib->page && current_pib->channel == endpoint_pib->channel) { struct sk_buff *newskb = pskb_copy(skb, GFP_ATOMIC); einfo = rcu_dereference(e->info); if (newskb) hwsim_hw_receive(e->endpoint->hw, newskb, einfo->lqi); } } rcu_read_unlock(); ieee802154_xmit_complete(hw, skb, false); return 0; } static int hwsim_hw_start(struct ieee802154_hw *hw) { struct hwsim_phy *phy = hw->priv; phy->suspended = false; return 0; } static void hwsim_hw_stop(struct ieee802154_hw *hw) { struct hwsim_phy *phy = hw->priv; phy->suspended = true; } static int hwsim_set_promiscuous_mode(struct ieee802154_hw *hw, const bool on) { enum ieee802154_filtering_level filt_level; struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib; int ret; if (on) filt_level = IEEE802154_FILTERING_NONE; else filt_level = IEEE802154_FILTERING_4_FRAME_FIELDS; rcu_read_lock(); pib = rcu_dereference(phy->pib); ret = hwsim_update_pib(hw, pib->page, pib->channel, 
&pib->filt, filt_level); rcu_read_unlock(); return ret; } static const struct ieee802154_ops hwsim_ops = { .owner = THIS_MODULE, .xmit_async = hwsim_hw_xmit, .ed = hwsim_hw_ed, .set_channel = hwsim_hw_channel, .start = hwsim_hw_start, .stop = hwsim_hw_stop, .set_promiscuous_mode = hwsim_set_promiscuous_mode, .set_hw_addr_filt = hwsim_hw_addr_filt, }; static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info) { return hwsim_add_one(info, &mac802154hwsim_dev->dev, false); } static int hwsim_del_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct hwsim_phy *phy, *tmp; s64 idx = -1; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]) return -EINVAL; idx = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); mutex_lock(&hwsim_phys_lock); list_for_each_entry_safe(phy, tmp, &hwsim_phys, list) { if (idx == phy->idx) { hwsim_del(phy); mutex_unlock(&hwsim_phys_lock); return 0; } } mutex_unlock(&hwsim_phys_lock); return -ENODEV; } static int append_radio_msg(struct sk_buff *skb, struct hwsim_phy *phy) { struct nlattr *nl_edges, *nl_edge; struct hwsim_edge_info *einfo; struct hwsim_edge *e; int ret; ret = nla_put_u32(skb, MAC802154_HWSIM_ATTR_RADIO_ID, phy->idx); if (ret < 0) return ret; rcu_read_lock(); if (list_empty(&phy->edges)) { rcu_read_unlock(); return 0; } nl_edges = nla_nest_start_noflag(skb, MAC802154_HWSIM_ATTR_RADIO_EDGES); if (!nl_edges) { rcu_read_unlock(); return -ENOBUFS; } list_for_each_entry_rcu(e, &phy->edges, list) { nl_edge = nla_nest_start_noflag(skb, MAC802154_HWSIM_ATTR_RADIO_EDGE); if (!nl_edge) { rcu_read_unlock(); nla_nest_cancel(skb, nl_edges); return -ENOBUFS; } ret = nla_put_u32(skb, MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID, e->endpoint->idx); if (ret < 0) { rcu_read_unlock(); nla_nest_cancel(skb, nl_edge); nla_nest_cancel(skb, nl_edges); return ret; } einfo = rcu_dereference(e->info); ret = nla_put_u8(skb, MAC802154_HWSIM_EDGE_ATTR_LQI, einfo->lqi); if (ret < 0) { rcu_read_unlock(); nla_nest_cancel(skb, nl_edge); 
nla_nest_cancel(skb, nl_edges); return ret; } nla_nest_end(skb, nl_edge); } rcu_read_unlock(); nla_nest_end(skb, nl_edges); return 0; } static int hwsim_get_radio(struct sk_buff *skb, struct hwsim_phy *phy, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; int res; hdr = genlmsg_put(skb, portid, seq, &hwsim_genl_family, flags, MAC802154_HWSIM_CMD_GET_RADIO); if (!hdr) return -EMSGSIZE; if (cb) genl_dump_check_consistent(cb, hdr); res = append_radio_msg(skb, phy); if (res < 0) goto out_err; genlmsg_end(skb, hdr); return 0; out_err: genlmsg_cancel(skb, hdr); return res; } static int hwsim_get_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct hwsim_phy *phy; struct sk_buff *skb; int idx, res = -ENODEV; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]) return -EINVAL; idx = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); mutex_lock(&hwsim_phys_lock); list_for_each_entry(phy, &hwsim_phys, list) { if (phy->idx != idx) continue; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) { res = -ENOMEM; goto out_err; } res = hwsim_get_radio(skb, phy, info->snd_portid, info->snd_seq, NULL, 0); if (res < 0) { nlmsg_free(skb); goto out_err; } res = genlmsg_reply(skb, info); break; } out_err: mutex_unlock(&hwsim_phys_lock); return res; } static int hwsim_dump_radio_nl(struct sk_buff *skb, struct netlink_callback *cb) { int idx = cb->args[0]; struct hwsim_phy *phy; int res; mutex_lock(&hwsim_phys_lock); if (idx == hwsim_radio_idx) goto done; list_for_each_entry(phy, &hwsim_phys, list) { if (phy->idx < idx) continue; res = hwsim_get_radio(skb, phy, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (res < 0) break; idx = phy->idx + 1; } cb->args[0] = idx; done: mutex_unlock(&hwsim_phys_lock); return skb->len; } /* caller need to held hwsim_phys_lock */ static struct hwsim_phy *hwsim_get_radio_by_id(uint32_t idx) { struct hwsim_phy *phy; list_for_each_entry(phy, &hwsim_phys, list) { if (phy->idx == idx) return phy; } 
return NULL; } static const struct nla_policy hwsim_edge_policy[MAC802154_HWSIM_EDGE_ATTR_MAX + 1] = { [MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID] = { .type = NLA_U32 }, [MAC802154_HWSIM_EDGE_ATTR_LQI] = { .type = NLA_U8 }, }; static struct hwsim_edge *hwsim_alloc_edge(struct hwsim_phy *endpoint, u8 lqi) { struct hwsim_edge_info *einfo; struct hwsim_edge *e; e = kzalloc_obj(*e); if (!e) return NULL; einfo = kzalloc_obj(*einfo); if (!einfo) { kfree(e); return NULL; } einfo->lqi = 0xff; rcu_assign_pointer(e->info, einfo); e->endpoint = endpoint; return e; } static void hwsim_free_edge(struct hwsim_edge *e) { struct hwsim_edge_info *einfo; rcu_read_lock(); einfo = rcu_dereference(e->info); rcu_read_unlock(); kfree_rcu(einfo, rcu); kfree_rcu(e, rcu); } static int hwsim_new_edge_nl(struct sk_buff *msg, struct genl_info *info) { struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1]; struct hwsim_phy *phy_v0, *phy_v1; struct hwsim_edge *e; u32 v0, v1; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] || !info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE]) return -EINVAL; if (nla_parse_nested_deprecated(edge_attrs, MAC802154_HWSIM_EDGE_ATTR_MAX, info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE], hwsim_edge_policy, NULL)) return -EINVAL; if (!edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]) return -EINVAL; v0 = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); v1 = nla_get_u32(edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]); if (v0 == v1) return -EINVAL; mutex_lock(&hwsim_phys_lock); phy_v0 = hwsim_get_radio_by_id(v0); if (!phy_v0) { mutex_unlock(&hwsim_phys_lock); return -ENOENT; } phy_v1 = hwsim_get_radio_by_id(v1); if (!phy_v1) { mutex_unlock(&hwsim_phys_lock); return -ENOENT; } rcu_read_lock(); list_for_each_entry_rcu(e, &phy_v0->edges, list) { if (e->endpoint->idx == v1) { mutex_unlock(&hwsim_phys_lock); rcu_read_unlock(); return -EEXIST; } } rcu_read_unlock(); e = hwsim_alloc_edge(phy_v1, 0xff); if (!e) { mutex_unlock(&hwsim_phys_lock); return -ENOMEM; } 
	list_add_rcu(&e->list, &phy_v0->edges);
	/* Wait for an RCU grace period while still holding hwsim_phys_lock,
	 * so that a second call of this function cannot run before the
	 * updated edges list is visible.
	 */
	synchronize_rcu();
	mutex_unlock(&hwsim_phys_lock);

	return 0;
}

/*
 * Netlink DEL_EDGE handler: remove the edge from radio
 * MAC802154_HWSIM_ATTR_RADIO_ID to the endpoint named in the nested
 * MAC802154_HWSIM_ATTR_RADIO_EDGE attribute.
 *
 * Return: 0 on success, -EINVAL on missing/invalid attributes, -ENOENT if
 * the source radio or the edge does not exist.
 */
static int hwsim_del_edge_nl(struct sk_buff *msg, struct genl_info *info)
{
	struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1];
	struct hwsim_phy *phy_v0;
	struct hwsim_edge *e;
	u32 v0, v1;

	if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] ||
	    !info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE])
		return -EINVAL;

	if (nla_parse_nested_deprecated(edge_attrs, MAC802154_HWSIM_EDGE_ATTR_MAX, info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE], hwsim_edge_policy, NULL))
		return -EINVAL;

	if (!edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID])
		return -EINVAL;

	v0 = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]);
	v1 = nla_get_u32(edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]);

	mutex_lock(&hwsim_phys_lock);
	phy_v0 = hwsim_get_radio_by_id(v0);
	if (!phy_v0) {
		mutex_unlock(&hwsim_phys_lock);
		return -ENOENT;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(e, &phy_v0->edges, list) {
		if (e->endpoint->idx == v1) {
			/* Leave the read-side section before unlinking;
			 * the function returns without iterating further.
			 */
			rcu_read_unlock();
			list_del_rcu(&e->list);
			hwsim_free_edge(e);
			/* same again - wait until list changes are done */
			synchronize_rcu();
			mutex_unlock(&hwsim_phys_lock);
			return 0;
		}
	}
	rcu_read_unlock();

	mutex_unlock(&hwsim_phys_lock);

	return -ENOENT;
}

/*
 * Netlink SET_EDGE handler: replace the LQI of an existing edge. Requires
 * both the endpoint id and the LQI inside the nested
 * MAC802154_HWSIM_ATTR_RADIO_EDGE attribute.
 */
static int hwsim_set_edge_lqi(struct sk_buff *msg, struct genl_info *info)
{
	struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1];
	struct hwsim_edge_info *einfo, *einfo_old;
	struct hwsim_phy *phy_v0;
	struct hwsim_edge *e;
	u32 v0, v1;
	u8 lqi;

	if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] ||
	    !info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE])
		return -EINVAL;

	if (nla_parse_nested_deprecated(edge_attrs, MAC802154_HWSIM_EDGE_ATTR_MAX, info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE], hwsim_edge_policy, NULL))
		return -EINVAL;

	if (!edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID] ||
!edge_attrs[MAC802154_HWSIM_EDGE_ATTR_LQI]) return -EINVAL; v0 = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); v1 = nla_get_u32(edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]); lqi = nla_get_u8(edge_attrs[MAC802154_HWSIM_EDGE_ATTR_LQI]); mutex_lock(&hwsim_phys_lock); phy_v0 = hwsim_get_radio_by_id(v0); if (!phy_v0) { mutex_unlock(&hwsim_phys_lock); return -ENOENT; } einfo = kzalloc_obj(*einfo); if (!einfo) { mutex_unlock(&hwsim_phys_lock); return -ENOMEM; } rcu_read_lock(); list_for_each_entry_rcu(e, &phy_v0->edges, list) { if (e->endpoint->idx == v1) { einfo->lqi = lqi; einfo_old = rcu_replace_pointer(e->info, einfo, lockdep_is_held(&hwsim_phys_lock)); rcu_read_unlock(); kfree_rcu(einfo_old, rcu); mutex_unlock(&hwsim_phys_lock); return 0; } } rcu_read_unlock(); kfree(einfo); mutex_unlock(&hwsim_phys_lock); return -ENOENT; } /* MAC802154_HWSIM netlink policy */ static const struct nla_policy hwsim_genl_policy[MAC802154_HWSIM_ATTR_MAX + 1] = { [MAC802154_HWSIM_ATTR_RADIO_ID] = { .type = NLA_U32 }, [MAC802154_HWSIM_ATTR_RADIO_EDGE] = { .type = NLA_NESTED }, [MAC802154_HWSIM_ATTR_RADIO_EDGES] = { .type = NLA_NESTED }, }; /* Generic Netlink operations array */ static const struct genl_small_ops hwsim_nl_ops[] = { { .cmd = MAC802154_HWSIM_CMD_NEW_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_new_radio_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MAC802154_HWSIM_CMD_DEL_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_del_radio_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MAC802154_HWSIM_CMD_GET_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_get_radio_nl, .dumpit = hwsim_dump_radio_nl, }, { .cmd = MAC802154_HWSIM_CMD_NEW_EDGE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_new_edge_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MAC802154_HWSIM_CMD_DEL_EDGE, .validate = GENL_DONT_VALIDATE_STRICT | 
GENL_DONT_VALIDATE_DUMP, .doit = hwsim_del_edge_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MAC802154_HWSIM_CMD_SET_EDGE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_set_edge_lqi, .flags = GENL_UNS_ADMIN_PERM, }, }; static struct genl_family hwsim_genl_family __ro_after_init = { .name = "MAC802154_HWSIM", .version = 1, .maxattr = MAC802154_HWSIM_ATTR_MAX, .policy = hwsim_genl_policy, .module = THIS_MODULE, .small_ops = hwsim_nl_ops, .n_small_ops = ARRAY_SIZE(hwsim_nl_ops), .resv_start_op = MAC802154_HWSIM_CMD_NEW_EDGE + 1, .mcgrps = hwsim_mcgrps, .n_mcgrps = ARRAY_SIZE(hwsim_mcgrps), }; static void hwsim_mcast_config_msg(struct sk_buff *mcast_skb, struct genl_info *info) { if (info) genl_notify(&hwsim_genl_family, mcast_skb, info, HWSIM_MCGRP_CONFIG, GFP_KERNEL); else genlmsg_multicast(&hwsim_genl_family, mcast_skb, 0, HWSIM_MCGRP_CONFIG, GFP_KERNEL); } static void hwsim_mcast_new_radio(struct genl_info *info, struct hwsim_phy *phy) { struct sk_buff *mcast_skb; void *data; mcast_skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!mcast_skb) return; data = genlmsg_put(mcast_skb, 0, 0, &hwsim_genl_family, 0, MAC802154_HWSIM_CMD_NEW_RADIO); if (!data) goto out_err; if (append_radio_msg(mcast_skb, phy) < 0) goto out_err; genlmsg_end(mcast_skb, data); hwsim_mcast_config_msg(mcast_skb, info); return; out_err: genlmsg_cancel(mcast_skb, data); nlmsg_free(mcast_skb); } static void hwsim_edge_unsubscribe_me(struct hwsim_phy *phy) { struct hwsim_phy *tmp; struct hwsim_edge *e; rcu_read_lock(); /* going to all phy edges and remove phy from it */ list_for_each_entry(tmp, &hwsim_phys, list) { list_for_each_entry_rcu(e, &tmp->edges, list) { if (e->endpoint->idx == phy->idx) { list_del_rcu(&e->list); hwsim_free_edge(e); } } } rcu_read_unlock(); synchronize_rcu(); } static int hwsim_subscribe_all_others(struct hwsim_phy *phy) { struct hwsim_phy *sub; struct hwsim_edge *e; list_for_each_entry(sub, &hwsim_phys, list) { e = 
hwsim_alloc_edge(sub, 0xff); if (!e) goto me_fail; list_add_rcu(&e->list, &phy->edges); } list_for_each_entry(sub, &hwsim_phys, list) { e = hwsim_alloc_edge(phy, 0xff); if (!e) goto sub_fail; list_add_rcu(&e->list, &sub->edges); } return 0; sub_fail: hwsim_edge_unsubscribe_me(phy); me_fail: rcu_read_lock(); list_for_each_entry_rcu(e, &phy->edges, list) { list_del_rcu(&e->list); hwsim_free_edge(e); } rcu_read_unlock(); return -ENOMEM; } static int hwsim_add_one(struct genl_info *info, struct device *dev, bool init) { struct ieee802154_hw *hw; struct hwsim_phy *phy; struct hwsim_pib *pib; int idx; int err; idx = hwsim_radio_idx++; hw = ieee802154_alloc_hw(sizeof(*phy), &hwsim_ops); if (!hw) return -ENOMEM; phy = hw->priv; phy->hw = hw; /* 868 MHz BPSK 802.15.4-2003 */ hw->phy->supported.channels[0] |= 1; /* 915 MHz BPSK 802.15.4-2003 */ hw->phy->supported.channels[0] |= 0x7fe; /* 2.4 GHz O-QPSK 802.15.4-2003 */ hw->phy->supported.channels[0] |= 0x7FFF800; /* 868 MHz ASK 802.15.4-2006 */ hw->phy->supported.channels[1] |= 1; /* 915 MHz ASK 802.15.4-2006 */ hw->phy->supported.channels[1] |= 0x7fe; /* 868 MHz O-QPSK 802.15.4-2006 */ hw->phy->supported.channels[2] |= 1; /* 915 MHz O-QPSK 802.15.4-2006 */ hw->phy->supported.channels[2] |= 0x7fe; /* 2.4 GHz CSS 802.15.4a-2007 */ hw->phy->supported.channels[3] |= 0x3fff; /* UWB Sub-gigahertz 802.15.4a-2007 */ hw->phy->supported.channels[4] |= 1; /* UWB Low band 802.15.4a-2007 */ hw->phy->supported.channels[4] |= 0x1e; /* UWB High band 802.15.4a-2007 */ hw->phy->supported.channels[4] |= 0xffe0; /* 750 MHz O-QPSK 802.15.4c-2009 */ hw->phy->supported.channels[5] |= 0xf; /* 750 MHz MPSK 802.15.4c-2009 */ hw->phy->supported.channels[5] |= 0xf0; /* 950 MHz BPSK 802.15.4d-2009 */ hw->phy->supported.channels[6] |= 0x3ff; /* 950 MHz GFSK 802.15.4d-2009 */ hw->phy->supported.channels[6] |= 0x3ffc00; ieee802154_random_extended_addr(&hw->phy->perm_extended_addr); /* hwsim phy channel 13 as default */ hw->phy->current_channel = 13; pib = 
kzalloc_obj(*pib); if (!pib) { err = -ENOMEM; goto err_pib; } pib->channel = 13; pib->filt.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); pib->filt.pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST); rcu_assign_pointer(phy->pib, pib); phy->idx = idx; INIT_LIST_HEAD(&phy->edges); hw->flags = IEEE802154_HW_PROMISCUOUS; hw->parent = dev; err = ieee802154_register_hw(hw); if (err) goto err_reg; mutex_lock(&hwsim_phys_lock); if (init) { err = hwsim_subscribe_all_others(phy); if (err < 0) { mutex_unlock(&hwsim_phys_lock); goto err_subscribe; } } list_add_tail(&phy->list, &hwsim_phys); mutex_unlock(&hwsim_phys_lock); hwsim_mcast_new_radio(info, phy); return idx; err_subscribe: ieee802154_unregister_hw(phy->hw); err_reg: kfree(pib); err_pib: ieee802154_free_hw(phy->hw); return err; } static void hwsim_del(struct hwsim_phy *phy) { struct hwsim_pib *pib; struct hwsim_edge *e; hwsim_edge_unsubscribe_me(phy); list_del(&phy->list); rcu_read_lock(); list_for_each_entry_rcu(e, &phy->edges, list) { list_del_rcu(&e->list); hwsim_free_edge(e); } pib = rcu_dereference(phy->pib); rcu_read_unlock(); kfree_rcu(pib, rcu); ieee802154_unregister_hw(phy->hw); ieee802154_free_hw(phy->hw); } static int hwsim_probe(struct platform_device *pdev) { struct hwsim_phy *phy, *tmp; int err, i; for (i = 0; i < 2; i++) { err = hwsim_add_one(NULL, &pdev->dev, true); if (err < 0) goto err_slave; } dev_info(&pdev->dev, "Added 2 mac802154 hwsim hardware radios\n"); return 0; err_slave: mutex_lock(&hwsim_phys_lock); list_for_each_entry_safe(phy, tmp, &hwsim_phys, list) hwsim_del(phy); mutex_unlock(&hwsim_phys_lock); return err; } static void hwsim_remove(struct platform_device *pdev) { struct hwsim_phy *phy, *tmp; mutex_lock(&hwsim_phys_lock); list_for_each_entry_safe(phy, tmp, &hwsim_phys, list) hwsim_del(phy); mutex_unlock(&hwsim_phys_lock); } static struct platform_driver mac802154hwsim_driver = { .probe = hwsim_probe, .remove = hwsim_remove, .driver = { .name = "mac802154_hwsim", }, }; static __init 
int hwsim_init_module(void) { int rc; rc = genl_register_family(&hwsim_genl_family); if (rc) return rc; mac802154hwsim_dev = platform_device_register_simple("mac802154_hwsim", -1, NULL, 0); if (IS_ERR(mac802154hwsim_dev)) { rc = PTR_ERR(mac802154hwsim_dev); goto platform_dev; } rc = platform_driver_register(&mac802154hwsim_driver); if (rc < 0) goto platform_drv; return 0; platform_drv: platform_device_unregister(mac802154hwsim_dev); platform_dev: genl_unregister_family(&hwsim_genl_family); return rc; } static __exit void hwsim_remove_module(void) { genl_unregister_family(&hwsim_genl_family); platform_driver_unregister(&mac802154hwsim_driver); platform_device_unregister(mac802154hwsim_dev); } module_init(hwsim_init_module); module_exit(hwsim_remove_module);
2 1 1 2 4 1 2 1 125 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 // SPDX-License-Identifier: GPL-2.0-only /* * Generic netlink handshake service * * Author: Chuck Lever <chuck.lever@oracle.com> * * Copyright (c) 2023, Oracle and/or its affiliates. */ #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/mm.h> #include <net/sock.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include <kunit/visibility.h> #include <uapi/linux/handshake.h> #include "handshake.h" #include "genl.h" #include <trace/events/handshake.h> /** * handshake_genl_notify - Notify handlers that a request is waiting * @net: target network namespace * @proto: handshake protocol * @flags: memory allocation control flags * * Returns zero on success or a negative errno if notification failed. 
*/ int handshake_genl_notify(struct net *net, const struct handshake_proto *proto, gfp_t flags) { struct sk_buff *msg; void *hdr; /* Disable notifications during unit testing */ if (!test_bit(HANDSHAKE_F_PROTO_NOTIFY, &proto->hp_flags)) return 0; if (!genl_has_listeners(&handshake_nl_family, net, proto->hp_handler_class)) return -ESRCH; msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, flags); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &handshake_nl_family, 0, HANDSHAKE_CMD_READY); if (!hdr) goto out_free; if (nla_put_u32(msg, HANDSHAKE_A_ACCEPT_HANDLER_CLASS, proto->hp_handler_class) < 0) { genlmsg_cancel(msg, hdr); goto out_free; } genlmsg_end(msg, hdr); return genlmsg_multicast_netns(&handshake_nl_family, net, msg, 0, proto->hp_handler_class, flags); out_free: nlmsg_free(msg); return -EMSGSIZE; } /** * handshake_genl_put - Create a generic netlink message header * @msg: buffer in which to create the header * @info: generic netlink message context * * Returns a ready-to-use header, or NULL. */ struct nlmsghdr *handshake_genl_put(struct sk_buff *msg, struct genl_info *info) { return genlmsg_put(msg, info->snd_portid, info->snd_seq, &handshake_nl_family, 0, info->genlhdr->cmd); } EXPORT_SYMBOL(handshake_genl_put); int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct handshake_net *hn = handshake_pernet(net); struct handshake_req *req = NULL; struct socket *sock; int class, err; err = -EOPNOTSUPP; if (!hn) goto out_status; err = -EINVAL; if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_ACCEPT_HANDLER_CLASS)) goto out_status; class = nla_get_u32(info->attrs[HANDSHAKE_A_ACCEPT_HANDLER_CLASS]); err = -EAGAIN; req = handshake_req_next(hn, class); if (req) { sock = req->hr_sk->sk_socket; FD_PREPARE(fdf, O_CLOEXEC, sock->file); if (fdf.err) { err = fdf.err; goto out_complete; } get_file(sock->file); /* FD_PREPARE() consumes a reference. 
*/ err = req->hr_proto->hp_accept(req, info, fd_prepare_fd(fdf)); if (err) goto out_complete; /* Automatic cleanup handles fput */ trace_handshake_cmd_accept(net, req, req->hr_sk, fd_prepare_fd(fdf)); fd_publish(fdf); return 0; } out_complete: if (req) handshake_complete(req, -EIO, NULL); out_status: trace_handshake_cmd_accept_err(net, req, NULL, err); return err; } int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct handshake_req *req; struct socket *sock; int fd, status, err; if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_DONE_SOCKFD)) return -EINVAL; fd = nla_get_s32(info->attrs[HANDSHAKE_A_DONE_SOCKFD]); sock = sockfd_lookup(fd, &err); if (!sock) return err; req = handshake_req_hash_lookup(sock->sk); if (!req) { err = -EBUSY; trace_handshake_cmd_done_err(net, req, sock->sk, err); sockfd_put(sock); return err; } trace_handshake_cmd_done(net, req, sock->sk, fd); status = -EIO; if (info->attrs[HANDSHAKE_A_DONE_STATUS]) status = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]); handshake_complete(req, status, info); sockfd_put(sock); return 0; } static unsigned int handshake_net_id; static int __net_init handshake_net_init(struct net *net) { struct handshake_net *hn = net_generic(net, handshake_net_id); unsigned long tmp; struct sysinfo si; /* * Arbitrary limit to prevent handshakes that do not make * progress from clogging up the system. The cap scales up * with the amount of physical memory on the system. */ si_meminfo(&si); tmp = si.totalram / (25 * si.mem_unit); hn->hn_pending_max = clamp(tmp, 3UL, 50UL); spin_lock_init(&hn->hn_lock); hn->hn_pending = 0; hn->hn_flags = 0; INIT_LIST_HEAD(&hn->hn_requests); return 0; } static void __net_exit handshake_net_exit(struct net *net) { struct handshake_net *hn = net_generic(net, handshake_net_id); struct handshake_req *req; LIST_HEAD(requests); /* * Drain the net's pending list. 
Requests that have been * accepted and are in progress will be destroyed when * the socket is closed. */ spin_lock(&hn->hn_lock); set_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags); list_splice_init(&requests, &hn->hn_requests); spin_unlock(&hn->hn_lock); while (!list_empty(&requests)) { req = list_first_entry(&requests, struct handshake_req, hr_list); list_del(&req->hr_list); /* * Requests on this list have not yet been * accepted, so they do not have an fd to put. */ handshake_complete(req, -ETIMEDOUT, NULL); } } static struct pernet_operations handshake_genl_net_ops = { .init = handshake_net_init, .exit = handshake_net_exit, .id = &handshake_net_id, .size = sizeof(struct handshake_net), }; /** * handshake_pernet - Get the handshake private per-net structure * @net: network namespace * * Returns a pointer to the net's private per-net structure for the * handshake module, or NULL if handshake_init() failed. */ struct handshake_net *handshake_pernet(struct net *net) { return handshake_net_id ? net_generic(net, handshake_net_id) : NULL; } EXPORT_SYMBOL_IF_KUNIT(handshake_pernet); static int __init handshake_init(void) { int ret; ret = handshake_req_hash_init(); if (ret) { pr_warn("handshake: hash initialization failed (%d)\n", ret); return ret; } ret = genl_register_family(&handshake_nl_family); if (ret) { pr_warn("handshake: netlink registration failed (%d)\n", ret); handshake_req_hash_destroy(); return ret; } /* * ORDER: register_pernet_subsys must be done last. * * If initialization does not make it past pernet_subsys * registration, then handshake_net_id will remain 0. That * shunts the handshake consumer API to return ENOTSUPP * to prevent it from dereferencing something that hasn't * been allocated. 
*/ ret = register_pernet_subsys(&handshake_genl_net_ops); if (ret) { pr_warn("handshake: pernet registration failed (%d)\n", ret); genl_unregister_family(&handshake_nl_family); handshake_req_hash_destroy(); } return ret; } static void __exit handshake_exit(void) { unregister_pernet_subsys(&handshake_genl_net_ops); handshake_net_id = 0; handshake_req_hash_destroy(); genl_unregister_family(&handshake_nl_family); } module_init(handshake_init); module_exit(handshake_exit);
142 142 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle firewalling core * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> * Bart De Schuymer <bdschuym@pandora.be> * * Lennert dedicates this file to Kerstin Wurdinger. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/in_route.h> #include <linux/inetdevice.h> #include <net/route.h> #include "br_private.h" #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { } static void fake_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { } static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) { return NULL; } static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { return NULL; } static unsigned int fake_mtu(const struct dst_entry *dst) { return dst->dev->mtu; } static struct dst_ops fake_dst_ops = { .family = AF_INET, .update_pmtu = fake_update_pmtu, .redirect = fake_redirect, .cow_metrics = fake_cow_metrics, .neigh_lookup = fake_neigh_lookup, .mtu = fake_mtu, }; /* * Initialize bogus route table used to keep netfilter happy. * Currently, we fill in the PMTU entry because netfilter * refragmentation needs it, and the rt_flags entry because * ipt_REJECT needs it. Future netfilter modules might * require us to fill additional fields. 
*/ void br_netfilter_rtable_init(struct net_bridge *br) { struct rtable *rt = &br->fake_rtable; rcuref_init(&rt->dst.__rcuref, 1); rt->dst.dev = br->dev; dst_init_metrics(&rt->dst, br->metrics, false); dst_metric_set(&rt->dst, RTAX_MTU, br->dev->mtu); rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; rt->dst.ops = &fake_dst_ops; } int __init br_nf_core_init(void) { return dst_entries_init(&fake_dst_ops); } void br_nf_core_fini(void) { dst_entries_destroy(&fake_dst_ops); }
2 2 2 2 2 48 9 35 54 5 7 1 1 3 4 3 2 1 1 2 2 10 1 1 1 1 1 1 2 2 4 1 2 1 4 4 4 1 2 2 2 2 1 10 10 10 6 1 1 2 1 4 1 4 1 3 2 3 2 2 3 5 5 5 5 6 27 3 14 4 1 2 3 8 3 2 6 2 2 2 2 4 5 5 2 18 18 18 4 12 14 10 4 6 4 4 5 5 8 2 10 6 6 6 6 6 49 49 48 48 2 2 2 3 1 2 7 6 1 7 1 1 1 1 1 1 2 3 2 10 10 4 12 1 11 3 2 1 4 1 2 2 1 2 8 1 5 2 9 1 8 8 5 3 8 1 1 1 1 2 3 5 6 6 7 1 6 4 4 4 1 2 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 
434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 
934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 // SPDX-License-Identifier: GPL-2.0-only /* * Netlink 
interface for IEEE 802.15.4 stack * * Copyright 2007, 2008 Siemens AG * * Written by: * Sergey Lapin <slapin@ossfans.org> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Maxim Osipov <maxim.osipov@siemens.com> */ #include <linux/gfp.h> #include <linux/kernel.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/ieee802154.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/sock.h> #include <linux/nl802154.h> #include <linux/export.h> #include <net/af_ieee802154.h> #include <net/ieee802154_netdev.h> #include <net/cfg802154.h> #include "ieee802154.h" static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr, int padattr) { return nla_put_u64_64bit(msg, type, swab64((__force u64)hwaddr), padattr); } static __le64 nla_get_hwaddr(const struct nlattr *nla) { return ieee802154_devaddr_from_raw(nla_data(nla)); } static int nla_put_shortaddr(struct sk_buff *msg, int type, __le16 addr) { return nla_put_u16(msg, type, le16_to_cpu(addr)); } static __le16 nla_get_shortaddr(const struct nlattr *nla) { return cpu_to_le16(nla_get_u16(nla)); } static int ieee802154_nl_start_confirm(struct net_device *dev, u8 status) { struct sk_buff *msg; pr_debug("%s\n", __func__); msg = ieee802154_nl_create(0, IEEE802154_START_CONF); if (!msg) return -ENOBUFS; if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, dev->dev_addr) || nla_put_u8(msg, IEEE802154_ATTR_STATUS, status)) goto nla_put_failure; return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP); nla_put_failure: nlmsg_free(msg); return -ENOBUFS; } static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, struct net_device *dev) { void *hdr; struct wpan_phy *phy; struct ieee802154_mlme_ops *ops; __le16 short_addr, pan_id; pr_debug("%s\n", __func__); hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags, 
IEEE802154_LIST_IFACE); if (!hdr) goto out; ops = ieee802154_mlme_ops(dev); phy = dev->ieee802154_ptr->wpan_phy; BUG_ON(!phy); get_device(&phy->dev); rtnl_lock(); short_addr = dev->ieee802154_ptr->short_addr; pan_id = dev->ieee802154_ptr->pan_id; rtnl_unlock(); if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, dev->dev_addr) || nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, short_addr) || nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, pan_id)) goto nla_put_failure; if (ops->get_mac_params) { struct ieee802154_mac_params params; rtnl_lock(); ops->get_mac_params(dev, &params); rtnl_unlock(); if (nla_put_s8(msg, IEEE802154_ATTR_TXPOWER, params.transmit_power / 100) || nla_put_u8(msg, IEEE802154_ATTR_LBT_ENABLED, params.lbt) || nla_put_u8(msg, IEEE802154_ATTR_CCA_MODE, params.cca.mode) || nla_put_s32(msg, IEEE802154_ATTR_CCA_ED_LEVEL, params.cca_ed_level / 100) || nla_put_u8(msg, IEEE802154_ATTR_CSMA_RETRIES, params.csma_retries) || nla_put_u8(msg, IEEE802154_ATTR_CSMA_MIN_BE, params.min_be) || nla_put_u8(msg, IEEE802154_ATTR_CSMA_MAX_BE, params.max_be) || nla_put_s8(msg, IEEE802154_ATTR_FRAME_RETRIES, params.frame_retries)) goto nla_put_failure; } wpan_phy_put(phy); genlmsg_end(msg, hdr); return 0; nla_put_failure: wpan_phy_put(phy); genlmsg_cancel(msg, hdr); out: return -EMSGSIZE; } /* Requests from userspace */ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) { struct net_device *dev; if (info->attrs[IEEE802154_ATTR_DEV_NAME]) { char name[IFNAMSIZ + 1]; nla_strscpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], sizeof(name)); dev = dev_get_by_name(&init_net, name); } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) { dev = dev_get_by_index(&init_net, nla_get_u32(info->attrs[IEEE802154_ATTR_DEV_INDEX])); } else { return NULL; } if (!dev) 
return NULL; if (dev->type != ARPHRD_IEEE802154) { dev_put(dev); return NULL; } return dev; } int ieee802154_associate_req(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct ieee802154_addr addr; u8 page; int ret = -EOPNOTSUPP; if (!info->attrs[IEEE802154_ATTR_CHANNEL] || !info->attrs[IEEE802154_ATTR_COORD_PAN_ID] || (!info->attrs[IEEE802154_ATTR_COORD_HW_ADDR] && !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]) || !info->attrs[IEEE802154_ATTR_CAPABILITY]) return -EINVAL; dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (!ieee802154_mlme_ops(dev)->assoc_req) goto out; if (info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]) { addr.mode = IEEE802154_ADDR_LONG; addr.extended_addr = nla_get_hwaddr( info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]); } else { addr.mode = IEEE802154_ADDR_SHORT; addr.short_addr = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]); } addr.pan_id = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0); ret = ieee802154_mlme_ops(dev)->assoc_req(dev, &addr, nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]), page, nla_get_u8(info->attrs[IEEE802154_ATTR_CAPABILITY])); out: dev_put(dev); return ret; } int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct ieee802154_addr addr; int ret = -EOPNOTSUPP; if (!info->attrs[IEEE802154_ATTR_STATUS] || !info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] || !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) return -EINVAL; dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (!ieee802154_mlme_ops(dev)->assoc_resp) goto out; addr.mode = IEEE802154_ADDR_LONG; addr.extended_addr = nla_get_hwaddr( info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]); rtnl_lock(); addr.pan_id = dev->ieee802154_ptr->pan_id; rtnl_unlock(); ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr, nla_get_shortaddr(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]), 
nla_get_u8(info->attrs[IEEE802154_ATTR_STATUS])); out: dev_put(dev); return ret; } int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct ieee802154_addr addr; int ret = -EOPNOTSUPP; if ((!info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] && !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) || !info->attrs[IEEE802154_ATTR_REASON]) return -EINVAL; dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (!ieee802154_mlme_ops(dev)->disassoc_req) goto out; if (info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]) { addr.mode = IEEE802154_ADDR_LONG; addr.extended_addr = nla_get_hwaddr( info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]); } else { addr.mode = IEEE802154_ADDR_SHORT; addr.short_addr = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]); } rtnl_lock(); addr.pan_id = dev->ieee802154_ptr->pan_id; rtnl_unlock(); ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr, nla_get_u8(info->attrs[IEEE802154_ATTR_REASON])); out: dev_put(dev); return ret; } /* PANid, channel, beacon_order = 15, superframe_order = 15, * PAN_coordinator, battery_life_extension = 0, * coord_realignment = 0, security_enable = 0 */ int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct ieee802154_addr addr; u8 channel, bcn_ord, sf_ord; u8 page; int pan_coord, blx, coord_realign; int ret = -EBUSY; if (!info->attrs[IEEE802154_ATTR_COORD_PAN_ID] || !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR] || !info->attrs[IEEE802154_ATTR_CHANNEL] || !info->attrs[IEEE802154_ATTR_BCN_ORD] || !info->attrs[IEEE802154_ATTR_SF_ORD] || !info->attrs[IEEE802154_ATTR_PAN_COORD] || !info->attrs[IEEE802154_ATTR_BAT_EXT] || !info->attrs[IEEE802154_ATTR_COORD_REALIGN] ) return -EINVAL; dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (netif_running(dev)) goto out; if (!ieee802154_mlme_ops(dev)->start_req) { ret = -EOPNOTSUPP; goto out; } addr.mode = IEEE802154_ADDR_SHORT; addr.short_addr = nla_get_shortaddr( 
info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]); addr.pan_id = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); channel = nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]); bcn_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_BCN_ORD]); sf_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_SF_ORD]); pan_coord = nla_get_u8(info->attrs[IEEE802154_ATTR_PAN_COORD]); blx = nla_get_u8(info->attrs[IEEE802154_ATTR_BAT_EXT]); coord_realign = nla_get_u8(info->attrs[IEEE802154_ATTR_COORD_REALIGN]); page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0); if (addr.short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) { ieee802154_nl_start_confirm(dev, IEEE802154_NO_SHORT_ADDRESS); dev_put(dev); return -EINVAL; } rtnl_lock(); ret = ieee802154_mlme_ops(dev)->start_req(dev, &addr, channel, page, bcn_ord, sf_ord, pan_coord, blx, coord_realign); rtnl_unlock(); /* FIXME: add validation for unused parameters to be sane * for SoftMAC */ ieee802154_nl_start_confirm(dev, IEEE802154_SUCCESS); out: dev_put(dev); return ret; } int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; int ret = -EOPNOTSUPP; u8 type; u32 channels; u8 duration; u8 page; if (!info->attrs[IEEE802154_ATTR_SCAN_TYPE] || !info->attrs[IEEE802154_ATTR_CHANNELS] || !info->attrs[IEEE802154_ATTR_DURATION]) return -EINVAL; dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (!ieee802154_mlme_ops(dev)->scan_req) goto out; type = nla_get_u8(info->attrs[IEEE802154_ATTR_SCAN_TYPE]); channels = nla_get_u32(info->attrs[IEEE802154_ATTR_CHANNELS]); duration = nla_get_u8(info->attrs[IEEE802154_ATTR_DURATION]); page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0); ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels, page, duration); out: dev_put(dev); return ret; } int ieee802154_list_iface(struct sk_buff *skb, struct genl_info *info) { /* Request for interface name, index, type, IEEE address, * PAN Id, short address */ struct sk_buff *msg; struct 
net_device *dev = NULL; int rc = -ENOBUFS; pr_debug("%s\n", __func__); dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) goto out_dev; rc = ieee802154_nl_fill_iface(msg, info->snd_portid, info->snd_seq, 0, dev); if (rc < 0) goto out_free; dev_put(dev); return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); out_dev: dev_put(dev); return rc; } int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct net_device *dev; int idx; int s_idx = cb->args[0]; pr_debug("%s\n", __func__); idx = 0; for_each_netdev(net, dev) { if (idx < s_idx || dev->type != ARPHRD_IEEE802154) goto cont; if (ieee802154_nl_fill_iface(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, dev) < 0) break; cont: idx++; } cb->args[0] = idx; return skb->len; } int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev = NULL; struct ieee802154_mlme_ops *ops; struct ieee802154_mac_params params; struct wpan_phy *phy; int rc = -EINVAL; pr_debug("%s\n", __func__); dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; ops = ieee802154_mlme_ops(dev); if (!ops->get_mac_params || !ops->set_mac_params) { rc = -EOPNOTSUPP; goto out; } if (netif_running(dev)) { rc = -EBUSY; goto out; } if (!info->attrs[IEEE802154_ATTR_LBT_ENABLED] && !info->attrs[IEEE802154_ATTR_CCA_MODE] && !info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL] && !info->attrs[IEEE802154_ATTR_CSMA_RETRIES] && !info->attrs[IEEE802154_ATTR_CSMA_MIN_BE] && !info->attrs[IEEE802154_ATTR_CSMA_MAX_BE] && !info->attrs[IEEE802154_ATTR_FRAME_RETRIES]) goto out; phy = dev->ieee802154_ptr->wpan_phy; get_device(&phy->dev); rtnl_lock(); ops->get_mac_params(dev, &params); if (info->attrs[IEEE802154_ATTR_TXPOWER]) params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]) * 100; if (info->attrs[IEEE802154_ATTR_LBT_ENABLED]) params.lbt = 
nla_get_u8(info->attrs[IEEE802154_ATTR_LBT_ENABLED]); if (info->attrs[IEEE802154_ATTR_CCA_MODE]) params.cca.mode = nla_get_u8(info->attrs[IEEE802154_ATTR_CCA_MODE]); if (info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) * 100; if (info->attrs[IEEE802154_ATTR_CSMA_RETRIES]) params.csma_retries = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_RETRIES]); if (info->attrs[IEEE802154_ATTR_CSMA_MIN_BE]) params.min_be = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_MIN_BE]); if (info->attrs[IEEE802154_ATTR_CSMA_MAX_BE]) params.max_be = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_MAX_BE]); if (info->attrs[IEEE802154_ATTR_FRAME_RETRIES]) params.frame_retries = nla_get_s8(info->attrs[IEEE802154_ATTR_FRAME_RETRIES]); rc = ops->set_mac_params(dev, &params); rtnl_unlock(); wpan_phy_put(phy); dev_put(dev); return 0; out: dev_put(dev); return rc; } static int ieee802154_llsec_parse_key_id(struct genl_info *info, struct ieee802154_llsec_key_id *desc) { memset(desc, 0, sizeof(*desc)); if (!info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]) return -EINVAL; desc->mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]); if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) { if (!info->attrs[IEEE802154_ATTR_PAN_ID]) return -EINVAL; desc->device_addr.pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]); if (info->attrs[IEEE802154_ATTR_SHORT_ADDR]) { desc->device_addr.mode = IEEE802154_ADDR_SHORT; desc->device_addr.short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]); } else { if (!info->attrs[IEEE802154_ATTR_HW_ADDR]) return -EINVAL; desc->device_addr.mode = IEEE802154_ADDR_LONG; desc->device_addr.extended_addr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); } } if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT && !info->attrs[IEEE802154_ATTR_LLSEC_KEY_ID]) return -EINVAL; if (desc->mode == IEEE802154_SCF_KEY_SHORT_INDEX && !info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT]) return -EINVAL; if 
(desc->mode == IEEE802154_SCF_KEY_HW_INDEX && !info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED]) return -EINVAL; if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT) desc->id = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_ID]); switch (desc->mode) { case IEEE802154_SCF_KEY_SHORT_INDEX: { u32 source = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT]); desc->short_source = cpu_to_le32(source); break; } case IEEE802154_SCF_KEY_HW_INDEX: desc->extended_source = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED]); break; } return 0; } static int ieee802154_llsec_fill_key_id(struct sk_buff *msg, const struct ieee802154_llsec_key_id *desc) { if (nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_MODE, desc->mode)) return -EMSGSIZE; if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) { if (nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, desc->device_addr.pan_id)) return -EMSGSIZE; if (desc->device_addr.mode == IEEE802154_ADDR_SHORT && nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, desc->device_addr.short_addr)) return -EMSGSIZE; if (desc->device_addr.mode == IEEE802154_ADDR_LONG && nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->device_addr.extended_addr, IEEE802154_ATTR_PAD)) return -EMSGSIZE; } if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT && nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_ID, desc->id)) return -EMSGSIZE; if (desc->mode == IEEE802154_SCF_KEY_SHORT_INDEX && nla_put_u32(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT, le32_to_cpu(desc->short_source))) return -EMSGSIZE; if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX && nla_put_hwaddr(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED, desc->extended_source, IEEE802154_ATTR_PAD)) return -EMSGSIZE; return 0; } int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct net_device *dev = NULL; int rc = -ENOBUFS; struct ieee802154_mlme_ops *ops; void *hdr; struct ieee802154_llsec_params params; pr_debug("%s\n", __func__); dev = 
ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; ops = ieee802154_mlme_ops(dev); if (!ops->llsec) { rc = -EOPNOTSUPP; goto out_dev; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) goto out_dev; hdr = genlmsg_put(msg, 0, info->snd_seq, &nl802154_family, 0, IEEE802154_LLSEC_GETPARAMS); if (!hdr) goto out_free; rc = ops->llsec->get_params(dev, &params); if (rc < 0) goto out_free; if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_ENABLED, params.enabled) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVEL, params.out_level) || nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, be32_to_cpu(params.frame_counter)) || ieee802154_llsec_fill_key_id(msg, &params.out_key)) { rc = -ENOBUFS; goto out_free; } dev_put(dev); return ieee802154_nl_reply(msg, info); out_free: nlmsg_free(msg); out_dev: dev_put(dev); return rc; } int ieee802154_llsec_setparams(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev = NULL; int rc = -EINVAL; struct ieee802154_mlme_ops *ops; struct ieee802154_llsec_params params; int changed = 0; pr_debug("%s\n", __func__); dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (!info->attrs[IEEE802154_ATTR_LLSEC_ENABLED] && !info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE] && !info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) goto out; ops = ieee802154_mlme_ops(dev); if (!ops->llsec) { rc = -EOPNOTSUPP; goto out; } if (info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL] && nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) > 7) goto out; if (info->attrs[IEEE802154_ATTR_LLSEC_ENABLED]) { params.enabled = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_ENABLED]); changed |= IEEE802154_LLSEC_PARAM_ENABLED; } if (info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]) { if (ieee802154_llsec_parse_key_id(info, &params.out_key)) goto out; changed |= IEEE802154_LLSEC_PARAM_OUT_KEY; } if (info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) { 
params.out_level = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]); changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL; } if (info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]) { u32 fc = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]); params.frame_counter = cpu_to_be32(fc); changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER; } rc = ops->llsec->set_params(dev, &params, changed); dev_put(dev); return rc; out: dev_put(dev); return rc; } struct llsec_dump_data { struct sk_buff *skb; int s_idx, s_idx2; int portid; int nlmsg_seq; struct net_device *dev; struct ieee802154_mlme_ops *ops; struct ieee802154_llsec_table *table; }; static int ieee802154_llsec_dump_table(struct sk_buff *skb, struct netlink_callback *cb, int (*step)(struct llsec_dump_data *)) { struct net *net = sock_net(skb->sk); struct net_device *dev; struct llsec_dump_data data; int idx = 0; int first_dev = cb->args[0]; int rc; for_each_netdev(net, dev) { if (idx < first_dev || dev->type != ARPHRD_IEEE802154) goto skip; data.ops = ieee802154_mlme_ops(dev); if (!data.ops->llsec) goto skip; data.skb = skb; data.s_idx = cb->args[1]; data.s_idx2 = cb->args[2]; data.dev = dev; data.portid = NETLINK_CB(cb->skb).portid; data.nlmsg_seq = cb->nlh->nlmsg_seq; data.ops->llsec->lock_table(dev); data.ops->llsec->get_table(data.dev, &data.table); rc = step(&data); data.ops->llsec->unlock_table(dev); if (rc < 0) break; skip: idx++; } cb->args[0] = idx; return skb->len; } static int ieee802154_nl_llsec_change(struct sk_buff *skb, struct genl_info *info, int (*fn)(struct net_device*, struct genl_info*)) { struct net_device *dev = NULL; int rc = -EINVAL; dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (!ieee802154_mlme_ops(dev)->llsec) rc = -EOPNOTSUPP; else rc = fn(dev, info); dev_put(dev); return rc; } static int ieee802154_llsec_parse_key(struct genl_info *info, struct ieee802154_llsec_key *key) { u8 frames; u32 commands[256 / 32]; memset(key, 0, sizeof(*key)); if 
(!info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES] || !info->attrs[IEEE802154_ATTR_LLSEC_KEY_BYTES]) return -EINVAL; frames = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES]); if ((frames & BIT(IEEE802154_FC_TYPE_MAC_CMD)) && !info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS]) return -EINVAL; if (info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS]) { nla_memcpy(commands, info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS], 256 / 8); if (commands[0] || commands[1] || commands[2] || commands[3] || commands[4] || commands[5] || commands[6] || commands[7] >= BIT(IEEE802154_CMD_GTS_REQ + 1)) return -EINVAL; key->cmd_frame_ids = commands[7]; } key->frame_types = frames; nla_memcpy(key->key, info->attrs[IEEE802154_ATTR_LLSEC_KEY_BYTES], IEEE802154_LLSEC_KEY_SIZE); return 0; } static int llsec_add_key(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_key key; struct ieee802154_llsec_key_id id; if (ieee802154_llsec_parse_key(info, &key) || ieee802154_llsec_parse_key_id(info, &id)) return -EINVAL; return ops->llsec->add_key(dev, &id, &key); } int ieee802154_llsec_add_key(struct sk_buff *skb, struct genl_info *info) { if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != (NLM_F_CREATE | NLM_F_EXCL)) return -EINVAL; return ieee802154_nl_llsec_change(skb, info, llsec_add_key); } static int llsec_remove_key(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_key_id id; if (ieee802154_llsec_parse_key_id(info, &id)) return -EINVAL; return ops->llsec->del_key(dev, &id); } int ieee802154_llsec_del_key(struct sk_buff *skb, struct genl_info *info) { return ieee802154_nl_llsec_change(skb, info, llsec_remove_key); } static int ieee802154_nl_fill_key(struct sk_buff *msg, u32 portid, u32 seq, const struct ieee802154_llsec_key_entry *key, const struct net_device *dev) { void *hdr; 
u32 commands[256 / 32]; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, IEEE802154_LLSEC_LIST_KEY); if (!hdr) goto out; if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || ieee802154_llsec_fill_key_id(msg, &key->id) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES, key->key->frame_types)) goto nla_put_failure; if (key->key->frame_types & BIT(IEEE802154_FC_TYPE_MAC_CMD)) { memset(commands, 0, sizeof(commands)); commands[7] = key->key->cmd_frame_ids; if (nla_put(msg, IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS, sizeof(commands), commands)) goto nla_put_failure; } if (nla_put(msg, IEEE802154_ATTR_LLSEC_KEY_BYTES, IEEE802154_LLSEC_KEY_SIZE, key->key->key)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); out: return -EMSGSIZE; } static int llsec_iter_keys(struct llsec_dump_data *data) { struct ieee802154_llsec_key_entry *pos; int rc = 0, idx = 0; list_for_each_entry(pos, &data->table->keys, list) { if (idx++ < data->s_idx) continue; if (ieee802154_nl_fill_key(data->skb, data->portid, data->nlmsg_seq, pos, data->dev)) { rc = -EMSGSIZE; break; } data->s_idx++; } return rc; } int ieee802154_llsec_dump_keys(struct sk_buff *skb, struct netlink_callback *cb) { return ieee802154_llsec_dump_table(skb, cb, llsec_iter_keys); } static int llsec_parse_dev(struct genl_info *info, struct ieee802154_llsec_device *dev) { memset(dev, 0, sizeof(*dev)); if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER] || !info->attrs[IEEE802154_ATTR_HW_ADDR] || !info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] || !info->attrs[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] || (!!info->attrs[IEEE802154_ATTR_PAN_ID] != !!info->attrs[IEEE802154_ATTR_SHORT_ADDR])) return -EINVAL; if (info->attrs[IEEE802154_ATTR_PAN_ID]) { dev->pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]); dev->short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]); } 
else { dev->short_addr = cpu_to_le16(IEEE802154_ADDR_UNDEF); } dev->hwaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); dev->frame_counter = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]); dev->seclevel_exempt = !!nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]); dev->key_mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE]); if (dev->key_mode >= __IEEE802154_LLSEC_DEVKEY_MAX) return -EINVAL; return 0; } static int llsec_add_dev(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_device desc; if (llsec_parse_dev(info, &desc)) return -EINVAL; return ops->llsec->add_dev(dev, &desc); } int ieee802154_llsec_add_dev(struct sk_buff *skb, struct genl_info *info) { if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != (NLM_F_CREATE | NLM_F_EXCL)) return -EINVAL; return ieee802154_nl_llsec_change(skb, info, llsec_add_dev); } static int llsec_del_dev(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); __le64 devaddr; if (!info->attrs[IEEE802154_ATTR_HW_ADDR]) return -EINVAL; devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); return ops->llsec->del_dev(dev, devaddr); } int ieee802154_llsec_del_dev(struct sk_buff *skb, struct genl_info *info) { return ieee802154_nl_llsec_change(skb, info, llsec_del_dev); } static int ieee802154_nl_fill_dev(struct sk_buff *msg, u32 portid, u32 seq, const struct ieee802154_llsec_device *desc, const struct net_device *dev) { void *hdr; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, IEEE802154_LLSEC_LIST_DEV); if (!hdr) goto out; if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, desc->pan_id) || nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, desc->short_addr) || nla_put_hwaddr(msg, 
IEEE802154_ATTR_HW_ADDR, desc->hwaddr, IEEE802154_ATTR_PAD) || nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, desc->frame_counter) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE, desc->seclevel_exempt) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_KEY_MODE, desc->key_mode)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); out: return -EMSGSIZE; } static int llsec_iter_devs(struct llsec_dump_data *data) { struct ieee802154_llsec_device *pos; int rc = 0, idx = 0; list_for_each_entry(pos, &data->table->devices, list) { if (idx++ < data->s_idx) continue; if (ieee802154_nl_fill_dev(data->skb, data->portid, data->nlmsg_seq, pos, data->dev)) { rc = -EMSGSIZE; break; } data->s_idx++; } return rc; } int ieee802154_llsec_dump_devs(struct sk_buff *skb, struct netlink_callback *cb) { return ieee802154_llsec_dump_table(skb, cb, llsec_iter_devs); } static int llsec_add_devkey(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_device_key key; __le64 devaddr; if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER] || !info->attrs[IEEE802154_ATTR_HW_ADDR] || ieee802154_llsec_parse_key_id(info, &key.key_id)) return -EINVAL; devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); key.frame_counter = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]); return ops->llsec->add_devkey(dev, devaddr, &key); } int ieee802154_llsec_add_devkey(struct sk_buff *skb, struct genl_info *info) { if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != (NLM_F_CREATE | NLM_F_EXCL)) return -EINVAL; return ieee802154_nl_llsec_change(skb, info, llsec_add_devkey); } static int llsec_del_devkey(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_device_key key; __le64 devaddr; if (!info->attrs[IEEE802154_ATTR_HW_ADDR] || ieee802154_llsec_parse_key_id(info, 
&key.key_id)) return -EINVAL; devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); return ops->llsec->del_devkey(dev, devaddr, &key); } int ieee802154_llsec_del_devkey(struct sk_buff *skb, struct genl_info *info) { return ieee802154_nl_llsec_change(skb, info, llsec_del_devkey); } static int ieee802154_nl_fill_devkey(struct sk_buff *msg, u32 portid, u32 seq, __le64 devaddr, const struct ieee802154_llsec_device_key *devkey, const struct net_device *dev) { void *hdr; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, IEEE802154_LLSEC_LIST_DEVKEY); if (!hdr) goto out; if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr, IEEE802154_ATTR_PAD) || nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, devkey->frame_counter) || ieee802154_llsec_fill_key_id(msg, &devkey->key_id)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); out: return -EMSGSIZE; } static int llsec_iter_devkeys(struct llsec_dump_data *data) { struct ieee802154_llsec_device *dpos; struct ieee802154_llsec_device_key *kpos; int idx = 0, idx2; list_for_each_entry(dpos, &data->table->devices, list) { if (idx++ < data->s_idx) continue; idx2 = 0; list_for_each_entry(kpos, &dpos->keys, list) { if (idx2++ < data->s_idx2) continue; if (ieee802154_nl_fill_devkey(data->skb, data->portid, data->nlmsg_seq, dpos->hwaddr, kpos, data->dev)) { return -EMSGSIZE; } data->s_idx2++; } data->s_idx++; } return 0; } int ieee802154_llsec_dump_devkeys(struct sk_buff *skb, struct netlink_callback *cb) { return ieee802154_llsec_dump_table(skb, cb, llsec_iter_devkeys); } static int llsec_parse_seclevel(struct genl_info *info, struct ieee802154_llsec_seclevel *sl) { memset(sl, 0, sizeof(*sl)); if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_TYPE] || !info->attrs[IEEE802154_ATTR_LLSEC_SECLEVELS] || 
!info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]) return -EINVAL; sl->frame_type = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_TYPE]); if (sl->frame_type == IEEE802154_FC_TYPE_MAC_CMD) { if (!info->attrs[IEEE802154_ATTR_LLSEC_CMD_FRAME_ID]) return -EINVAL; sl->cmd_frame_id = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_CMD_FRAME_ID]); } sl->sec_levels = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVELS]); sl->device_override = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]); return 0; } static int llsec_add_seclevel(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_seclevel sl; if (llsec_parse_seclevel(info, &sl)) return -EINVAL; return ops->llsec->add_seclevel(dev, &sl); } int ieee802154_llsec_add_seclevel(struct sk_buff *skb, struct genl_info *info) { if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != (NLM_F_CREATE | NLM_F_EXCL)) return -EINVAL; return ieee802154_nl_llsec_change(skb, info, llsec_add_seclevel); } static int llsec_del_seclevel(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_seclevel sl; if (llsec_parse_seclevel(info, &sl)) return -EINVAL; return ops->llsec->del_seclevel(dev, &sl); } int ieee802154_llsec_del_seclevel(struct sk_buff *skb, struct genl_info *info) { return ieee802154_nl_llsec_change(skb, info, llsec_del_seclevel); } static int ieee802154_nl_fill_seclevel(struct sk_buff *msg, u32 portid, u32 seq, const struct ieee802154_llsec_seclevel *sl, const struct net_device *dev) { void *hdr; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, IEEE802154_LLSEC_LIST_SECLEVEL); if (!hdr) goto out; if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_FRAME_TYPE, sl->frame_type) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVELS, 
sl->sec_levels) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE, sl->device_override)) goto nla_put_failure; if (sl->frame_type == IEEE802154_FC_TYPE_MAC_CMD && nla_put_u8(msg, IEEE802154_ATTR_LLSEC_CMD_FRAME_ID, sl->cmd_frame_id)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); out: return -EMSGSIZE; } static int llsec_iter_seclevels(struct llsec_dump_data *data) { struct ieee802154_llsec_seclevel *pos; int rc = 0, idx = 0; list_for_each_entry(pos, &data->table->security_levels, list) { if (idx++ < data->s_idx) continue; if (ieee802154_nl_fill_seclevel(data->skb, data->portid, data->nlmsg_seq, pos, data->dev)) { rc = -EMSGSIZE; break; } data->s_idx++; } return rc; } int ieee802154_llsec_dump_seclevels(struct sk_buff *skb, struct netlink_callback *cb) { return ieee802154_llsec_dump_table(skb, cb, llsec_iter_seclevels); }
3 6 6 1 4 5 5 2 4 9 9 7 5 5 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 // SPDX-License-Identifier: GPL-2.0-or-later /* * Glue Code for the AVX assembler implementation of the Cast5 Cipher * * Copyright (C) 2012 Johannes Goetzfried * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> */ #include <crypto/algapi.h> #include <crypto/cast5.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> #include <linux/types.h> #include "ecb_cbc_helpers.h" #define CAST5_PARALLEL_BLOCKS 16 asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src); static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return cast5_setkey(&tfm->base, key, keylen); } static int ecb_encrypt(struct skcipher_request *req) { ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_enc_16way); ECB_BLOCK(1, __cast5_encrypt); ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_dec_16way); ECB_BLOCK(1, __cast5_decrypt); ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { CBC_WALK_START(req, CAST5_BLOCK_SIZE, -1); CBC_ENC_BLOCK(__cast5_encrypt); CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { CBC_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS); CBC_DEC_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_cbc_dec_16way); 
CBC_DEC_BLOCK(1, __cast5_decrypt); CBC_WALK_END(); } static struct skcipher_alg cast5_algs[] = { { .base.cra_name = "ecb(cast5)", .base.cra_driver_name = "ecb-cast5-avx", .base.cra_priority = 200, .base.cra_blocksize = CAST5_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast5_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CAST5_MIN_KEY_SIZE, .max_keysize = CAST5_MAX_KEY_SIZE, .setkey = cast5_setkey_skcipher, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base.cra_name = "cbc(cast5)", .base.cra_driver_name = "cbc-cast5-avx", .base.cra_priority = 200, .base.cra_blocksize = CAST5_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast5_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CAST5_MIN_KEY_SIZE, .max_keysize = CAST5_MAX_KEY_SIZE, .ivsize = CAST5_BLOCK_SIZE, .setkey = cast5_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, } }; static int __init cast5_init(void) { const char *feature_name; if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return crypto_register_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs)); } static void __exit cast5_exit(void) { crypto_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs)); } module_init(cast5_init); module_exit(cast5_exit); MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("cast5");
2403 2400 5 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 
1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 
1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 
2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 
2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ipoib.h"

#include <linux/module.h>

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>

#include <linux/if_arp.h>	/* For ARPHRD_xxx */

#include <linux/ip.h>
#include <linux/in.h>

#include <linux/jhash.h>
#include <net/arp.h>
#include <net/addrconf.h>
#include <net/netdev_lock.h>
#include <net/pkt_sched.h>
#include <linux/inetdevice.h>
#include <rdma/ib_cache.h>

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");

/*
 * Send/receive queue sizes. They default to IPOIB_TX_RING_SIZE and
 * IPOIB_RX_RING_SIZE and are exported as the module parameters
 * "send_queue_size" and "recv_queue_size" with mode 0444.
 */
int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;

module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/*
 * Debug tracing level, exported as the "debug_level" module parameter with
 * mode 0644. Only built when CONFIG_INFINIBAND_IPOIB_DEBUG is set.
 */
int ipoib_debug_level;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif

/*
 * Cursor used by the ipoib_path_iter_init()/_next()/_read() helpers:
 * @dev:  net_device whose path tree is walked
 * @path: private copy of the most recently visited path entry
 */
struct ipoib_path_iter {
	struct net_device  *dev;
	struct ipoib_path  path;
};

/*
 * 20-byte hardware address constant.
 * NOTE(review): presumably the IPoIB IPv4 broadcast address (going by the
 * name); it is not referenced in this part of the file -- confirm at the
 * point of use.
 */
static const u8 ipv4_bcast_addr[] = {
	0x00, 0xff, 0xff, 0xff,
	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
};

/*
 * Driver-wide workqueue (used in this file to run the child interface
 * up/down work items) and SA client handle (used for path record queries).
 */
struct workqueue_struct *ipoib_workqueue;

struct ib_sa_client ipoib_sa_client;

/* Forward declarations for functions referenced before their definition. */
static int ipoib_add_one(struct ib_device *device);
static void
ipoib_remove_one(struct ib_device *device, void *client_data); static void ipoib_neigh_reclaim(struct rcu_head *rp); static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); static int ipoib_set_mac(struct net_device *dev, void *addr); static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one, .get_net_dev_by_params = ipoib_get_net_dev_by_params, }; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG static int ipoib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netdev_notifier_info *ni = ptr; struct net_device *dev = ni->dev; if (dev->netdev_ops->ndo_open != ipoib_open) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: ipoib_create_debug_files(dev); break; case NETDEV_CHANGENAME: ipoib_delete_debug_files(dev); ipoib_create_debug_files(dev); break; case NETDEV_UNREGISTER: ipoib_delete_debug_files(dev); break; } return NOTIFY_DONE; } #endif struct ipoib_ifupdown_work { struct work_struct work; struct net_device *dev; netdevice_tracker dev_tracker; bool up; }; static void ipoib_ifupdown_task(struct work_struct *work) { struct ipoib_ifupdown_work *pwork = container_of(work, struct ipoib_ifupdown_work, work); struct net_device *dev = pwork->dev; unsigned int flags; rtnl_lock(); flags = dev->flags; if (pwork->up) flags |= IFF_UP; else flags &= ~IFF_UP; if (dev->flags != flags) dev_change_flags(dev, flags, NULL); rtnl_unlock(); netdev_put(dev, &pwork->dev_tracker); kfree(pwork); } static void ipoib_schedule_ifupdown_task(struct net_device *dev, bool up) { struct ipoib_ifupdown_work *work; if ((up && (dev->flags & IFF_UP)) || (!up && !(dev->flags & IFF_UP))) return; work = kmalloc_obj(*work); if (!work) return; work->dev = dev; netdev_hold(dev, &work->dev_tracker, GFP_KERNEL); work->up = up; 
INIT_WORK(&work->work, ipoib_ifupdown_task); queue_work(ipoib_workqueue, &work->work); } int ipoib_open(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "bringing up interface\n"); netif_carrier_off(dev); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_ib_dev_open(dev)) { if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; goto err_disable; } ipoib_ib_dev_up(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ netdev_lock_ops_to_full(dev); list_for_each_entry(cpriv, &priv->child_intfs, list) ipoib_schedule_ifupdown_task(cpriv->dev, true); netdev_unlock_full_to_ops(dev); } else if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); if (!test_bit(IPOIB_FLAG_ADMIN_UP, &ppriv->flags)) ipoib_dbg(priv, "parent device %s is not up, so child device may be not functioning.\n", ppriv->dev->name); } netif_start_queue(dev); return 0; err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; } static int ipoib_stop(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); netif_stop_queue(dev); ipoib_ib_dev_down(dev); ipoib_ib_dev_stop(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ netdev_lock_ops_to_full(dev); list_for_each_entry(cpriv, &priv->child_intfs, list) ipoib_schedule_ifupdown_task(cpriv->dev, false); netdev_unlock_full_to_ops(dev); } return 0; } static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO); return features; } static int ipoib_change_mtu(struct net_device *dev, int new_mtu) { struct ipoib_dev_priv *priv = ipoib_priv(dev); 
int ret = 0; /* dev->mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(dev)) { if (new_mtu > ipoib_cm_max_mtu(dev)) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); WRITE_ONCE(dev->mtu, new_mtu); return 0; } if (new_mtu < (ETH_MIN_MTU + IPOIB_ENCAP_LEN) || new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; priv->admin_mtu = new_mtu; if (priv->mcast_mtu < priv->admin_mtu) ipoib_dbg(priv, "MTU must be smaller than the underlying " "link layer MTU - 4 (%u)\n", priv->mcast_mtu); new_mtu = min(priv->mcast_mtu, priv->admin_mtu); if (priv->rn_ops->ndo_change_mtu) { bool carrier_status = netif_carrier_ok(dev); netif_carrier_off(dev); /* notify lower level on the real mtu */ ret = priv->rn_ops->ndo_change_mtu(dev, new_mtu); if (carrier_status) netif_carrier_on(dev); } else { WRITE_ONCE(dev->mtu, new_mtu); } return ret; } static void ipoib_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (priv->rn_ops->ndo_get_stats64) priv->rn_ops->ndo_get_stats64(dev, stats); else netdev_stats_to_stats64(stats, &dev->stats); } /* Called with an RCU read lock taken */ static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr, struct net_device *dev) { struct net *net = dev_net(dev); struct in_device *in_dev; struct sockaddr_in *addr_in = (struct sockaddr_in *)addr; struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr; __be32 ret_addr; switch (addr->sa_family) { case AF_INET: in_dev = in_dev_get(dev); if (!in_dev) return false; ret_addr = inet_confirm_addr(net, in_dev, 0, addr_in->sin_addr.s_addr, RT_SCOPE_HOST); in_dev_put(in_dev); if (ret_addr) return true; break; case AF_INET6: if (IS_ENABLED(CONFIG_IPV6) && ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1)) return true; break; } return false; } /* * Find the L2 master net_device on top of the given net_device. 
* @dev: base IPoIB net_device * * Returns the L2 master net_device with reference held if the L2 master * exists (such as bond netdevice), or returns same netdev with reference * held when master does not exist or when L3 master (such as VRF netdev). */ static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) { struct net_device *master; rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); if (!master || netif_is_l3_master(master)) master = dev; dev_hold(master); rcu_read_unlock(); return master; } struct ipoib_walk_data { const struct sockaddr *addr; struct net_device *result; }; static int ipoib_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data; int ret = 0; if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) { dev_hold(upper); data->result = upper; ret = 1; } return ret; } /** * ipoib_get_net_dev_match_addr - Find a net_device matching * the given address, which is an upper device of the given net_device. * * @addr: IP address to look for. * @dev: base IPoIB net_device * * If found, returns the net_device with a reference held. Otherwise return * NULL. */ static struct net_device *ipoib_get_net_dev_match_addr( const struct sockaddr *addr, struct net_device *dev) { struct netdev_nested_priv priv; struct ipoib_walk_data data = { .addr = addr, }; priv.data = (void *)&data; rcu_read_lock(); if (ipoib_is_dev_match_addr_rcu(addr, dev)) { dev_hold(dev); data.result = dev; goto out; } netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv); out: rcu_read_unlock(); return data.result; } /* returns the number of IPoIB netdevs on top a given ipoib device matching a * pkey_index and address, if one exists. * * @found_net_dev: contains a matching net_device if the return value >= 1, * with a reference held. 
*/
/*
 * A device matches when its pkey_index equals @pkey_index and @gid is either
 * NULL or equal to the device's local GID. With @addr == NULL the master
 * net_device is used as the candidate; otherwise the candidate is the device
 * (or one of its uppers) that owns @addr. Only the first match is stored in
 * @found_net_dev; references on later matches are dropped right away.
 *
 * Unless @priv is a sub-interface, its children are searched recursively
 * under the device's netdev lock, stopping as soon as more than one match
 * has been counted. @nesting is only passed on (incremented) to the
 * recursive calls.
 */
static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
				     const union ib_gid *gid,
				     u16 pkey_index,
				     const struct sockaddr *addr,
				     int nesting,
				     struct net_device **found_net_dev)
{
	struct ipoib_dev_priv *child;
	struct net_device *candidate = NULL;
	int count = 0;

	if (priv->pkey_index == pkey_index &&
	    (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
		/* Verify the net_device matches the IP address, as
		 * IPoIB child devices currently share a GID.
		 */
		candidate = addr ?
			ipoib_get_net_dev_match_addr(addr, priv->dev) :
			ipoib_get_master_net_dev(priv->dev);
	}

	if (candidate) {
		if (*found_net_dev)
			dev_put(candidate);
		else
			*found_net_dev = candidate;
		count++;
	}

	if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
		return count;

	/* Check child interfaces */
	netdev_lock(priv->dev);
	list_for_each_entry(child, &priv->child_intfs, list) {
		count += ipoib_match_gid_pkey_addr(child, gid, pkey_index,
						   addr, nesting + 1,
						   found_net_dev);
		if (count > 1)
			break;
	}
	netdev_unlock(priv->dev);

	return count;
}

/* Returns the number of matching net_devs found (between 0 and 2).
Also * return the matching net_device in the @net_dev parameter, holding a * reference to the net_device, if the number of matches >= 1 */ static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u32 port, u16 pkey_index, const union ib_gid *gid, const struct sockaddr *addr, struct net_device **net_dev) { struct ipoib_dev_priv *priv; int matches = 0; *net_dev = NULL; list_for_each_entry(priv, dev_list, list) { if (priv->port != port) continue; matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index, addr, 0, net_dev); if (matches > 1) break; } return matches; } static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data) { struct net_device *net_dev; struct list_head *dev_list = client_data; u16 pkey_index; int matches; int ret; if (!rdma_protocol_ib(dev, port)) return NULL; ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); if (ret) return NULL; /* See if we can find a unique device matching the pkey and GID */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, NULL, &net_dev); switch (matches) { case 0: return NULL; case 1: return net_dev; } dev_put(net_dev); /* Couldn't find a unique device with pkey and GID only. 
Use L3 * address to uniquely match the net device */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, addr, &net_dev); switch (matches) { case 0: return NULL; default: dev_warn_ratelimited(&dev->dev, "duplicate IP address detected\n"); fallthrough; case 1: return net_dev; } } int ipoib_set_mode(struct net_device *dev, const char *buf) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && !strcmp(buf, "connected\n")) || (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && !strcmp(buf, "datagram\n"))) { return 0; } /* flush paths if we switch modes so that connections are restarted */ if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); ipoib_warn(priv, "enabling connected mode " "will cause multicast packet drops\n"); netdev_lock_ops(dev); netdev_update_features(dev); netif_set_mtu(dev, ipoib_cm_max_mtu(dev)); netif_set_real_num_tx_queues(dev, 1); netdev_unlock_ops(dev); rtnl_unlock(); priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; ipoib_flush_paths(dev); return (!rtnl_trylock()) ? -EBUSY : 0; } if (!strcmp(buf, "datagram\n")) { clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); netdev_lock_ops(dev); netdev_update_features(dev); netif_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); netif_set_real_num_tx_queues(dev, dev->num_tx_queues); netdev_unlock_ops(dev); rtnl_unlock(); ipoib_flush_paths(dev); return (!rtnl_trylock()) ? 
-EBUSY : 0; } return -EINVAL; } struct ipoib_path *__path_find(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; while (n) { path = rb_entry(n, struct ipoib_path, rb_node); ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return path; } return NULL; } static int __path_add(struct net_device *dev, struct ipoib_path *path) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; int ret; while (*n) { pn = *n; tpath = rb_entry(pn, struct ipoib_path, rb_node); ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&path->rb_node, pn, n); rb_insert_color(&path->rb_node, &priv->path_tree); list_add_tail(&path->list, &priv->path_list); return 0; } static void path_free(struct net_device *dev, struct ipoib_path *path) { struct sk_buff *skb; while ((skb = __skb_dequeue(&path->queue))) dev_kfree_skb_irq(skb); ipoib_dbg(ipoib_priv(dev), "%s\n", __func__); /* remove all neigh connected to this path */ ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); if (path->ah) ipoib_put_ah(path->ah); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) { struct ipoib_path_iter *iter; iter = kmalloc_obj(*iter); if (!iter) return NULL; iter->dev = dev; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_path_iter_next(struct ipoib_path_iter *iter) { struct ipoib_dev_priv *priv = ipoib_priv(iter->dev); struct rb_node *n; struct ipoib_path *path; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->path_tree); while (n) 
{
		path = rb_entry(n, struct ipoib_path, rb_node);

		/*
		 * Stop at the first tree entry whose DGID compares greater
		 * than the DGID currently stored in the iterator and copy
		 * that entry into the iterator.
		 */
		if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
			   sizeof (union ib_gid)) < 0) {
			iter->path = *path;
			ret = 0;
			break;
		}

		n = rb_next(n);
	}

	spin_unlock_irq(&priv->lock);

	/* 0: iterator advanced; 1: no further entry was found. */
	return ret;
}

/* Copy the iterator's current path snapshot into @path. */
void ipoib_path_iter_read(struct ipoib_path_iter *iter,
			  struct ipoib_path *path)
{
	*path = iter->path;
}

#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */

/*
 * Mark the address handle of every path on the device's path list as not
 * valid (ah->valid = 0). Paths without an address handle are only logged.
 * Runs under priv->lock with interrupts disabled.
 */
void ipoib_mark_paths_invalid(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_path *path, *tp;

	spin_lock_irq(&priv->lock);

	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
		ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n",
			  be32_to_cpu(sa_path_get_dlid(&path->pathrec)),
			  path->pathrec.dgid.raw);
		if (path->ah)
			path->ah->valid = 0;
	}

	spin_unlock_irq(&priv->lock);
}

/*
 * Prepend an ipoib_pseudo_header to @skb and copy INFINIBAND_ALEN bytes of
 * @daddr into its hwaddr field.
 */
static void push_pseudo_header(struct sk_buff *skb, const char *daddr)
{
	struct ipoib_pseudo_header *phdr;

	phdr = skb_push(skb, sizeof(*phdr));
	memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
}

/*
 * Remove and free every path of the device.
 *
 * All paths are first moved to a local list and erased from the path tree
 * while holding the TX lock and priv->lock. Each path is then handled in
 * turn: an outstanding SA query is cancelled, both locks are dropped,
 * the function waits on path->done, frees the path, and re-takes the locks
 * before moving on to the next entry.
 */
void ipoib_flush_paths(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_path *path, *tp;
	LIST_HEAD(remove_list);
	unsigned long flags;

	netif_tx_lock_bh(dev);
	spin_lock_irqsave(&priv->lock, flags);

	/* Detach the whole list so the device no longer references any path. */
	list_splice_init(&priv->path_list, &remove_list);

	list_for_each_entry(path, &remove_list, list)
		rb_erase(&path->rb_node, &priv->path_tree);

	list_for_each_entry_safe(path, tp, &remove_list, list) {
		if (path->query)
			ib_sa_cancel_query(path->query_id, path->query);
		/* wait_for_completion() is called with both locks released. */
		spin_unlock_irqrestore(&priv->lock, flags);
		netif_tx_unlock_bh(dev);
		wait_for_completion(&path->done);
		path_free(dev, path);
		netif_tx_lock_bh(dev);
		spin_lock_irqsave(&priv->lock, flags);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
}

/*
 * Completion callback handed to ib_sa_path_rec_get() by path_rec_start().
 *
 * @status:   0 on success; any other value is only logged
 * @pathrec:  path record returned by the query
 * @num_prs:  not used by this function
 * @path_ptr: the struct ipoib_path the query was started for
 *
 * On success an address handle is created from @pathrec and installed on the
 * path and on every neighbour attached to it; all skbs that were queued on
 * the path and its neighbours are collected and re-submitted with
 * dev_queue_xmit() after the lock is dropped. When no usable address handle
 * could be created, the neighbours for the path's GID are deleted instead.
 * In every case path->query is cleared and path->done is completed.
 */
static void path_rec_completion(int status,
				struct sa_path_rec *pathrec,
				unsigned int num_prs, void *path_ptr)
{
	struct ipoib_path *path = path_ptr;
	struct net_device *dev = path->dev;
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_ah *ah = NULL;
	struct ipoib_ah *old_ah = NULL;
	struct ipoib_neigh *neigh, *tn;
	struct sk_buff_head skqueue;
	struct sk_buff *skb;
	unsigned long flags;

	if (!status)
		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
			  be32_to_cpu(sa_path_get_dlid(pathrec)),
			  pathrec->dgid.raw);
	else
		ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
			  status, path->pathrec.dgid.raw);

	/* Local queue of skbs to retransmit once priv->lock is released. */
	skb_queue_head_init(&skqueue);

	/* ah stays NULL when the query failed or the AH attributes can't be built. */
	if (!status) {
		struct rdma_ah_attr av;

		if (!ib_init_ah_attr_from_path(priv->ca, priv->port,
					       pathrec, &av, NULL)) {
			ah = ipoib_create_ah(dev, priv->pd, &av);
			rdma_destroy_ah_attr(&av);
		}
	}

	spin_lock_irqsave(&priv->lock, flags);

	if (!IS_ERR_OR_NULL(ah)) {
		/*
		 * pathrec.dgid is used as the database key from the LLADDR,
		 * it must remain unchanged even if the SA returns a different
		 * GID to use in the AH.
		 */
		if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw,
			   sizeof(union ib_gid))) {
			ipoib_dbg(
				priv,
				"%s got PathRec for gid %pI6 while asked for %pI6\n",
				dev->name, pathrec->dgid.raw,
				path->pathrec.dgid.raw);
			memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw,
			       sizeof(union ib_gid));
		}

		path->pathrec = *pathrec;

		/* Remember the previous AH; its reference is dropped after unlock. */
		old_ah   = path->ah;
		path->ah = ah;

		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
			  ah, be32_to_cpu(sa_path_get_dlid(pathrec)),
			  pathrec->sl);

		while ((skb = __skb_dequeue(&path->queue)))
			__skb_queue_tail(&skqueue, skb);

		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
			if (neigh->ah) {
				WARN_ON(neigh->ah != old_ah);
				/*
				 * Dropping the ah reference inside
				 * priv->lock is safe here, because we
				 * will hold one more reference from
				 * the original value of path->ah (ie
				 * old_ah).
				 */
				ipoib_put_ah(neigh->ah);
			}
			/* Each neighbour takes its own reference on the new AH. */
			kref_get(&path->ah->ref);
			neigh->ah = path->ah;

			if (ipoib_cm_enabled(dev, neigh->daddr)) {
				if (!ipoib_cm_get(neigh))
					ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
									       path,
									       neigh));
				/* Still no CM tx context: give up on this neighbour. */
				if (!ipoib_cm_get(neigh)) {
					ipoib_neigh_free(neigh);
					continue;
				}
			}

			while ((skb = __skb_dequeue(&neigh->queue)))
				__skb_queue_tail(&skqueue, skb);
		}
		path->ah->valid = 1;
	}

	/* Mark the query as finished and wake anyone waiting on path->done. */
	path->query = NULL;
	complete(&path->done);

	spin_unlock_irqrestore(&priv->lock, flags);

	if (IS_ERR_OR_NULL(ah))
		ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

	if (old_ah)
		ipoib_put_ah(old_ah);

	/* Re-submit the collected packets now that priv->lock is released. */
	while ((skb = __skb_dequeue(&skqueue))) {
		int ret;
		skb->dev = dev;
		ret = dev_queue_xmit(skb);
		if (ret)
			ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n",
				   __func__, ret);
	}
}

/*
 * (Re)initialise the query fields of @path's path record for destination
 * @gid: record type (OPA or IB depending on the port capability), DGID,
 * SGID, P_Key, numb_path and traffic class.
 *
 * Dereferences priv->broadcast without checking it; both callers in this
 * file verify that priv->broadcast is set beforehand.
 */
static void init_path_rec(struct ipoib_dev_priv *priv, struct ipoib_path *path,
			  void *gid)
{
	path->dev = priv->dev;

	if (rdma_cap_opa_ah(priv->ca, priv->port))
		path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA;
	else
		path->pathrec.rec_type = SA_PATH_REC_TYPE_IB;

	memcpy(path->pathrec.dgid.raw, gid, sizeof(union ib_gid));
	path->pathrec.sgid	    = priv->local_gid;
	path->pathrec.pkey	    = cpu_to_be16(priv->pkey);
	path->pathrec.numb_path     = 1;
	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
}

/*
 * Allocate (GFP_ATOMIC, zeroed) and initialise a new path for destination
 * @gid. The path is not inserted into the tree here.
 *
 * Returns NULL when the device has no broadcast group or allocation fails.
 */
static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_path *path;

	if (!priv->broadcast)
		return NULL;

	path = kzalloc_obj(*path, GFP_ATOMIC);
	if (!path)
		return NULL;

	skb_queue_head_init(&path->queue);

	INIT_LIST_HEAD(&path->neigh_list);

	init_path_rec(priv, path, gid);

	return path;
}

/*
 * Start an SA path record query for @path; path_rec_completion() is the
 * registered completion callback.
 */
static int path_rec_start(struct net_device *dev,
			  struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_dbg(priv, "Start path record lookup for %pI6\n",
		  path->pathrec.dgid.raw);

	init_completion(&path->done);

	path->query_id =
		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
				   &path->pathrec,
				   IB_SA_PATH_REC_DGID		|
IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; complete(&path->done); return path->query_id; } return 0; } static void neigh_refresh_path(struct ipoib_neigh *neigh, u8 *daddr, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); path = __path_find(dev, daddr + 4); if (!path) goto out; if (!path->query) path_rec_start(dev, path); out: spin_unlock_irqrestore(&priv->lock, flags); } static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_path *path; struct ipoib_neigh *neigh; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); neigh = ipoib_neigh_alloc(daddr, dev); if (!neigh) { spin_unlock_irqrestore(&priv->lock, flags); ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return NULL; } /* To avoid race condition, make sure that the * neigh will be added only once. 
*/ if (unlikely(!list_empty(&neigh->list))) { spin_unlock_irqrestore(&priv->lock, flags); return neigh; } path = __path_find(dev, daddr + 4); if (!path) { path = path_rec_create(dev, daddr + 4); if (!path) goto err_path; __path_add(dev, path); } list_add_tail(&neigh->list, &path->neigh_list); if (path->ah && path->ah->valid) { kref_get(&path->ah->ref); neigh->ah = path->ah; if (ipoib_cm_enabled(dev, neigh->daddr)) { if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { ipoib_neigh_free(neigh); goto err_drop; } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, neigh->daddr); __skb_queue_tail(&neigh->queue, skb); } else { ipoib_warn(priv, "queue length limit %d. Packet drop.\n", skb_queue_len(&neigh->queue)); goto err_drop; } } else { spin_unlock_irqrestore(&priv->lock, flags); path->ah->last_send = rn->send(dev, skb, path->ah->ah, IPOIB_QPN(daddr)); ipoib_neigh_put(neigh); return NULL; } } else { neigh->ah = NULL; if (!path->query && path_rec_start(dev, path)) goto err_path; if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, neigh->daddr); __skb_queue_tail(&neigh->queue, skb); } else { goto err_drop; } } spin_unlock_irqrestore(&priv->lock, flags); ipoib_neigh_put(neigh); return NULL; err_path: ipoib_neigh_free(neigh); err_drop: ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); spin_unlock_irqrestore(&priv->lock, flags); ipoib_neigh_put(neigh); return NULL; } static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, struct ipoib_pseudo_header *phdr) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_path *path; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); /* no broadcast means that all paths are (going to be) not valid */ if (!priv->broadcast) goto drop_and_unlock; path = __path_find(dev, phdr->hwaddr + 4); if (!path || !path->ah || !path->ah->valid) { if 
(!path) {
			path = path_rec_create(dev, phdr->hwaddr + 4);
			if (!path)
				goto drop_and_unlock;
			__path_add(dev, path);
		} else {
			/*
			 * make sure there are no changes in the existing
			 * path record
			 */
			init_path_rec(priv, path, phdr->hwaddr + 4);
		}
		/* path_rec_start() is only called when path->query is not set */
		if (!path->query && path_rec_start(dev, path)) {
			goto drop_and_unlock;
		}

		/* park the skb on the path queue (bounded by IPOIB_MAX_PATH_REC_QUEUE) */
		if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
			push_pseudo_header(skb, phdr->hwaddr);
			__skb_queue_tail(&path->queue, skb);
			goto unlock;
		} else {
			goto drop_and_unlock;
		}
	}

	/* path has a valid address handle: send with the lock released */
	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_dbg(priv, "Send unicast ARP to %08x\n",
		  be32_to_cpu(sa_path_get_dlid(&path->pathrec)));
	path->ah->last_send = rn->send(dev, skb, path->ah->ah,
				       IPOIB_QPN(phdr->hwaddr));
	return;

drop_and_unlock:
	++dev->stats.tx_dropped;
	dev_kfree_skb_any(skb);
unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
}

/*
 * Transmit entry point (installed as .ndo_start_xmit in the IPoIB
 * net_device_ops tables of this file).
 *
 * The pseudo header at the front of the skb is pulled off and used as the
 * destination hardware address.  Multicast destinations (hwaddr[4] == 0xff)
 * get the P_Key stamped into hwaddr[8..9] and go through an existing neigh
 * or ipoib_mcast_send().  Unicast packets are dispatched on header->proto:
 * IP/IPv6/TIPC use a neigh entry (created through neigh_add_path() when
 * missing), ARP/RARP go through unicast_arp_send(), anything else is
 * dropped and counted in tx_dropped.
 *
 * Every return statement returns NETDEV_TX_OK.
 */
static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct rdma_netdev *rn = netdev_priv(dev);
	struct ipoib_neigh *neigh;
	struct ipoib_pseudo_header *phdr;
	struct ipoib_header *header;
	unsigned long flags;

	/* phdr keeps pointing at the pulled bytes; header is what follows them */
	phdr = (struct ipoib_pseudo_header *) skb->data;
	skb_pull(skb, sizeof(*phdr));
	header = (struct ipoib_header *) skb->data;

	if (unlikely(phdr->hwaddr[4] == 0xff)) {
		/* multicast, arrange "if" according to probability */
		if ((header->proto != htons(ETH_P_IP)) &&
		    (header->proto != htons(ETH_P_IPV6)) &&
		    (header->proto != htons(ETH_P_ARP)) &&
		    (header->proto != htons(ETH_P_RARP)) &&
		    (header->proto != htons(ETH_P_TIPC))) {
			/* ethertype not supported by IPoIB */
			++dev->stats.tx_dropped;
			dev_kfree_skb_any(skb);
			return NETDEV_TX_OK;
		}
		/* Add in the P_Key for multicast */
		phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
		phdr->hwaddr[9] = priv->pkey & 0xff;

		neigh = ipoib_neigh_get(dev, phdr->hwaddr);
		if (likely(neigh))
			goto send_using_neigh;
		ipoib_mcast_send(dev, phdr->hwaddr, skb);
		return NETDEV_TX_OK;
	}

	/* unicast, arrange "switch" according to probability */
	switch
(header->proto) {
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
	case htons(ETH_P_TIPC):
		neigh = ipoib_neigh_get(dev, phdr->hwaddr);
		if (unlikely(!neigh)) {
			/* a NULL return means neigh_add_path() already disposed of the skb */
			neigh = neigh_add_path(skb, phdr->hwaddr, dev);
			if (likely(!neigh))
				return NETDEV_TX_OK;
		}
		break;
	case htons(ETH_P_ARP):
	case htons(ETH_P_RARP):
		/* for unicast ARP and RARP should always perform path find */
		unicast_arp_send(skb, dev, phdr);
		return NETDEV_TX_OK;
	default:
		/* ethertype not supported by IPoIB */
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

send_using_neigh:
	/* note we now hold a ref to neigh */
	if (ipoib_cm_get(neigh)) {
		/* connected mode: send only when the connection is up, else queue below */
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
		}
	} else if (neigh->ah && neigh->ah->valid) {
		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
						IPOIB_QPN(phdr->hwaddr));
		goto unref;
	} else if (neigh->ah) {
		/* address handle exists but is not valid */
		neigh_refresh_path(neigh, phdr->hwaddr, dev);
	}

	/* not sent: park the skb on the neigh queue, or drop it when the queue is full */
	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		push_pseudo_header(skb, phdr->hwaddr);
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

unref:
	ipoib_neigh_put(neigh);

	return NETDEV_TX_OK;
}

/*
 * TX watchdog handler (installed as .ndo_tx_timeout).
 *
 * When the rdma_netdev provides its own tx_timeout callback the event is
 * forwarded to it and nothing else is done.  Otherwise the queue state is
 * logged and priv->tx_timeout_work is scheduled.
 */
static void ipoib_timeout(struct net_device *dev, unsigned int txqueue)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct rdma_netdev *rn = netdev_priv(dev);

	if (rn->tx_timeout) {
		rn->tx_timeout(dev, txqueue);
		return;
	}
	ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
		   jiffies_to_msecs(jiffies - dev_trans_start(dev)));
	ipoib_warn(priv,
		   "queue stopped %d, tx_head %u, tx_tail %u, global_tx_head %u, global_tx_tail %u\n",
		   netif_queue_stopped(dev), priv->tx_head, priv->tx_tail,
		   priv->global_tx_head, priv->global_tx_tail);

	schedule_work(&priv->tx_timeout_work);
}

/*
 * Work handler for priv->tx_timeout_work (scheduled by ipoib_timeout()).
 *
 * Under rtnl_lock() and netdev_lock_ops(), and only while
 * IPOIB_FLAG_ADMIN_UP is set, the device is stopped and re-opened; the TX
 * queues are woken only when ipoib_open() succeeds.
 */
void ipoib_ib_tx_timeout_work(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work,
						   struct ipoib_dev_priv,
						   tx_timeout_work);
	int err;

rtnl_lock();
	netdev_lock_ops(priv->dev);

	/* nothing to recover when the interface is not administratively up */
	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		goto unlock;

	ipoib_stop(priv->dev);
	err = ipoib_open(priv->dev);
	if (err) {
		ipoib_warn(priv, "ipoib_open failed recovering from a tx_timeout, err(%d).\n",
			   err);
		goto unlock;
	}

	netif_tx_wake_all_queues(priv->dev);
unlock:
	netdev_unlock_ops(priv->dev);
	rtnl_unlock();

}

/*
 * header_ops .create callback: prepend the IPoIB header and pseudo header.
 *
 * Pushes a struct ipoib_header with proto = htons(type) and reserved = 0,
 * then pushes the pseudo header carrying @daddr.  @saddr and @len are not
 * used.  Always returns IPOIB_HARD_LEN.
 */
static int ipoib_hard_header(struct sk_buff *skb,
			     struct net_device *dev,
			     unsigned short type,
			     const void *daddr,
			     const void *saddr,
			     unsigned int len)
{
	struct ipoib_header *header;

	header = skb_push(skb, sizeof(*header));

	header->proto = htons(type);
	header->reserved = 0;

	/*
	 * we don't rely on dst_entry structure, always stuff the
	 * destination address into skb hard header so we can figure out where
	 * to send the packet later.
	 */
	push_pseudo_header(skb, daddr);

	return IPOIB_HARD_LEN;
}

/*
 * .ndo_set_rx_mode callback: queue priv->restart_task on priv->wq, but only
 * while IPOIB_FLAG_OPER_UP is set; otherwise just emit a debug message.
 */
static void ipoib_set_mcast_list(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
		ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
		return;
	}

	queue_work(priv->wq, &priv->restart_task);
}

/*
 * .ndo_get_iflink callback: return the device's own ifindex for a parent
 * interface, or the parent's ifindex when IPOIB_FLAG_SUBINTERFACE is set.
 */
static int ipoib_get_iflink(const struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	/* parent interface */
	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
		return READ_ONCE(dev->ifindex);

	/* child/vlan interface */
	return READ_ONCE(priv->parent->ifindex);
}

/*
 * Map a hardware address to a bucket index of @htbl.
 *
 * The hash is jhash_3words() over 32-bit words 3 and 4 of @daddr and word 0
 * masked with IPOIB_QPN_MASK; the result is reduced with htbl->mask.
 * NOTE(review): @daddr is accessed through a u32 pointer, so this assumes
 * the address buffer is suitably aligned - confirm against callers.
 */
static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
{
	/*
	 * Use only the address parts that contributes to spreading
	 * The subnet prefix is not used as one can not connect to
	 * same remote port (GUID) using the same remote QPN via two
	 * different subnets.
*/
	/* qpn octets[1:4) & port GUID octets[12:20) */
	u32 *d32 = (u32 *) daddr;
	u32 hv;

	hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0);
	return hv & htbl->mask;
}

/*
 * Look up the neigh entry for @daddr in the device's neigh hash table.
 *
 * The walk runs inside an rcu_read_lock_bh() section.  On a match one
 * reference is taken on behalf of the caller and the entry is returned; the
 * entry's ->alive stamp is refreshed only while its skb queue is below
 * IPOIB_MAX_PATH_REC_QUEUE.
 *
 * Returns NULL when there is no hash table, no matching entry, or the
 * matching entry's refcount has already dropped to zero.
 */
struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh = NULL;
	u32 hash_val;

	rcu_read_lock_bh();

	htbl = rcu_dereference_bh(ntbl->htbl);

	if (!htbl)
		goto out_unlock;

	hash_val = ipoib_addr_hash(htbl, daddr);
	for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
	     neigh != NULL;
	     neigh = rcu_dereference_bh(neigh->hnext)) {
		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
			/* found, take one ref on behalf of the caller */
			if (!refcount_inc_not_zero(&neigh->refcnt)) {
				/* deleted */
				neigh = NULL;
				goto out_unlock;
			}

			if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE))
				neigh->alive = jiffies;
			goto out_unlock;
		}
	}

out_unlock:
	/* neigh is NULL here when the loop ran off the end of the bucket */
	rcu_read_unlock_bh();
	return neigh;
}

/*
 * Garbage-collect idle neigh entries.
 *
 * With priv->lock held, every bucket is walked and each entry whose ->alive
 * stamp is older than two arp_tbl.gc_interval periods is unlinked from the
 * hash and from its path/mcast list, then handed to call_rcu() with
 * ipoib_neigh_reclaim().  Entries collected on remove_list by
 * ipoib_check_and_add_mcast_sendonly() are passed to
 * ipoib_mcast_remove_list() after the lock is released.
 */
static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long neigh_obsolete;
	unsigned long dt;
	unsigned long flags;
	int i;
	LIST_HEAD(remove_list);

	spin_lock_irqsave(&priv->lock, flags);

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));

	if (!htbl)
		goto out_unlock;

	/* neigh is obsolete if it was idle for two GC periods */
	dt = 2 * arp_tbl.gc_interval;
	neigh_obsolete = jiffies - dt;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		/* np points at the link that currently references neigh */
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
							  lockdep_is_held(&priv->lock))) != NULL) {
			/* was the neigh idle for two GC periods */
			if (time_after(neigh_obsolete, neigh->alive)) {

				ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list);

				rcu_assign_pointer(*np,
						   rcu_dereference_protected(neigh->hnext,
									     lockdep_is_held(&priv->lock)));
				/*
remove from path/mc list */
				list_del_init(&neigh->list);
				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			} else {
				/* entry kept: advance to its ->hnext link */
				np = &neigh->hnext;
			}

		}
	}

out_unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_mcast_remove_list(&remove_list);
}

/*
 * Delayed-work handler for priv->neigh_reap_task: run one reap pass and
 * re-queue itself on priv->wq after arp_tbl.gc_interval.
 */
static void ipoib_reap_neigh(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);

	__ipoib_reap_neigh(priv);

	queue_delayed_work(priv->wq, &priv->neigh_reap_task,
			   arp_tbl.gc_interval);
}

/*
 * Allocate and initialise a neigh entry for @daddr on @dev.
 *
 * The entry is zero-allocated with GFP_ATOMIC, its skb queue and list head
 * are initialised, its CM state is set to NULL and its refcount starts at 1
 * (the caller's reference).  Returns NULL when the allocation fails.
 */
static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
					    struct net_device *dev)
{
	struct ipoib_neigh *neigh;

	neigh = kzalloc_obj(*neigh, GFP_ATOMIC);
	if (!neigh)
		return NULL;

	neigh->dev = dev;
	memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
	skb_queue_head_init(&neigh->queue);
	INIT_LIST_HEAD(&neigh->list);
	ipoib_cm_set(neigh, NULL);
	/* one ref on behalf of the caller */
	refcount_set(&neigh->refcnt, 1);

	return neigh;
}

/*
 * Find or create the neigh entry for @daddr and insert it into the hash.
 *
 * The rcu_dereference_protected() calls are annotated with
 * lockdep_is_held(&priv->lock), i.e. the caller is expected to hold
 * priv->lock.  An existing live entry is returned with one extra reference
 * for the caller.  Otherwise a new entry is constructed, given a second
 * reference on behalf of the hash table, and linked at the head of its
 * bucket.
 *
 * Returns NULL when there is no hash table or the allocation fails.
 */
struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
				      struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh;
	u32 hash_val;

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));
	if (!htbl) {
		neigh = NULL;
		goto out_unlock;
	}

	/* need to add a new neigh, but maybe some other thread succeeded?
* recalc hash, maybe hash resize took place so we do a search
	 */
	hash_val = ipoib_addr_hash(htbl, daddr);
	for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
					       lockdep_is_held(&priv->lock));
	     neigh != NULL;
	     neigh = rcu_dereference_protected(neigh->hnext,
					       lockdep_is_held(&priv->lock))) {
		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
			/* found, take one ref on behalf of the caller */
			if (!refcount_inc_not_zero(&neigh->refcnt)) {
				/* deleted */
				neigh = NULL;
				/* stop searching and construct a fresh entry below */
				break;
			}
			neigh->alive = jiffies;
			goto out_unlock;
		}
	}

	neigh = ipoib_neigh_ctor(daddr, dev);
	if (!neigh)
		goto out_unlock;

	/* one ref on behalf of the hash table */
	refcount_inc(&neigh->refcnt);
	neigh->alive = jiffies;
	/* put in hash */
	rcu_assign_pointer(neigh->hnext,
			   rcu_dereference_protected(htbl->buckets[hash_val],
						     lockdep_is_held(&priv->lock)));
	rcu_assign_pointer(htbl->buckets[hash_val], neigh);
	atomic_inc(&ntbl->entries);

out_unlock:
	/* despite its name, no lock is released at this label */

	return neigh;
}

/*
 * Destroy a neigh entry and release everything it owns: the address handle
 * reference, any skbs still on its queue (each counted in tx_dropped) and
 * its connected-mode TX context.  After freeing the entry, the table's
 * entry counter is decremented; when it reaches zero while
 * IPOIB_NEIGH_TBL_FLUSH is set, priv->ntbl.flushed is completed.
 */
void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
{
	/* neigh reference count was dropped to zero */
	struct net_device *dev = neigh->dev;
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct sk_buff *skb;
	if (neigh->ah)
		ipoib_put_ah(neigh->ah);
	while ((skb = __skb_dequeue(&neigh->queue))) {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}
	if (ipoib_cm_get(neigh))
		ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
	ipoib_dbg(ipoib_priv(dev),
		  "neigh free for %06x %pI6\n",
		  IPOIB_QPN(neigh->daddr),
		  neigh->daddr + 4);
	/* dev and priv were read from neigh before this point */
	kfree(neigh);
	if (atomic_dec_and_test(&priv->ntbl.entries)) {
		if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
			complete(&priv->ntbl.flushed);
	}
}

/*
 * RCU callback queued via call_rcu() when a neigh is unlinked from the hash
 * table: puts one reference on the entry.
 */
static void ipoib_neigh_reclaim(struct rcu_head *rp)
{
	/* Called as a result of removal from hash table */
	struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
	/* note TX context may hold another ref */
	ipoib_neigh_put(neigh);
}

/*
 * Unlink @neigh from the hash table and from its parent list, and queue
 * ipoib_neigh_reclaim() through call_rcu().  Does nothing when there is no
 * hash table or @neigh is not found in its bucket.  The
 * rcu_dereference_protected() annotations expect priv->lock to be held.
 */
void ipoib_neigh_free(struct ipoib_neigh *neigh)
{
	struct net_device *dev = neigh->dev;
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct
ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh __rcu **np;
	struct ipoib_neigh *n;
	u32 hash_val;

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));
	if (!htbl)
		return;

	hash_val = ipoib_addr_hash(htbl, neigh->daddr);
	/* np tracks the link pointing at the entry under inspection */
	np = &htbl->buckets[hash_val];
	for (n = rcu_dereference_protected(*np,
					   lockdep_is_held(&priv->lock));
	     n != NULL;
	     n = rcu_dereference_protected(*np,
					   lockdep_is_held(&priv->lock))) {
		if (n == neigh) {
			/* found */
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
								     lockdep_is_held(&priv->lock)));
			/* remove from parent list */
			list_del_init(&neigh->list);
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			return;
		} else {
			np = &n->hnext;
		}
	}
}

/*
 * Create the neigh hash table for @priv and start garbage collection.
 *
 * The bucket array is sized to arp_tbl.gc_thresh3 rounded up to a power of
 * two, so htbl->mask (size - 1) can be used to reduce hash values.  The
 * entry counter is reset to 0 and priv->neigh_reap_task is queued on
 * priv->wq.
 *
 * Returns 0 on success or -ENOMEM when either allocation fails.
 */
static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh __rcu **buckets;
	u32 size;

	clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
	ntbl->htbl = NULL;
	htbl = kzalloc_obj(*htbl);
	if (!htbl)
		return -ENOMEM;
	size = roundup_pow_of_two(arp_tbl.gc_thresh3);
	buckets = kvzalloc_objs(*buckets, size);
	if (!buckets) {
		kfree(htbl);
		return -ENOMEM;
	}
	htbl->size = size;
	htbl->mask = (size - 1);
	htbl->buckets = buckets;
	RCU_INIT_POINTER(ntbl->htbl, htbl);
	/* back pointer used by neigh_hash_free_rcu() to signal ->deleted */
	htbl->ntbl = ntbl;
	atomic_set(&ntbl->entries, 0);

	/* start garbage collection */
	queue_delayed_work(priv->wq, &priv->neigh_reap_task,
			   arp_tbl.gc_interval);

	return 0;
}

/*
 * RCU callback that frees a retired hash table (bucket array and table
 * struct) and then completes ntbl->deleted.
 */
static void neigh_hash_free_rcu(struct rcu_head *head)
{
	struct ipoib_neigh_hash *htbl = container_of(head,
						    struct ipoib_neigh_hash,
						    rcu);
	struct ipoib_neigh __rcu **buckets = htbl->buckets;
	/* read ntbl before htbl is freed */
	struct ipoib_neigh_table *ntbl = htbl->ntbl;

	kvfree(buckets);
	kfree(htbl);
	complete(&ntbl->deleted);
}

/*
 * Remove every neigh entry whose GID (daddr + 4) equals @gid.  Takes
 * priv->lock for the duration of the walk.
 */
void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
	int i;

	/* remove all neigh connected to a given
path or mcast */
	spin_lock_irqsave(&priv->lock, flags);

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));

	if (!htbl)
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		/* np points at the link that currently references neigh */
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
							  lockdep_is_held(&priv->lock))) != NULL) {
			/* delete neighs belong to this parent */
			if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
				rcu_assign_pointer(*np,
						   rcu_dereference_protected(neigh->hnext,
									     lockdep_is_held(&priv->lock)));
				/* remove from parent list */
				list_del_init(&neigh->list);
				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			} else {
				np = &neigh->hnext;
			}

		}
	}
out_unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
}

/*
 * Empty the neigh hash table and retire it.
 *
 * IPOIB_NEIGH_TBL_FLUSH is set first so that ipoib_neigh_dtor() completes
 * priv->ntbl.flushed when the last entry goes away.  With priv->lock held,
 * every entry is unlinked and handed to call_rcu(); the table pointer is
 * then cleared and the table itself is freed through neigh_hash_free_rcu().
 * If the table held entries when sampled, the function blocks until the
 * flushed completion fires.
 */
static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
	int i, wait_flushed = 0;

	init_completion(&priv->ntbl.flushed);
	set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);

	spin_lock_irqsave(&priv->lock, flags);

	htbl = rcu_dereference_protected(ntbl->htbl,
					lockdep_is_held(&priv->lock));
	if (!htbl)
		goto out_unlock;

	/* entry count doubles as the "must wait" flag; zero skips the walk */
	wait_flushed = atomic_read(&priv->ntbl.entries);
	if (!wait_flushed)
		goto free_htbl;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
				       lockdep_is_held(&priv->lock))) != NULL) {
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
								     lockdep_is_held(&priv->lock)));
			/* remove from path/mc list */
			list_del_init(&neigh->list);
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
		}
	}

free_htbl:
	rcu_assign_pointer(ntbl->htbl, NULL);
	call_rcu(&htbl->rcu, neigh_hash_free_rcu);

out_unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
	if (wait_flushed)
		wait_for_completion(&priv->ntbl.flushed);
}

/*
 * Tear down the neigh hash table: cancel the reap work, flush all entries
 * and wait for ntbl->deleted.
 * NOTE(review): ntbl->deleted is only completed by neigh_hash_free_rcu(),
 * which ipoib_flush_neighs() queues only when a table exists - confirm a
 * table is always present when this is called.
 */
static void ipoib_neigh_hash_uninit(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_dbg(priv, "%s\n", __func__);
	init_completion(&priv->ntbl.deleted);

	cancel_delayed_work_sync(&priv->neigh_reap_task);

	ipoib_flush_neighs(priv);

	wait_for_completion(&priv->ntbl.deleted);
}

/*
 * Register the receive and send NAPI contexts of @dev with weights
 * IPOIB_NUM_WC and MAX_SEND_CQE respectively.
 */
static void ipoib_napi_add(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	netif_napi_add_weight(dev, &priv->recv_napi, ipoib_rx_poll,
			      IPOIB_NUM_WC);
	netif_napi_add_weight(dev, &priv->send_napi, ipoib_tx_poll,
			      MAX_SEND_CQE);
}

/* Unregister the NAPI contexts added by ipoib_napi_add(). */
static void ipoib_napi_del(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	netif_napi_del(&priv->recv_napi);
	netif_napi_del(&priv->send_napi);
}

/*
 * Default .ndo_uninit (see ipoib_netdev_default_pf): clean up the
 * transport, NAPI and CM state, then free both rings and clear their
 * pointers.
 */
static void ipoib_dev_uninit_default(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	ipoib_transport_dev_cleanup(dev);

	ipoib_napi_del(dev);

	ipoib_cm_dev_cleanup(dev);

	/* rx_ring is freed with kfree(), tx_ring with vfree() */
	kfree(priv->rx_ring);
	vfree(priv->tx_ring);

	priv->rx_ring = NULL;
	priv->tx_ring = NULL;
}

/*
 * Default .ndo_init (see ipoib_netdev_default_pf): add NAPI, allocate the
 * RX and TX rings, initialise the transport and write the 24-bit QP number
 * into bytes 1..3 of the device address.
 *
 * Returns 0 on success; every failure path returns -ENOMEM after undoing
 * the steps already taken.
 * NOTE(review): the error labels free the rings without resetting
 * priv->rx_ring / priv->tx_ring to NULL, unlike ipoib_dev_uninit_default().
 */
static int ipoib_dev_init_default(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	u8 addr_mod[3];

	ipoib_napi_add(dev);

	/* Allocate RX/TX "rings" to hold queued skbs */
	priv->rx_ring = kzalloc_objs(*priv->rx_ring, ipoib_recvq_size);
	if (!priv->rx_ring)
		goto out;

	priv->tx_ring = vzalloc(array_size(ipoib_sendq_size,
					   sizeof(*priv->tx_ring)));
	if (!priv->tx_ring) {
		pr_warn("%s: failed to allocate TX ring (%d entries)\n",
			priv->ca->name, ipoib_sendq_size);
		goto out_rx_ring_cleanup;
	}

	/* priv->tx_head, tx_tail and global_tx_tail/head are already 0 */

	if (ipoib_transport_dev_init(dev, priv->ca)) {
		pr_warn("%s: ipoib_transport_dev_init failed\n",
			priv->ca->name);
		goto out_tx_ring_cleanup;
	}

	/* after qp created set dev address */
	addr_mod[0] = (priv->qp->qp_num >> 16) & 0xff;
	addr_mod[1] = (priv->qp->qp_num >>  8) & 0xff;
	addr_mod[2] = (priv->qp->qp_num) & 0xff;
	dev_addr_mod(priv->dev, 1, addr_mod, sizeof(addr_mod));

	return 0;

out_tx_ring_cleanup:
	vfree(priv->tx_ring);

out_rx_ring_cleanup:
	kfree(priv->rx_ring);

out:
	ipoib_napi_del(dev);
	return -ENOMEM;
}

/*
 * .ndo_eth_ioctl callback: forward to the underlying rdma netdev ops saved
 * in priv->rn_ops, or return -EOPNOTSUPP when they provide no handler.
 */
static int ipoib_ioctl(struct
net_device *dev, struct ifreq *ifr,
		       int cmd)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (!priv->rn_ops->ndo_eth_ioctl)
		return -EOPNOTSUPP;

	return priv->rn_ops->ndo_eth_ioctl(dev, ifr, cmd);
}

/*
 * .ndo_hwtstamp_get callback: forward to priv->rn_ops, or return
 * -EOPNOTSUPP when the underlying ops have no ndo_hwtstamp_get.
 */
static int ipoib_hwtstamp_get(struct net_device *dev,
			      struct kernel_hwtstamp_config *config)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (!priv->rn_ops->ndo_hwtstamp_get)
		return -EOPNOTSUPP;

	return priv->rn_ops->ndo_hwtstamp_get(dev, config);
}

/*
 * .ndo_hwtstamp_set callback: forward to priv->rn_ops, or return
 * -EOPNOTSUPP when the underlying ops have no ndo_hwtstamp_set.
 */
static int ipoib_hwtstamp_set(struct net_device *dev,
			      struct kernel_hwtstamp_config *config,
			      struct netlink_ext_ack *extack)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (!priv->rn_ops->ndo_hwtstamp_set)
		return -EOPNOTSUPP;

	return priv->rn_ops->ndo_hwtstamp_set(dev, config, extack);
}

/*
 * Set up the per-device resources: the ordered workqueue, the protection
 * domain, the underlying netdev (rn_ops->ndo_init) and the neigh hash; the
 * IB device is opened as well when the interface is already IFF_UP.
 *
 * Returns 0 on success.  On failure the steps already completed are undone
 * in reverse order and a negative errno is returned: -ENOMEM for the
 * workqueue/PD failures (the initial value of ret), the callee's error for
 * ndo_init / hash init, and -ENODEV when ipoib_ib_dev_open() fails.
 */
static int ipoib_dev_init(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int ret = -ENOMEM;

	priv->qp = NULL;

	/*
	 * the various IPoIB tasks assume they will never race against
	 * themselves, so always use a single thread workqueue
	 */
	priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM);
	if (!priv->wq) {
		pr_warn("%s: failed to allocate device WQ\n", dev->name);
		goto out;
	}

	/* create pd, which used both for control and datapath */
	priv->pd = ib_alloc_pd(priv->ca, 0);
	if (IS_ERR(priv->pd)) {
		pr_warn("%s: failed to allocate PD\n", priv->ca->name);
		/*
		 * NOTE(review): priv->pd keeps the error pointer on this path
		 * and ret is still the initial -ENOMEM.
		 */
		goto clean_wq;
	}

	ret = priv->rn_ops->ndo_init(dev);
	if (ret) {
		pr_warn("%s failed to init HW resource\n", dev->name);
		goto out_free_pd;
	}

	ret = ipoib_neigh_hash_init(priv);
	if (ret) {
		pr_warn("%s failed to init neigh hash\n", dev->name);
		goto out_dev_uninit;
	}

	if (dev->flags & IFF_UP) {
		if (ipoib_ib_dev_open(dev)) {
			pr_warn("%s failed to open device\n", dev->name);
			ret = -ENODEV;
			goto out_hash_uninit;
		}
	}

	return 0;

	/* unwind labels: each one falls through into the next */
out_hash_uninit:
	ipoib_neigh_hash_uninit(dev);

out_dev_uninit:
	ipoib_ib_dev_cleanup(dev);

out_free_pd:
	if (priv->pd) {
		ib_dealloc_pd(priv->pd);
		priv->pd = NULL;
	}

clean_wq:
	if (priv->wq) {
		destroy_workqueue(priv->wq);
		priv->wq = NULL;
	}

out:
	return
ret;
}

/*
 * This must be called before doing an unregister_netdev on a parent device to
 * shutdown the IB event handler.
 *
 * It clears IFF_UP under rtnl_lock(), unregisters the IB event handler and
 * then flushes ipoib_workqueue with the rtnl lock released.
 */
static void ipoib_parent_unregister_pre(struct net_device *ndev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);

	/*
	 * ipoib_set_mac checks netif_running before pushing work, clearing
	 * running ensures that it will not add more work.
	 */
	rtnl_lock();
	dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL);
	rtnl_unlock();

	/* ipoib_event() cannot be running once this returns */
	ib_unregister_event_handler(&priv->event_handler);

	/*
	 * Work on the queue grabs the rtnl lock, so this cannot be done while
	 * also holding it.
	 */
	flush_workqueue(ipoib_workqueue);
}

/*
 * Cache the HCA capability flags in @priv and derive the netdev offload
 * features from them: IP checksum/RX checksum when IB_DEVICE_UD_IP_CSUM is
 * reported, plus TSO when IBK_UD_TSO is also reported.  Features are only
 * touched when the checksum capability is present.
 */
static void ipoib_set_dev_features(struct ipoib_dev_priv *priv)
{
	priv->hca_caps = priv->ca->attrs.device_cap_flags;
	priv->kernel_caps = priv->ca->attrs.kernel_cap_flags;

	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
		priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM;

		if (priv->kernel_caps & IBK_UD_TSO)
			priv->dev->hw_features |= NETIF_F_TSO;

		priv->dev->features |= priv->dev->hw_features;
	}
}

/*
 * Initialise a parent (non-child) interface from the IB port: query the
 * port attributes for the maximum MTU, read P_Key index 0 and GID index 0,
 * copy the GID into the device address at offset 4 and set the sysfs
 * parent device and port identifiers.
 *
 * Returns 0 on success or the error returned by the failing query.
 */
static int ipoib_parent_init(struct net_device *ndev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);
	struct ib_port_attr attr;
	int result;

	result = ib_query_port(priv->ca, priv->port, &attr);
	if (result) {
		pr_warn("%s: ib_query_port %d failed\n", priv->ca->name,
			priv->port);
		return result;
	}
	priv->max_ib_mtu = rdma_mtu_from_attr(priv->ca, priv->port, &attr);

	result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
	if (result) {
		pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n",
			priv->ca->name, priv->port, result);
		return result;
	}

	result = rdma_query_gid(priv->ca, priv->port, 0, &priv->local_gid);
	if (result) {
		pr_warn("%s: rdma_query_gid port %d failed (ret = %d)\n",
			priv->ca->name, priv->port, result);
		return result;
	}
	dev_addr_mod(priv->dev, 4, priv->local_gid.raw, sizeof(union ib_gid));

	SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent);
	/* dev_port is zero-based while priv->port is used as-is for IB queries */
	priv->dev->dev_port = priv->port - 1;
	/*
Let's set this one too for backwards compatibility. */
	priv->dev->dev_id = priv->port - 1;

	return 0;
}

/*
 * Initialise a child interface from its parent: inherit max_ib_mtu and mark
 * the device with IPOIB_FLAG_SUBINTERFACE.  If the child already has a
 * non-zero device address its GID part (offset 4) becomes local_gid;
 * otherwise both the address and local_gid are copied from the parent.
 */
static void ipoib_child_init(struct net_device *ndev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);
	struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);

	priv->max_ib_mtu = ppriv->max_ib_mtu;
	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
	/* memchr_inv() is used here as an "address has any non-zero byte" test */
	if (memchr_inv(priv->dev->dev_addr, 0, INFINIBAND_ALEN))
		memcpy(&priv->local_gid, priv->dev->dev_addr + 4,
		       sizeof(priv->local_gid));
	else {
		__dev_addr_set(priv->dev, ppriv->dev->dev_addr,
			       INFINIBAND_ALEN);
		memcpy(&priv->local_gid, &ppriv->local_gid,
		       sizeof(priv->local_gid));
	}
}

/*
 * .ndo_init callback shared by the PF and VF ops tables.
 *
 * Runs the child or parent specific initialisation, sets the MTU fields,
 * forces the full-membership bit in the P_Key, applies the device features
 * and calls ipoib_dev_init().  A child interface additionally takes a
 * reference on its parent and links itself on the parent's child_intfs
 * list.
 *
 * Returns 0 on success or the error from ipoib_parent_init() /
 * ipoib_dev_init().
 */
static int ipoib_ndo_init(struct net_device *ndev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);
	int rc;
	struct rdma_netdev *rn = netdev_priv(ndev);

	if (priv->parent) {
		ipoib_child_init(ndev);
	} else {
		rc = ipoib_parent_init(ndev);
		if (rc)
			return rc;
	}

	/* MTU will be reset when mcast join happens */
	ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
	priv->mcast_mtu = priv->admin_mtu = ndev->mtu;
	rn->mtu = priv->mcast_mtu;
	ndev->max_mtu = IPOIB_CM_MTU;

	ndev->neigh_priv_len = sizeof(struct ipoib_neigh);

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
*/
	priv->pkey |= 0x8000;

	/* mirror the P_Key into bytes 8..9 of the broadcast address */
	ndev->broadcast[8] = priv->pkey >> 8;
	ndev->broadcast[9] = priv->pkey & 0xff;
	set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);

	ipoib_set_dev_features(priv);

	rc = ipoib_dev_init(ndev);
	if (rc) {
		pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n",
			priv->ca->name, priv->dev->name, priv->port, rc);
		return rc;
	}

	if (priv->parent) {
		struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);

		/* reference dropped by dev_put() in ipoib_ndo_uninit() */
		dev_hold(priv->parent);

		netdev_lock(priv->parent);
		list_add_tail(&priv->list, &ppriv->child_intfs);
		netdev_unlock(priv->parent);
	}

	return 0;
}

/*
 * .ndo_uninit callback shared by the PF and VF ops tables.
 *
 * Unlinks a child from its parent's child_intfs list, tears down the neigh
 * hash and the IB device state, destroys priv->wq and finally puts the
 * reference on priv->parent.
 */
static void ipoib_ndo_uninit(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	/*
	 * ipoib_remove_one guarantees the children are removed before the
	 * parent, and that is the only place where a parent can be removed.
	 */
	WARN_ON(!list_empty(&priv->child_intfs));

	if (priv->parent) {
		struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);

		netdev_lock(ppriv->dev);
		list_del(&priv->list);
		netdev_unlock(ppriv->dev);
	}

	ipoib_neigh_hash_uninit(dev);

	ipoib_ib_dev_cleanup(dev);

	/* no more works over the priv->wq */
	if (priv->wq) {
		/* See ipoib_mcast_carrier_on_task() */
		WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags));
		destroy_workqueue(priv->wq);
		priv->wq = NULL;
	}

	/* NOTE(review): called unconditionally; assumes dev_put(NULL) is a no-op for parents */
	dev_put(priv->parent);
}

/*
 * .ndo_set_vf_link_state callback: forward to ib_set_vf_link_state() for
 * this interface's CA and port.
 */
static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state);
}

/*
 * .ndo_get_vf_config callback: fetch the VF configuration through
 * ib_get_vf_config(), then fill in ivf->vf and copy this device's address
 * into ivf->mac.  Returns 0 or the error from ib_get_vf_config().
 */
static int ipoib_get_vf_config(struct net_device *dev, int vf,
			       struct ifla_vf_info *ivf)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int err;

	err = ib_get_vf_config(priv->ca, vf, priv->port, ivf);
	if (err)
		return err;

	ivf->vf = vf;
	memcpy(ivf->mac, dev->dev_addr, dev->addr_len);

	return 0;
}

/*
 * .ndo_set_vf_guid callback: accept only IFLA_VF_IB_NODE_GUID and
 * IFLA_VF_IB_PORT_GUID as @type (-EINVAL otherwise) and forward to
 * ib_set_vf_guid().
 */
static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID)
		return -EINVAL;

return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type);
}

/*
 * .ndo_get_vf_guid callback: forward to ib_get_vf_guid() for this
 * interface's CA and port.
 */
static int ipoib_get_vf_guid(struct net_device *dev, int vf,
			     struct ifla_vf_guid *node_guid,
			     struct ifla_vf_guid *port_guid)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	return ib_get_vf_guid(priv->ca, vf, priv->port, node_guid, port_guid);
}

/*
 * .ndo_get_vf_stats callback: forward to ib_get_vf_stats() for this
 * interface's CA and port.
 */
static int ipoib_get_vf_stats(struct net_device *dev, int vf,
			      struct ifla_vf_stats *vf_stats)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats);
}

/* Link-layer header construction; installed by ipoib_setup_common(). */
static const struct header_ops ipoib_header_ops = {
	.create	= ipoib_hard_header,
};

/*
 * netdev ops installed by ipoib_intf_init() when the HCA does not report
 * IBK_VIRTUAL_FUNCTION.  Compared with ipoib_netdev_ops_vf this table adds
 * the VF management callbacks and .ndo_set_mac_address.
 */
static const struct net_device_ops ipoib_netdev_ops_pf = {
	.ndo_init		 = ipoib_ndo_init,
	.ndo_uninit		 = ipoib_ndo_uninit,
	.ndo_open		 = ipoib_open,
	.ndo_stop		 = ipoib_stop,
	.ndo_change_mtu		 = ipoib_change_mtu,
	.ndo_fix_features	 = ipoib_fix_features,
	.ndo_start_xmit		 = ipoib_start_xmit,
	.ndo_tx_timeout		 = ipoib_timeout,
	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
	.ndo_get_iflink		 = ipoib_get_iflink,
	.ndo_set_vf_link_state	 = ipoib_set_vf_link_state,
	.ndo_get_vf_config	 = ipoib_get_vf_config,
	.ndo_get_vf_stats	 = ipoib_get_vf_stats,
	.ndo_get_vf_guid	 = ipoib_get_vf_guid,
	.ndo_set_vf_guid	 = ipoib_set_vf_guid,
	.ndo_set_mac_address	 = ipoib_set_mac,
	.ndo_get_stats64	 = ipoib_get_stats,
	.ndo_eth_ioctl		 = ipoib_ioctl,
	.ndo_hwtstamp_get	 = ipoib_hwtstamp_get,
	.ndo_hwtstamp_set	 = ipoib_hwtstamp_set,
};

/*
 * netdev ops installed by ipoib_intf_init() when the HCA reports
 * IBK_VIRTUAL_FUNCTION.
 */
static const struct net_device_ops ipoib_netdev_ops_vf = {
	.ndo_init		 = ipoib_ndo_init,
	.ndo_uninit		 = ipoib_ndo_uninit,
	.ndo_open		 = ipoib_open,
	.ndo_stop		 = ipoib_stop,
	.ndo_change_mtu		 = ipoib_change_mtu,
	.ndo_fix_features	 = ipoib_fix_features,
	.ndo_start_xmit		 = ipoib_start_xmit,
	.ndo_tx_timeout		 = ipoib_timeout,
	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
	.ndo_get_iflink		 = ipoib_get_iflink,
	.ndo_get_stats64	 = ipoib_get_stats,
	.ndo_eth_ioctl		 = ipoib_ioctl,
	.ndo_hwtstamp_get	 = ipoib_hwtstamp_get,
	.ndo_hwtstamp_set	 = ipoib_hwtstamp_set,
};

/*
 * Default low-level ops installed by ipoib_setup_common(); ipoib_intf_init()
 * saves whatever dev->netdev_ops holds at that point into priv->rn_ops.
 */
static const struct net_device_ops ipoib_netdev_default_pf = {
	.ndo_init		 =
ipoib_dev_init_default,
	.ndo_uninit		 = ipoib_dev_uninit_default,
	.ndo_open		 = ipoib_ib_dev_open_default,
	.ndo_stop		 = ipoib_ib_dev_stop_default,
};

/*
 * Netdev setup callback passed to rdma_alloc_netdev() / alloc_netdev() /
 * rdma_init_netdev(): installs the header ops, the default netdev ops and
 * the ethtool ops, and fills in the link-layer constants (address length,
 * hard header length, type, queue length, flags, features and broadcast
 * address).
 */
void ipoib_setup_common(struct net_device *dev)
{
	dev->header_ops		 = &ipoib_header_ops;
	dev->netdev_ops          = &ipoib_netdev_default_pf;

	ipoib_set_ethtool_ops(dev);

	dev->watchdog_timeo	 = 10 * HZ;

	dev->flags		|= IFF_BROADCAST | IFF_MULTICAST;

	dev->hard_header_len	 = IPOIB_HARD_LEN;
	dev->addr_len		 = INFINIBAND_ALEN;
	dev->type		 = ARPHRD_INFINIBAND;
	dev->tx_queue_len	 = DEFAULT_TX_QUEUE_LEN;
	dev->features		 = (NETIF_F_VLAN_CHALLENGED	|
				    NETIF_F_HIGHDMA);
	netif_keep_dst(dev);

	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);

	/*
	 * unregister_netdev always frees the netdev, we use this mode
	 * consistently to unify all the various unregister paths, including
	 * those connected to rtnl_link_ops which require it.
	 */
	dev->needs_free_netdev = true;
}

/*
 * Initialise the software state of the private structure attached to @dev:
 * the back pointer, the lock and mutex, the list heads and all work items.
 */
static void ipoib_build_priv(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	priv->dev = dev;
	spin_lock_init(&priv->lock);
	mutex_init(&priv->mcast_mutex);

	INIT_LIST_HEAD(&priv->path_list);
	INIT_LIST_HEAD(&priv->child_intfs);
	INIT_LIST_HEAD(&priv->dead_ahs);
	INIT_LIST_HEAD(&priv->multicast_list);

	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
	INIT_WORK(&priv->reschedule_napi_work, ipoib_napi_schedule_work);
	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
	INIT_WORK(&priv->tx_timeout_work, ipoib_ib_tx_timeout_work);
	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
	INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
}

/*
 * Allocate the net_device for an IPoIB interface.
 *
 * rdma_alloc_netdev() is tried first; its result is returned unless it
 * failed with -EOPNOTSUPP, in which case a plain alloc_netdev() with a
 * struct rdma_netdev private area is used instead.
 *
 * Returns the device or an ERR_PTR() (never NULL).
 */
static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u32 port,
					     const char *name)
{
	struct net_device *dev;

	dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
NET_NAME_UNKNOWN, ipoib_setup_common);
	/* success, or any failure other than -EOPNOTSUPP, is returned as is */
	if (!IS_ERR(dev) || PTR_ERR(dev) != -EOPNOTSUPP)
		return dev;

	dev = alloc_netdev(sizeof(struct rdma_netdev), name, NET_NAME_UNKNOWN,
			   ipoib_setup_common);
	if (!dev)
		return ERR_PTR(-ENOMEM);
	return dev;
}

/*
 * Attach IPoIB private state to an already allocated @dev.
 *
 * Allocates the ipoib_dev_priv and calls rdma_init_netdev().  When that
 * returns -EOPNOTSUPP the rdma_netdev callbacks are filled in with the
 * ipoib_send / ipoib_mcast_attach / ipoib_mcast_detach implementations and
 * the device is limited to one TX and one RX queue.  The netdev ops in
 * place at that point are saved in priv->rn_ops and replaced by the VF or
 * PF table depending on IBK_VIRTUAL_FUNCTION.
 *
 * Returns 0 on success; on failure the private structure is freed and a
 * negative errno is returned.
 */
int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name,
		    struct net_device *dev)
{
	struct rdma_netdev *rn = netdev_priv(dev);
	struct ipoib_dev_priv *priv;
	int rc;

	priv = kzalloc_obj(*priv);
	if (!priv)
		return -ENOMEM;

	priv->ca = hca;
	priv->port = port;

	rc = rdma_init_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
			      NET_NAME_UNKNOWN, ipoib_setup_common, dev);
	if (rc) {
		if (rc != -EOPNOTSUPP)
			goto out;

		/* -EOPNOTSUPP: fall back to the software rdma_netdev callbacks */
		rn->send = ipoib_send;
		rn->attach_mcast = ipoib_mcast_attach;
		rn->detach_mcast = ipoib_mcast_detach;
		rn->hca = hca;

		rc = netif_set_real_num_tx_queues(dev, 1);
		if (rc)
			goto out;

		rc = netif_set_real_num_rx_queues(dev, 1);
		if (rc)
			goto out;
	}

	/* keep the underlying ops so the IPoIB wrappers can forward to them */
	priv->rn_ops = dev->netdev_ops;

	if (hca->attrs.kernel_cap_flags & IBK_VIRTUAL_FUNCTION)
		dev->netdev_ops	= &ipoib_netdev_ops_vf;
	else
		dev->netdev_ops	= &ipoib_netdev_ops_pf;

	rn->clnt_priv = priv;
	/*
	 * Only the child register_netdev flows can handle priv_destructor
	 * being set, so we force it to NULL here and handle manually until it
	 * is safe to turn on.
	 */
	priv->next_priv_destructor = dev->priv_destructor;
	dev->priv_destructor = NULL;

	ipoib_build_priv(dev);

	return 0;

out:
	kfree(priv);
	return rc;
}

/*
 * Allocate a net_device and initialise it as an IPoIB interface.
 *
 * Returns the device on success or an ERR_PTR(); when ipoib_intf_init()
 * fails the freshly allocated netdev is released with free_netdev().
 */
struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port,
				    const char *name)
{
	struct net_device *dev;
	int rc;

	dev = ipoib_alloc_netdev(hca, port, name);
	if (IS_ERR(dev))
		return dev;

	rc = ipoib_intf_init(hca, port, name, dev);
	if (rc) {
		free_netdev(dev);
		return ERR_PTR(rc);
	}

	/*
	 * Upon success the caller must ensure ipoib_intf_free is called or
	 * register_netdevice succeed'd and priv_destructor is set to
	 * ipoib_intf_free.
*/
	return dev;
}

/*
 * Release the IPoIB private state of @dev.
 *
 * The destructor saved in priv->next_priv_destructor by ipoib_intf_init()
 * is restored and invoked (when set), dev->priv_destructor is then cleared,
 * the rdma_netdev client pointer is reset and the private structure freed.
 */
void ipoib_intf_free(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct rdma_netdev *rn = netdev_priv(dev);

	dev->priv_destructor = priv->next_priv_destructor;
	if (dev->priv_destructor)
		dev->priv_destructor(dev);

	/*
	 * There are some error flows around register_netdev failing that may
	 * attempt to call priv_destructor twice, prevent that from happening.
	 */
	dev->priv_destructor = NULL;

	/* unregister/destroy is very complicated. Make bugs more obvious. */
	rn->clnt_priv = NULL;

	kfree(priv);
}

/* sysfs "pkey" (read-only): print the interface P_Key as 0x%04x. */
static ssize_t pkey_show(struct device *dev, struct device_attribute *attr,
			 char *buf)
{
	struct net_device *ndev = to_net_dev(dev);
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);

	return sysfs_emit(buf, "0x%04x\n", priv->pkey);
}
static DEVICE_ATTR_RO(pkey);

/* sysfs "umcast" read side: print the state of IPOIB_FLAG_UMCAST. */
static ssize_t umcast_show(struct device *dev, struct device_attribute *attr,
			   char *buf)
{
	struct net_device *ndev = to_net_dev(dev);
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);

	return sysfs_emit(buf, "%d\n",
			  test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
}

/*
 * Set IPOIB_FLAG_UMCAST when @umcast_val is positive (and log a warning),
 * clear it otherwise.
 */
void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
{
	struct ipoib_dev_priv *priv = ipoib_priv(ndev);

	if (umcast_val > 0) {
		set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
		ipoib_warn(priv, "ignoring multicast groups joined directly "
				"by userspace\n");
	} else
		clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
}

/*
 * sysfs "umcast" write side: parse the buffer as a number and apply it via
 * ipoib_set_umcast().  Always reports the whole buffer as consumed.
 * NOTE(review): the parse result is an unsigned long that is implicitly
 * converted to the int parameter of ipoib_set_umcast(), and no parse error
 * is reported to the writer.
 */
static ssize_t umcast_store(struct device *dev, struct device_attribute *attr,
			    const char *buf, size_t count)
{
	unsigned long umcast_val = simple_strtoul(buf, NULL, 0);

	ipoib_set_umcast(to_net_dev(dev), umcast_val);

	return count;
}
static DEVICE_ATTR_RW(umcast);

/* Create the "umcast" sysfs attribute on @dev; returns device_create_file()'s result. */
int ipoib_add_umcast_attr(struct net_device *dev)
{
	return device_create_file(&dev->dev, &dev_attr_umcast);
}

/*
 * Store the interface-id part of @gid into priv->local_gid and into the
 * device address (offset 4), clearing IPOIB_FLAG_DEV_ADDR_SET, all under
 * the netif address lock.  For a parent interface the same update is then
 * applied recursively to every child on child_intfs.
 */
static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid)
{
	struct ipoib_dev_priv *child_priv;
	struct net_device *netdev = priv->dev;

	netif_addr_lock_bh(netdev);

	memcpy(&priv->local_gid.global.interface_id,
	       &gid->global.interface_id,
sizeof(gid->global.interface_id));
	dev_addr_mod(netdev, 4, (u8 *)&priv->local_gid, sizeof(priv->local_gid));
	clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);

	netif_addr_unlock_bh(netdev);

	/* only a parent walks its children; a child stops the recursion here */
	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		netdev_lock_ops_to_full(priv->dev);
		list_for_each_entry(child_priv, &priv->child_intfs, list)
			set_base_guid(child_priv, gid);
		netdev_unlock_full_to_ops(priv->dev);
	}
}

/*
 * Validate a new link-layer address in @ss against the current one.
 *
 * Returns 0 when the first 4 bytes plus the subnet prefix match the current
 * device address and the interface id is non-zero, -EINVAL otherwise.
 */
static int ipoib_check_lladdr(struct net_device *dev,
			      struct sockaddr_storage *ss)
{
	/* the GID starts 4 bytes into the address */
	union ib_gid *gid = (union ib_gid *)(ss->__data + 4);
	int ret = 0;

	netif_addr_lock_bh(dev);

	/* Make sure the QPN, reserved and subnet prefix match the current
	 * lladdr, it also makes sure the lladdr is unicast.
	 */
	if (memcmp(dev->dev_addr, ss->__data,
		   4 + sizeof(gid->global.subnet_prefix)) ||
	    gid->global.interface_id == 0)
		ret = -EINVAL;

	netif_addr_unlock_bh(dev);

	return ret;
}

/*
 * .ndo_set_mac_address callback (PF ops table only).
 *
 * Returns -EBUSY when the device is running and does not advertise
 * IFF_LIVE_ADDR_CHANGE, or the error from ipoib_check_lladdr().  Otherwise
 * the new base GUID is applied, a light flush is queued on ipoib_workqueue
 * for each child (parent interfaces only) and for the interface itself,
 * and 0 is returned.
 */
static int ipoib_set_mac(struct net_device *dev, void *addr)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct sockaddr_storage *ss = addr;
	int ret;

	if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
		return -EBUSY;

	ret = ipoib_check_lladdr(dev, ss);
	if (ret)
		return ret;

	set_base_guid(priv, (union ib_gid *)(ss->__data + 4));

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		netdev_lock_ops_to_full(dev);
		list_for_each_entry(cpriv, &priv->child_intfs, list)
			queue_work(ipoib_workqueue, &cpriv->flush_light);
		netdev_unlock_full_to_ops(dev);
	}
	queue_work(ipoib_workqueue, &priv->flush_light);

	return 0;
}

/*
 * sysfs "create_child" (write-only): parse a P_Key with "%i" and create a
 * child interface for it through ipoib_vlan_add().
 *
 * Rejects with -EINVAL input that does not parse, values outside
 * 1..0xffff, and the value 0x8000.  Returns @count on success or the error
 * from ipoib_vlan_add().
 */
static ssize_t create_child_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
		return -EINVAL;

	ret = ipoib_vlan_add(to_net_dev(dev), pkey);

	return ret ?
ret : count; } static DEVICE_ATTR_WO(create_child); static ssize_t delete_child_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int pkey; int ret; if (sscanf(buf, "%i", &pkey) != 1) return -EINVAL; if (pkey < 0 || pkey > 0xffff) return -EINVAL; ret = ipoib_vlan_delete(to_net_dev(dev), pkey); return ret ? ret : count; } static DEVICE_ATTR_WO(delete_child); int ipoib_add_pkey_attr(struct net_device *dev) { return device_create_file(&dev->dev, &dev_attr_pkey); } /* * We erroneously exposed the iface's port number in the dev_id * sysfs field long after dev_port was introduced for that purpose[1], * and we need to stop everyone from relying on that. * Let's overload the shower routine for the dev_id file here * to gently bring the issue up. * * [1] https://www.spinics.net/lists/netdev/msg272123.html */ static ssize_t dev_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); /* * ndev->dev_port will be equal to 0 in old kernel prior to commit * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface * port numbers") Zero was chosen as special case for user space * applications to fallback and query dev_id to check if it has * different value or not. * * Don't print warning in such scenario. * * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358 */ if (ndev->dev_port && ndev->dev_id == ndev->dev_port) netdev_info_once(ndev, "\"%s\" wants to know my dev_id. Should it look at dev_port instead? 
See Documentation/ABI/testing/sysfs-class-net for more info.\n", current->comm); return sysfs_emit(buf, "%#x\n", ndev->dev_id); } static DEVICE_ATTR_RO(dev_id); static int ipoib_intercept_dev_id_attr(struct net_device *dev) { device_remove_file(&dev->dev, &dev_attr_dev_id); return device_create_file(&dev->dev, &dev_attr_dev_id); } static struct net_device *ipoib_add_port(const char *format, struct ib_device *hca, u32 port) { struct rtnl_link_ops *ops = ipoib_get_link_ops(); struct rdma_netdev_alloc_params params; struct ipoib_dev_priv *priv; struct net_device *ndev; int result; ndev = ipoib_intf_alloc(hca, port, format); if (IS_ERR(ndev)) { pr_warn("%s, %d: ipoib_intf_alloc failed %ld\n", hca->name, port, PTR_ERR(ndev)); return ndev; } priv = ipoib_priv(ndev); INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); ib_register_event_handler(&priv->event_handler); /* call event handler to ensure pkey in sync */ ipoib_queue_work(priv, IPOIB_FLUSH_HEAVY); ndev->rtnl_link_ops = ipoib_get_link_ops(); dev_net_set(ndev, rdma_dev_net(hca)); result = register_netdev(ndev); if (result) { pr_warn("%s: couldn't register ipoib port %d; error %d\n", hca->name, port, result); ipoib_parent_unregister_pre(ndev); ipoib_intf_free(ndev); free_netdev(ndev); return ERR_PTR(result); } if (hca->ops.rdma_netdev_get_params) { int rc = hca->ops.rdma_netdev_get_params(hca, port, RDMA_NETDEV_IPOIB, &params); if (!rc && ops->priv_size < params.sizeof_priv) ops->priv_size = params.sizeof_priv; } /* * We cannot set priv_destructor before register_netdev because we * need priv to be always valid during the error flow to execute * ipoib_parent_unregister_pre(). Instead handle it manually and only * enter priv_destructor mode once we are completely registered. 
*/
	ndev->priv_destructor = ipoib_intf_free;

	/* Any sysfs attribute failure unregisters the freshly added netdev. */
	if (ipoib_intercept_dev_id_attr(ndev))
		goto sysfs_failed;
	if (ipoib_cm_add_mode_attr(ndev))
		goto sysfs_failed;
	if (ipoib_add_pkey_attr(ndev))
		goto sysfs_failed;
	if (ipoib_add_umcast_attr(ndev))
		goto sysfs_failed;
	if (device_create_file(&ndev->dev, &dev_attr_create_child))
		goto sysfs_failed;
	if (device_create_file(&ndev->dev, &dev_attr_delete_child))
		goto sysfs_failed;

	return ndev;

sysfs_failed:
	ipoib_parent_unregister_pre(ndev);
	unregister_netdev(ndev);
	/* NOTE(review): the individual attribute errors are reported as -ENOMEM. */
	return ERR_PTR(-ENOMEM);
}

/*
 * IB client "add" callback: create one IPoIB interface for every port of
 * @device for which rdma_protocol_ib() is true, and record the resulting
 * list as the client data of @device.
 *
 * Return: 0 if at least one port was added, -ENOMEM if the list head could
 * not be allocated, -EOPNOTSUPP if no port could be added.
 */
static int ipoib_add_one(struct ib_device *device)
{
	struct list_head *dev_list;
	struct net_device *dev;
	struct ipoib_dev_priv *priv;
	unsigned int p;
	int count = 0;

	dev_list = kmalloc_obj(*dev_list);
	if (!dev_list)
		return -ENOMEM;

	INIT_LIST_HEAD(dev_list);

	rdma_for_each_port (device, p) {
		if (!rdma_protocol_ib(device, p))
			continue;
		dev = ipoib_add_port("ib%d", device, p);
		/* A port that fails to come up is skipped, not fatal. */
		if (!IS_ERR(dev)) {
			priv = ipoib_priv(dev);
			list_add_tail(&priv->list, dev_list);
			count++;
		}
	}

	if (!count) {
		kfree(dev_list);
		return -EOPNOTSUPP;
	}

	ib_set_client_data(device, &ipoib_client, dev_list);
	return 0;
}

/*
 * IB client "remove" callback: for every interface on the list passed as
 * @client_data, queue its child interfaces and then the interface itself for
 * unregistration and unregister them in one batch under the RTNL lock, then
 * free the list head.
 */
static void ipoib_remove_one(struct ib_device *device, void *client_data)
{
	struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv;
	struct list_head *dev_list = client_data;

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
		LIST_HEAD(head);
		ipoib_parent_unregister_pre(priv->dev);

		rtnl_lock();

		netdev_lock(priv->dev);
		list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs,
					 list)
			unregister_netdevice_queue(cpriv->dev, &head);
		netdev_unlock(priv->dev);
		unregister_netdevice_queue(priv->dev, &head);
		unregister_netdevice_many(&head);

		rtnl_unlock();
	}

	kfree(dev_list);
}

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/* Netdevice notifier, only built with CONFIG_INFINIBAND_IPOIB_DEBUG. */
static struct notifier_block ipoib_netdev_notifier = {
	.notifier_call = ipoib_netdev_event,
};
#endif

/*
 * Module init: clamp the queue-size module parameters, then register
 * debugfs, the flush workqueue, the SA and IB clients and netlink support.
 */
static int __init ipoib_init_module(void)
{
	int ret;

	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
	ipoib_recvq_size = min(ipoib_recvq_size,
IPOIB_MAX_QUEUE_SIZE);
	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

	/* The send queue additionally never drops below 2 * MAX_SEND_CQE. */
	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
#ifdef CONFIG_INFINIBAND_IPOIB_CM
	/* Clamp the connected-mode QP limit to [0, IPOIB_CM_MAX_CONN_QP]. */
	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
	ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0);
#endif

	/*
	 * When copying small received packets, we only copy from the
	 * linear data part of the SKB, so we rely on this condition.
	 */
	BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);

	ipoib_register_debugfs();

	/*
	 * We create a global workqueue here that is used for all flush
	 * operations.  However, if you attempt to flush a workqueue
	 * from a task on that same workqueue, it deadlocks the system.
	 * We want to be able to flush the tasks associated with a
	 * specific net device, so we also create a workqueue for each
	 * netdevice.  We queue up the tasks for that device only on
	 * its private workqueue, and we only queue up flush events
	 * on our global flush workqueue.  This avoids the deadlocks.
	 */
	ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", 0);
	if (!ipoib_workqueue) {
		ret = -ENOMEM;
		goto err_fs;
	}

	ib_sa_register_client(&ipoib_sa_client);

	ret = ib_register_client(&ipoib_client);
	if (ret)
		goto err_sa;

	ret = ipoib_netlink_init();
	if (ret)
		goto err_client;

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
	register_netdevice_notifier(&ipoib_netdev_notifier);
#endif
	return 0;

	/* Error unwinding: undo the steps above in reverse order. */
err_client:
	ib_unregister_client(&ipoib_client);

err_sa:
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);

err_fs:
	ipoib_unregister_debugfs();

	return ret;
}

/* Module exit: unregister everything ipoib_init_module() registered. */
static void __exit ipoib_cleanup_module(void)
{
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
	unregister_netdevice_notifier(&ipoib_netdev_notifier);
#endif
	ipoib_netlink_fini();
	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);
	ipoib_unregister_debugfs();
	destroy_workqueue(ipoib_workqueue);
}

module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);
277 28 262 119 62 62 62 76 13 3 13 10 72 70 34 55 55 72 88 7 14 2 13 2 1 2 14 67 76 2 70 4 10 4 2 8 67 57 18 20 13 15 7 20 55 38 9 52 15 45 12 52 51 15 38 37 2 55 173 94 1 167 2 1 6 76 127 46 167 146 21 165 3 92 58 22 91 18 91 122 55 8 8 4 35 2 179 4 11 166 116 1 4 115 116 68 114 83 25 44 3 101 8 35 90 77 46 103 104 35 22 13 1 55 50 115 104 11 1 74 7 68 39 26 227 4 1 25 7 125 2 2 125 2 123 4 3 9 62 3 58 6 122 102 101 74 2 163 2 99 130 10 220 12 116 125 124 25 5 25 25 10 10 5 15 3 6 15 3 4 28 3 25 25 74 1 2 40 32 10 4 34 24 70 247 1 1 5 5 239 240 25 16 2 44 40 2 1 2 24 1 1 2 1 15 3 17 5 17 7 15 1 14 9 1 10 8 10 36 6 4 17 2 18 10 10 44 1 1 3 3 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 
364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 
864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 
1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 
1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 // SPDX-License-Identifier: GPL-2.0-only /* * "splice": joining two ropes together by interweaving their strands. * * This is the "extended pipe" functionality, where a pipe is used as * an arbitrary in-memory buffer. Think of a pipe as a small kernel * buffer that you can use to transfer data from one end to the other. * * The traditional unix read/write is extended with a "splice()" operation * that transfers data buffers to or from a pipe buffer. 
*
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/bvec.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/sched/signal.h>

#include "internal.h"

/*
 * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
 * indicate they support non-blocking reads or writes, we must clear it
 * here if set to avoid blocking other users of this pipe if splice is
 * being done on it.
 */
static noinline void pipe_clear_nowait(struct file *file)
{
	fmode_t fmode = READ_ONCE(file->f_mode);

	/*
	 * Retry the compare-and-exchange until it succeeds, or stop as soon
	 * as the flag is observed to be already clear.
	 */
	do {
		if (!(fmode & FMODE_NOWAIT))
			break;
	} while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
}

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
		struct pipe_buffer *buf)
{
	struct folio *folio = page_folio(buf->page);
	struct address_space *mapping;

	folio_lock(folio);

	mapping = folio_mapping(folio);
	if (mapping) {
		WARN_ON(!folio_test_uptodate(folio));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this folio, since we'll remove it
		 * from the pagecache.
Otherwise truncate won't wait on the
		 * folio, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		folio_wait_writeback(folio);

		if (!filemap_release_folio(folio, GFP_KERNEL))
			goto out_unlock;

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, folio)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			/* Success returns with the folio still locked. */
			return true;
		}
	}

	/*
	 * Raced with truncate or failed to remove folio from current
	 * address space, unlock and return failure.
	 */
out_unlock:
	folio_unlock(folio);
	return false;
}

/* Drop the buffer's page reference and clear its PIPE_BUF_FLAG_LRU flag. */
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	put_page(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Check whether the contents of buf is OK to access. Since the content
 * is a page cache page, IO may be in flight.
 *
 * Returns 0 when the folio is up to date, -ENODATA when it no longer has a
 * mapping, and -EIO when it is still not up to date once locked.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct folio *folio = page_folio(buf->page);
	int err;

	if (!folio_test_uptodate(folio)) {
		folio_lock(folio);

		/*
		 * Folio got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!folio->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
*/
		if (!folio_test_uptodate(folio)) {
			err = -EIO;
			goto error;
		}

		/* Folio is ok after all, we are done */
		folio_unlock(folio);
	}

	return 0;
error:
	folio_unlock(folio);
	return err;
}

/* Pipe buffer operations backed by the page_cache_pipe_buf_*() helpers. */
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.confirm	= page_cache_pipe_buf_confirm,
	.release	= page_cache_pipe_buf_release,
	.try_steal	= page_cache_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/*
 * Steal a buffer only when it carries PIPE_BUF_FLAG_GIFT; in that case mark
 * it PIPE_BUF_FLAG_LRU and defer to generic_pipe_buf_try_steal().
 */
static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
		struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return false;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_try_steal(pipe, buf);
}

/* Like page_cache_pipe_buf_ops, but without .confirm and GIFT-only steal. */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.release	= page_cache_pipe_buf_release,
	.try_steal	= user_page_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/*
 * Wake anyone sleeping on the pipe's read waitqueue and send SIGIO/POLL_IN
 * to the registered async readers.
 */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
	/* Full barrier before the lockless waitqueue_active() check. */
	smp_mb();
	if (waitqueue_active(&pipe->rd_wait))
		wake_up_interruptible(&pipe->rd_wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:	pipe to fill
 * @spd:	data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
*
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	unsigned int tail = pipe->tail;
	unsigned int head = pipe->head;
	ssize_t ret = 0;
	int page_nr = 0;

	if (!spd_pages)
		return 0;

	/* No reader left: raise SIGPIPE and release every page below. */
	if (unlikely(!pipe->readers)) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* Link pages into the pipe until it is full or @spd is exhausted. */
	while (!pipe_full(head, tail, pipe->max_usage)) {
		struct pipe_buffer *buf = pipe_buf(pipe, head);

		buf->page = spd->pages[page_nr];
		buf->offset = spd->partial[page_nr].offset;
		buf->len = spd->partial[page_nr].len;
		buf->private = spd->partial[page_nr].private;
		buf->ops = spd->ops;
		buf->flags = 0;

		head++;
		pipe->head = head;
		page_nr++;
		ret += buf->len;

		if (!--spd->nr_pages)
			break;
	}

	/* Nothing could be queued: report -EAGAIN rather than 0. */
	if (!ret)
		ret = -EAGAIN;

out:
	/* Release the pages that were not handed over to the pipe. */
	while (page_nr < spd_pages)
		spd->spd_release(spd, page_nr++);

	return ret;
}
EXPORT_SYMBOL_GPL(splice_to_pipe);

/*
 * Append a single, already prepared buffer to @pipe.
 *
 * Returns the buffer length on success.  On failure the buffer is released
 * and -EPIPE (no readers, SIGPIPE is also sent) or -EAGAIN (pipe full) is
 * returned.
 */
ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	unsigned int head = pipe->head;
	unsigned int tail = pipe->tail;
	int ret;

	if (unlikely(!pipe->readers)) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
	} else if (pipe_full(head, tail, pipe->max_usage)) {
		ret = -EAGAIN;
	} else {
		*pipe_buf(pipe, head) = *buf;
		pipe->head = head + 1;
		return buf->len;
	}
	pipe_buf_release(pipe, buf);
	return ret;
}
EXPORT_SYMBOL(add_to_pipe);

/*
 * Check if we need to grow the arrays holding pages and partial page
 * descriptions.
*/
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
{
	unsigned int max_usage = READ_ONCE(pipe->max_usage);

	spd->nr_pages_max = max_usage;
	/* Up to PIPE_DEF_BUFFERS entries the caller's arrays are kept. */
	if (max_usage <= PIPE_DEF_BUFFERS)
		return 0;

	spd->pages = kmalloc_objs(struct page *, max_usage);
	spd->partial = kmalloc_objs(struct partial_page, max_usage);

	if (spd->pages && spd->partial)
		return 0;

	/* At least one allocation failed: free whichever one succeeded. */
	kfree(spd->pages);
	kfree(spd->partial);
	return -ENOMEM;
}

/*
 * Counterpart of splice_grow_spd(): free the arrays, but only when
 * nr_pages_max shows that splice_grow_spd() allocated them.
 */
void splice_shrink_spd(struct splice_pipe_desc *spd)
{
	if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
		return;

	kfree(spd->pages);
	kfree(spd->partial);
}

/**
 * copy_splice_read -  Copy data from a file and splice the copy into a pipe
 * @in: The file to read from
 * @ppos: Pointer to the file position to read from
 * @pipe: The pipe to splice into
 * @len: The amount to splice
 * @flags: The SPLICE_F_* flags
 *
 * This function allocates a bunch of pages sufficient to hold the requested
 * amount of data (but limited by the remaining pipe capacity), passes it to
 * the file's ->read_iter() to read into and then splices the used pages into
 * the pipe.
 *
 * Return: On success, the number of bytes read will be returned and *@ppos
 * will be updated if appropriate; 0 will be returned if there is no more data
 * to be read; -EAGAIN will be returned if the pipe had no space, and some
 * other negative error code will be returned on error.  A short read may occur
 * if the pipe has insufficient space, we reach the end of the data or we hit a
 * hole.
*/
ssize_t copy_splice_read(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe,
			 size_t len, unsigned int flags)
{
	struct iov_iter to;
	struct bio_vec *bv;
	struct kiocb kiocb;
	struct page **pages;
	ssize_t ret;
	size_t used, npages, chunk, remain, keep = 0;
	int i;

	/* Work out how much data we can actually add into the pipe */
	used = pipe_buf_usage(pipe);
	npages = max_t(ssize_t, pipe->max_usage - used, 0);
	len = min_t(size_t, len, npages * PAGE_SIZE);
	npages = DIV_ROUND_UP(len, PAGE_SIZE);

	/* One allocation holds the bio_vec array followed by the page array. */
	bv = kzalloc(array_size(npages, sizeof(bv[0])) +
		     array_size(npages, sizeof(struct page *)), GFP_KERNEL);
	if (!bv)
		return -ENOMEM;

	pages = (struct page **)(bv + npages);
	/* npages becomes the count alloc_pages_bulk() returned; 0 is failure. */
	npages = alloc_pages_bulk(GFP_USER, npages, pages);
	if (!npages) {
		kfree(bv);
		return -ENOMEM;
	}

	/* Shrink the request to what the allocated pages can hold. */
	remain = len = min_t(size_t, len, npages * PAGE_SIZE);

	for (i = 0; i < npages; i++) {
		chunk = min_t(size_t, PAGE_SIZE, remain);
		bv[i].bv_page = pages[i];
		bv[i].bv_offset = 0;
		bv[i].bv_len = chunk;
		remain -= chunk;
	}

	/* Do the I/O */
	iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
	init_sync_kiocb(&kiocb, in);
	kiocb.ki_pos = *ppos;
	ret = in->f_op->read_iter(&kiocb, &to);

	if (ret > 0) {
		/* Number of pages that contain at least one byte of data. */
		keep = DIV_ROUND_UP(ret, PAGE_SIZE);
		*ppos = kiocb.ki_pos;
	}

	/*
	 * Callers of ->splice_read() expect -EAGAIN on "can't put anything in
	 * there", rather than -EFAULT.
	 */
	if (ret == -EFAULT)
		ret = -EAGAIN;

	/* Free any pages that didn't get touched at all. */
	if (keep < npages)
		release_pages(pages + keep, npages - keep);

	/* Push the remaining pages into the pipe.
*/
	remain = ret;
	for (i = 0; i < keep; i++) {
		struct pipe_buffer *buf = pipe_head_buf(pipe);

		/* Every buffer is a full page except possibly the last one. */
		chunk = min_t(size_t, remain, PAGE_SIZE);
		*buf = (struct pipe_buffer) {
			.ops	= &default_pipe_buf_ops,
			.page	= bv[i].bv_page,
			.offset	= 0,
			.len	= chunk,
		};
		pipe->head++;
		remain -= chunk;
	}

	kfree(bv);
	return ret;
}
EXPORT_SYMBOL(copy_splice_read);

/* Default pipe buffer operations, built from the generic_pipe_buf_*() helpers. */
const struct pipe_buf_operations default_pipe_buf_ops = {
	.release	= generic_pipe_buf_release,
	.try_steal	= generic_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/* Pipe buffer operations for a socket and similar. */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
	.release	= generic_pipe_buf_release,
	.get		= generic_pipe_buf_get,
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);

/*
 * Wake anyone sleeping on the pipe's write waitqueue and send SIGIO/POLL_OUT
 * to the registered async writers.
 */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
	/* Full barrier before the lockless waitqueue_active() check. */
	smp_mb();
	if (waitqueue_active(&pipe->wr_wait))
		wake_up_interruptible(&pipe->wr_wait);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}

/**
 * splice_from_pipe_feed - feed available data from a pipe to a file
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function loops over the pipe and calls @actor to do the
 *    actual moving of a single struct pipe_buffer to the desired
 *    destination.  It returns when there's no more buffers left in
 *    the pipe or if the requested number of bytes (@sd->total_len)
 *    have been copied.  It returns a positive number (one) if the
 *    pipe needs to be filled with more data, zero if the required
 *    number of bytes have been copied and -errno on error.
 *
 *    This, together with splice_from_pipe_{begin,end,next}, may be
 *    used to implement the functionality of __splice_from_pipe() when
 *    locking is required around copying the pipe buffers to the
 *    destination.
*/
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
			  splice_actor *actor)
{
	unsigned int head = pipe->head;
	unsigned int tail = pipe->tail;
	int ret;

	while (!pipe_empty(head, tail)) {
		struct pipe_buffer *buf = pipe_buf(pipe, tail);

		/* Never offer the actor more than is still requested. */
		sd->len = buf->len;
		if (sd->len > sd->total_len)
			sd->len = sd->total_len;

		ret = pipe_buf_confirm(pipe, buf);
		if (unlikely(ret)) {
			/* -ENODATA is reported to the caller as 0, not an error. */
			if (ret == -ENODATA)
				ret = 0;
			return ret;
		}

		ret = actor(pipe, buf, sd);
		if (ret <= 0)
			return ret;

		/* Account for the bytes the actor consumed. */
		buf->offset += ret;
		buf->len -= ret;

		sd->num_spliced += ret;
		sd->len -= ret;
		sd->pos += ret;
		sd->total_len -= ret;

		/* Buffer fully consumed: release it and advance the tail. */
		if (!buf->len) {
			pipe_buf_release(pipe, buf);
			tail++;
			pipe->tail = tail;
			if (pipe->files)
				sd->need_wakeup = true;
		}

		if (!sd->total_len)
			return 0;
	}

	return 1;
}

/* We know we have a pipe buffer, but maybe it's empty? */
static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
{
	unsigned int tail = pipe->tail;
	struct pipe_buffer *buf = pipe_buf(pipe, tail);

	/* A zero-length tail buffer is released and skipped. */
	if (unlikely(!buf->len)) {
		pipe_buf_release(pipe, buf);
		pipe->tail = tail+1;
		return true;
	}

	return false;
}

/**
 * splice_from_pipe_next - wait for some data to splice from
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wait for some data and return a positive
 *    value (one) if pipe buffers are available.  It will return zero
 *    or -errno if no more data needs to be spliced.
*/
static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	/*
	 * Check for signal early to make process killable when there are
	 * always buffers available
	 */
	if (signal_pending(current))
		return -ERESTARTSYS;

repeat:
	while (pipe_is_empty(pipe)) {
		/* No writers left: nothing more will ever arrive. */
		if (!pipe->writers)
			return 0;

		/* Some data was already spliced: return it instead of waiting. */
		if (sd->num_spliced)
			return 0;

		if (sd->flags & SPLICE_F_NONBLOCK)
			return -EAGAIN;

		if (signal_pending(current))
			return -ERESTARTSYS;

		/* Deliver a pending writer wakeup before going to sleep. */
		if (sd->need_wakeup) {
			wakeup_pipe_writers(pipe);
			sd->need_wakeup = false;
		}

		pipe_wait_readable(pipe);
	}

	/* An empty buffer was dropped: re-check whether the pipe has data. */
	if (eat_empty_buffer(pipe))
		goto repeat;

	return 1;
}

/**
 * splice_from_pipe_begin - start splicing from pipe
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function should be called before a loop containing
 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 *    initialize the necessary fields of @sd.
 */
static void splice_from_pipe_begin(struct splice_desc *sd)
{
	sd->num_spliced = 0;
	sd->need_wakeup = false;
}

/**
 * splice_from_pipe_end - finish splicing from pipe
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wake up pipe writers if necessary.  It should
 *    be called after a loop containing splice_from_pipe_next() and
 *    splice_from_pipe_feed().
 */
static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	if (sd->need_wakeup)
		wakeup_pipe_writers(pipe);
}

/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendmsg, or
 *    pipe_to_user.
*
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
			   splice_actor *actor)
{
	int ret;

	splice_from_pipe_begin(sd);
	/* Alternate "wait for data" and "feed data" until either stops. */
	do {
		cond_resched();
		ret = splice_from_pipe_next(pipe, sd);
		if (ret > 0)
			ret = splice_from_pipe_feed(pipe, sd, actor);
	} while (ret > 0);
	splice_from_pipe_end(pipe, sd);

	/* Bytes already moved take precedence over a late error code. */
	return sd->num_spliced ? sd->num_spliced : ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:	pipe to splice from
 * @out:	file to splice to
 * @ppos:	position in @out
 * @len:	how many bytes to splice
 * @flags:	splice modifier flags
 * @actor:	handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the pipe inode,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	pipe_lock(pipe);
	ret = __splice_from_pipe(pipe, &sd, actor);
	pipe_unlock(pipe);

	return ret;
}

/**
 * iter_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *    This one is ->write_iter-based.
* */ ssize_t iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; int nbufs = pipe->max_usage; struct bio_vec *array; ssize_t ret; if (!out->f_op->write_iter) return -EINVAL; array = kzalloc_objs(struct bio_vec, nbufs); if (unlikely(!array)) return -ENOMEM; pipe_lock(pipe); splice_from_pipe_begin(&sd); while (sd.total_len) { struct kiocb kiocb; struct iov_iter from; unsigned int head, tail; size_t left; int n; ret = splice_from_pipe_next(pipe, &sd); if (ret <= 0) break; if (unlikely(nbufs < pipe->max_usage)) { kfree(array); nbufs = pipe->max_usage; array = kzalloc_objs(struct bio_vec, nbufs); if (!array) { ret = -ENOMEM; break; } } head = pipe->head; tail = pipe->tail; /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t this_len = buf->len; /* zero-length bvecs are not supported, skip them */ if (!this_len) continue; this_len = min(this_len, left); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; goto done; } bvec_set_page(&array[n], buf->page, this_len, buf->offset); left -= this_len; n++; } iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); init_sync_kiocb(&kiocb, out); kiocb.ki_pos = sd.pos; ret = out->f_op->write_iter(&kiocb, &from); sd.pos = kiocb.ki_pos; if (ret <= 0) break; WARN_ONCE(ret > sd.total_len - left, "Splice Exceeded! 
ret=%zd tot=%zu left=%zu\n", ret, sd.total_len, left); sd.num_spliced += ret; sd.total_len -= ret; *ppos = sd.pos; /* dismiss the fully eaten buffers, adjust the partial one */ tail = pipe->tail; while (ret) { struct pipe_buffer *buf = pipe_buf(pipe, tail); if (ret >= buf->len) { ret -= buf->len; buf->len = 0; pipe_buf_release(pipe, buf); tail++; pipe->tail = tail; if (pipe->files) sd.need_wakeup = true; } else { buf->offset += ret; buf->len -= ret; ret = 0; } } } done: kfree(array); splice_from_pipe_end(pipe, &sd); pipe_unlock(pipe); if (sd.num_spliced) ret = sd.num_spliced; return ret; } EXPORT_SYMBOL(iter_file_splice_write); #ifdef CONFIG_NET /** * splice_to_socket - splice data from a pipe to a socket * @pipe: pipe to splice from * @out: socket to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will send @len bytes from the pipe to a network socket. No data copying * is involved. * */ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct socket *sock = sock_from_file(out); struct bio_vec bvec[16]; struct msghdr msg = {}; ssize_t ret = 0; size_t spliced = 0; bool need_wakeup = false; pipe_lock(pipe); while (len > 0) { unsigned int head, tail, bc = 0; size_t remain = len; /* * Check for signal early to make process killable when there * are always buffers available */ ret = -ERESTARTSYS; if (signal_pending(current)) break; while (pipe_is_empty(pipe)) { ret = 0; if (!pipe->writers) goto out; if (spliced) goto out; ret = -EAGAIN; if (flags & SPLICE_F_NONBLOCK) goto out; ret = -ERESTARTSYS; if (signal_pending(current)) goto out; if (need_wakeup) { wakeup_pipe_writers(pipe); need_wakeup = false; } pipe_wait_readable(pipe); } head = pipe->head; tail = pipe->tail; while (!pipe_empty(head, tail)) { struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg; if (!buf->len) { tail++; continue; } seg = min_t(size_t, remain, 
buf->len); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; break; } bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset); remain -= seg; if (remain == 0 || bc >= ARRAY_SIZE(bvec)) break; tail++; } if (!bc) break; msg.msg_flags = MSG_SPLICE_PAGES; if (flags & SPLICE_F_MORE) msg.msg_flags |= MSG_MORE; if (remain && pipe_occupancy(pipe->head, tail) > 0) msg.msg_flags |= MSG_MORE; if (out->f_flags & O_NONBLOCK) msg.msg_flags |= MSG_DONTWAIT; iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc, len - remain); ret = sock_sendmsg(sock, &msg); if (ret <= 0) break; spliced += ret; len -= ret; tail = pipe->tail; while (ret > 0) { struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg = min_t(size_t, ret, buf->len); buf->offset += seg; buf->len -= seg; ret -= seg; if (!buf->len) { pipe_buf_release(pipe, buf); tail++; } } if (tail != pipe->tail) { pipe->tail = tail; if (pipe->files) need_wakeup = true; } } out: pipe_unlock(pipe); if (need_wakeup) wakeup_pipe_writers(pipe); return spliced ?: ret; } #endif static int warn_unsupported(struct file *file, const char *op) { pr_debug_ratelimited( "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } /* * Attempt to initiate a splice from pipe to file. */ static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { if (unlikely(!out->f_op->splice_write)) return warn_unsupported(out, "write"); return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* * Indicate to the caller that there was a premature EOF when reading from the * source and the caller didn't indicate they would be sending more data after * this. */ static void do_splice_eof(struct splice_desc *sd) { if (sd->splice_eof) sd->splice_eof(sd); } /* * Callers already called rw_verify_area() on the entire range. * No need to call it for sub ranges. 
*/ static ssize_t do_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { unsigned int p_space; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; if (!len) return 0; /* Don't try to read more the pipe has space for. */ p_space = pipe->max_usage - pipe_buf_usage(pipe); len = min_t(size_t, len, p_space << PAGE_SHIFT); if (unlikely(len > MAX_RW_COUNT)) len = MAX_RW_COUNT; if (unlikely(!in->f_op->splice_read)) return warn_unsupported(in, "read"); /* * O_DIRECT and DAX don't deal with the pagecache, so we allocate a * buffer, copy into it and splice that into the pipe. */ if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host)) return copy_splice_read(in, ppos, pipe, len, flags); return in->f_op->splice_read(in, ppos, pipe, len, flags); } /** * vfs_splice_read - Read data from a file and splice it into a pipe * @in: File to splice from * @ppos: Input file offset * @pipe: Pipe to splice to * @len: Number of bytes to splice * @flags: Splice modifier flags (SPLICE_F_*) * * Splice the requested amount of data from the input file to the pipe. This * is synchronous as the caller must hold the pipe lock across the entire * operation. * * If successful, it returns the amount of data spliced, 0 if it hit the EOF or * a hole and a negative error code otherwise. */ ssize_t vfs_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { ssize_t ret; ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; return do_splice_read(in, ppos, pipe, len, flags); } EXPORT_SYMBOL_GPL(vfs_splice_read); /** * splice_direct_to_actor - splices data directly between two non-pipes * @in: file to splice from * @sd: actor information on where to splice to * @actor: handles the data splicing * * Description: * This is a special case helper to splice directly between two * points, without requiring an explicit pipe. 
Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	ssize_t ret, bytes;
	size_t len;
	int i, flags, more;

	/*
	 * We require the input to be seekable, as we don't want to randomly
	 * drop data for eg socket -> socket splicing. Use the piped splicing
	 * for that!
	 */
	if (unlikely(!(in->f_mode & FMODE_LSEEK)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		/* Cache the pipe on the task so later calls reuse it. */
		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	bytes = 0;
	len = sd->total_len;

	/* Don't block on output, we have to drain the direct pipe. */
	/* 'flags' keeps the caller's original flags for the read side. */
	flags = sd->flags;
	sd->flags &= ~SPLICE_F_NONBLOCK;

	/*
	 * We signal MORE until we've read sufficient data to fulfill the
	 * request and we keep signalling it if the caller set it.
	 */
	more = sd->flags & SPLICE_F_MORE;
	sd->flags |= SPLICE_F_MORE;

	WARN_ON_ONCE(!pipe_is_empty(pipe));

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos, prev_pos = pos;

		ret = do_splice_read(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto read_failure;

		/* The actor is asked to move exactly what was just read. */
		read_len = ret;
		sd->total_len = read_len;

		/*
		 * If we now have sufficient data to fulfill the request then
		 * we clear SPLICE_F_MORE if it was not set initially.
		 */
		if (read_len >= len && !more)
			sd->flags &= ~SPLICE_F_MORE;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			/* Nothing was written: rewind the input position. */
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		/* Short write: advance only by what the actor consumed. */
		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	/* Reset the ring indices of the internal pipe before returning. */
	pipe->tail = pipe->head = 0;
	file_accessed(in);
	return bytes;

read_failure:
	/*
	 * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
	 * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
	 * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
	 * least 1 byte *then* we will also do the ->splice_eof() call.
	 */
	if (ret == 0 && !more && len > 0 && bytes)
		do_splice_eof(sd);
out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = &pipe->bufs[i];

		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}

	/* With no bytes moved, report the last status instead. */
	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);

/*
 * Actor used by do_splice_direct(): forwards the pipe contents to sd->u.file
 * via do_splice_from(), bracketed by file_start_write()/file_end_write().
 */
static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	long ret;

	file_start_write(file);
	ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
	file_end_write(file);
	return ret;
}

/*
 * Actor used by splice_file_range(): same as direct_splice_actor() but
 * without the file_start_write()/file_end_write() bracket.
 */
static int splice_file_range_actor(struct pipe_inode_info *pipe,
					struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
}

/*
 * ->splice_eof handler installed by do_splice_direct_actor(): forwards the
 * notification to the output file's ->splice_eof(), if it has one.
 */
static void direct_file_splice_eof(struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	if (file->f_op->splice_eof)
		file->f_op->splice_eof(file);
}

/*
 * Common body of do_splice_direct() and splice_file_range(): build the
 * splice descriptor, reject outputs that are not writable (-EBADF) or are
 * opened O_APPEND (-EINVAL), run splice_direct_to_actor() with @actor and,
 * when data was moved, store the new input position in *@ppos.
 */
static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos,
				      struct file *out, loff_t *opos,
				      size_t len, unsigned int flags,
				      splice_direct_actor *actor)
{
	struct splice_desc sd = {
		.len		= len,
		.total_len	= len,
		.flags		= flags,
		.pos		= *ppos,
		.u.file		= out,
		.splice_eof	= direct_file_splice_eof,
		.opos		= opos,
	};
	ssize_t ret;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	if (unlikely(out->f_flags & O_APPEND))
		return -EINVAL;

	ret = splice_direct_to_actor(in, &sd, actor);
	if (ret > 0)
		*ppos = sd.pos;

	return ret;
}

/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @opos:	output file offset
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 * Callers already called rw_verify_area() on the entire range.
 */
ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
			 loff_t *opos, size_t len, unsigned int flags)
{
	return do_splice_direct_actor(in, ppos, out, opos, len, flags,
				      direct_splice_actor);
}
EXPORT_SYMBOL(do_splice_direct);

/**
 * splice_file_range - splices data between two files for copy_file_range()
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @opos:	output file offset
 * @len:	number of bytes to splice
 *
 * Description:
 *    For use by ->copy_file_range() methods.
 *    Like do_splice_direct(), but vfs_copy_file_range() already holds
 *    start_file_write() on @out file.
 *
 * Callers already called rw_verify_area() on the entire range.
*/ ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len) { lockdep_assert(file_write_started(out)); return do_splice_direct_actor(in, ppos, out, opos, min_t(size_t, len, MAX_RW_COUNT), 0, splice_file_range_actor); } EXPORT_SYMBOL(splice_file_range); static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) { for (;;) { if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); return -EPIPE; } if (!pipe_is_full(pipe)) return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; pipe_wait_writable(pipe); } } static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); ssize_t splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags) { ssize_t ret; pipe_lock(opipe); ret = wait_for_space(opipe, flags); if (!ret) ret = do_splice_read(in, offset, opipe, len, flags); pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); return ret; } /* * Determine where to splice to/from. */ ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out, loff_t *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset; ssize_t ret; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) return -EBADF; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); if (ipipe && opipe) { if (off_in || off_out) return -ESPIPE; /* Splicing to self would be fun, but... 
*/ if (ipipe == opipe) return -EINVAL; if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; ret = splice_pipe_to_pipe(ipipe, opipe, len, flags); } else if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { if (!(out->f_mode & FMODE_PWRITE)) return -EINVAL; offset = *off_out; } else { offset = out->f_pos; } if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; ret = rw_verify_area(WRITE, out, &offset, len); if (unlikely(ret < 0)) return ret; if (in->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; file_start_write(out); ret = do_splice_from(ipipe, out, &offset, len, flags); file_end_write(out); if (!off_out) out->f_pos = offset; else *off_out = offset; } else if (opipe) { if (off_out) return -ESPIPE; if (off_in) { if (!(in->f_mode & FMODE_PREAD)) return -EINVAL; offset = *off_in; } else { offset = in->f_pos; } ret = rw_verify_area(READ, in, &offset, len); if (unlikely(ret < 0)) return ret; if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; ret = splice_file_to_pipe(in, opipe, &offset, len, flags); if (!off_in) in->f_pos = offset; else *off_in = offset; } else { ret = -EINVAL; } if (ret > 0) { /* * Generate modify out before access in: * do_splice_from() may've already sent modify out, * and this ensures the events get merged. 
*/ fsnotify_modify(out); fsnotify_access(in); } return ret; } static ssize_t __do_splice(struct file *in, loff_t __user *off_in, struct file *out, loff_t __user *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset, *__off_in = NULL, *__off_out = NULL; ssize_t ret; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); if (ipipe) { if (off_in) return -ESPIPE; pipe_clear_nowait(in); } if (opipe) { if (off_out) return -ESPIPE; pipe_clear_nowait(out); } if (off_out) { if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; __off_out = &offset; } if (off_in) { if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; __off_in = &offset; } ret = do_splice(in, __off_in, out, __off_out, len, flags); if (ret < 0) return ret; if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t))) return -EFAULT; if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t))) return -EFAULT; return ret; } static ssize_t iter_to_pipe(struct iov_iter *from, struct pipe_inode_info *pipe, unsigned int flags) { struct pipe_buffer buf = { .ops = &user_page_pipe_buf_ops, .flags = flags }; size_t total = 0; ssize_t ret = 0; while (iov_iter_count(from)) { struct page *pages[16]; ssize_t left; size_t start; int i, n; left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start); if (left <= 0) { ret = left; break; } n = DIV_ROUND_UP(left + start, PAGE_SIZE); for (i = 0; i < n; i++) { int size = umin(left, PAGE_SIZE - start); buf.page = pages[i]; buf.offset = start; buf.len = size; ret = add_to_pipe(pipe, &buf); if (unlikely(ret < 0)) { iov_iter_revert(from, left); // this one got dropped by add_to_pipe() while (++i < n) put_page(pages[i]); goto out; } total += ret; left -= size; start = 0; } } out: return total ? 
total : ret; } static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); return n == sd->len ? n : -EFAULT; } /* * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipe's pages to the user iov. */ static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, unsigned int flags) { struct pipe_inode_info *pipe = get_pipe_info(file, true); struct splice_desc sd = { .total_len = iov_iter_count(iter), .flags = flags, .u.data = iter }; ssize_t ret = 0; if (!pipe) return -EBADF; pipe_clear_nowait(file); if (sd.total_len) { pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); } if (ret > 0) fsnotify_access(file); return ret; } /* * vmsplice splices a user address range into a pipe. It can be thought of * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. */ static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter, unsigned int flags) { struct pipe_inode_info *pipe; ssize_t ret = 0; unsigned buf_flag = 0; if (flags & SPLICE_F_GIFT) buf_flag = PIPE_BUF_FLAG_GIFT; pipe = get_pipe_info(file, true); if (!pipe) return -EBADF; pipe_clear_nowait(file); pipe_lock(pipe); ret = wait_for_space(pipe, flags); if (!ret) ret = iter_to_pipe(iter, pipe, buf_flag); pipe_unlock(pipe); if (ret > 0) { wakeup_pipe_readers(pipe); fsnotify_modify(file); } return ret; } /* * Note that vmsplice only really supports true splicing _from_ user memory * to a pipe, not the other way around. Splicing from user memory is a simple * operation that can be supported without any funky alignment restrictions * or nasty vm tricks. We simply map in the user memory and fill them into * a pipe. The reverse isn't quite as easy, though. 
There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restriction limitations on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
		unsigned long, nr_segs, unsigned int, flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t error;
	int type;

	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	CLASS(fd, f)(fd);
	if (fd_empty(f))
		return -EBADF;
	/* The file's access mode picks the direction; write mode wins. */
	if (fd_file(f)->f_mode & FMODE_WRITE)
		type = ITER_SOURCE;
	else if (fd_file(f)->f_mode & FMODE_READ)
		type = ITER_DEST;
	else
		return -EBADF;

	error = import_iovec(type, uiov, nr_segs,
			     ARRAY_SIZE(iovstack), &iov, &iter);
	if (error < 0)
		return error;

	/* An empty iovec is a successful no-op. */
	if (!iov_iter_count(&iter))
		error = 0;
	else if (type == ITER_SOURCE)
		error = vmsplice_to_pipe(fd_file(f), &iter, flags);
	else
		error = vmsplice_to_user(fd_file(f), &iter, flags);

	kfree(iov);
	return error;
}

/*
 * splice(2) entry point: a zero @len returns 0 immediately, unknown flag
 * bits give -EINVAL, and a bad descriptor gives -EBADF; everything else is
 * handled by __do_splice().
 */
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	if (unlikely(!len))
		return 0;

	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	CLASS(fd, in)(fd_in);
	if (fd_empty(in))
		return -EBADF;

	CLASS(fd, out)(fd_out);
	if (fd_empty(out))
		return -EBADF;

	return __do_splice(fd_file(in), off_in, fd_file(out), off_out,
					    len, flags);
}

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 *
 * Returns 0 when the pipe has data or has no writers left, -ERESTARTSYS
 * if a signal is pending, or -EAGAIN for SPLICE_F_NONBLOCK.
 */
static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check the pipe occupancy without the inode lock first. This function
	 * is speculative anyways, so missing one is ok.
	 */
	if (!pipe_is_empty(pipe))
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (pipe_is_empty(pipe)) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		pipe_wait_readable(pipe);
	}

	pipe_unlock(pipe);
	return ret;
}

/*
 * Make sure there's writeable room. Wait for room if we can, otherwise
 * return an appropriate error.
 *
 * Returns 0 when the pipe is not full, -EPIPE (after sending SIGPIPE to
 * the current task) if there are no readers, -EAGAIN for
 * SPLICE_F_NONBLOCK, or -ERESTARTSYS if a signal is pending.
 */
static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check pipe occupancy without the inode lock first. This function
	 * is speculative anyways, so missing one is ok.
	 */
	if (!pipe_is_full(pipe))
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (pipe_is_full(pipe)) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe_wait_writable(pipe);
	}

	pipe_unlock(pipe);
	return ret;
}

/*
 * Splice contents of ipipe to opipe.
 *
 * Moves up to @len bytes.  Returns the number of bytes moved, or a
 * negative error if nothing was moved.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	int ret = 0;
	bool input_wakeup = false;


retry:
	ret = ipipe_prep(ipipe, flags);
	if (ret)
		return ret;

	ret = opipe_prep(opipe, flags);
	if (ret)
		return ret;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	i_tail = ipipe->tail;
	o_head = opipe->head;

	do {
		size_t o_len;

		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		i_head = ipipe->head;
		o_tail = opipe->tail;

		/* Input drained and nobody can refill it: we are done. */
		if (pipe_empty(i_head, i_tail) && !ipipe->writers)
			break;

		/*
		 * Cannot make any progress, because either the input
		 * pipe is empty or the output pipe is full.
		 */
		if (pipe_empty(i_head, i_tail) ||
		    pipe_full(o_head, o_tail, opipe->max_usage)) {
			/* Already processed some buffers, break */
			if (ret)
				break;

			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}

			/*
			 * We raced with another reader/writer and haven't
			 * managed to process any buffers.  A zero return
			 * value means EOF, so retry instead.
			 */
			pipe_unlock(ipipe);
			pipe_unlock(opipe);
			goto retry;
		}

		ibuf = pipe_buf(ipipe, i_tail);
		obuf = pipe_buf(opipe, o_head);

		if (len >= ibuf->len) {
			/*
			 * Simply move the whole buffer from ipipe to opipe
			 */
			*obuf = *ibuf;
			ibuf->ops = NULL;
			i_tail++;
			ipipe->tail = i_tail;
			input_wakeup = true;
			o_len = obuf->len;
			o_head++;
			opipe->head = o_head;
		} else {
			/*
			 * Get a reference to this pipe buffer,
			 * so we can copy the contents over.
			 */
			if (!pipe_buf_get(ipipe, ibuf)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			*obuf = *ibuf;

			/*
			 * Don't inherit the gift and merge flags, we need to
			 * prevent multiple steals of this page.
			 */
			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
			obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

			/* Split: opipe gets the first len bytes, ipipe keeps the rest. */
			obuf->len = len;
			ibuf->offset += len;
			ibuf->len -= len;
			o_len = len;
			o_head++;
			opipe->head = o_head;
		}
		ret += o_len;
		len -= o_len;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	if (input_wakeup)
		wakeup_pipe_writers(ipipe);

	return ret;
}

/*
 * Link contents of ipipe to opipe.
 *
 * Unlike splice_pipe_to_pipe(), ipipe->tail is never written here: each
 * input buffer gains a reference and is duplicated into opipe.
 */
static ssize_t link_pipe(struct pipe_inode_info *ipipe,
			 struct pipe_inode_info *opipe,
			 size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	ssize_t ret = 0;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	i_tail = ipipe->tail;
	o_head = opipe->head;

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		i_head = ipipe->head;
		o_tail = opipe->tail;

		/*
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
		if (pipe_empty(i_head, i_tail) ||
		    pipe_full(o_head, o_tail, opipe->max_usage))
			break;

		ibuf = pipe_buf(ipipe, i_tail);
		obuf = pipe_buf(opipe, o_head);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		if (!pipe_buf_get(ipipe, ibuf)) {
			if (ret == 0)
				ret = -EFAULT;
			break;
		}

		*obuf = *ibuf;

		/*
		 * Don't inherit the gift and merge flag, we need to prevent
		 * multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
		obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

		/* Trim the duplicate to the remaining byte budget. */
		if (obuf->len > len)
			obuf->len = len;
		ret += obuf->len;
		len -= obuf->len;

		o_head++;
		opipe->head = o_head;
		i_tail++;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 *
 * Returns -EBADF if @in is not readable or @out is not writable, and
 * -EINVAL unless both are pipes and distinct.
 */
ssize_t do_tee(struct file *in, struct file *out, size_t len,
	       unsigned int flags)
{
	struct pipe_inode_info *ipipe = get_pipe_info(in, true);
	struct pipe_inode_info *opipe = get_pipe_info(out, true);
	ssize_t ret = -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ) ||
		     !(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = opipe_prep(opipe, flags);
			if (!ret)
				ret = link_pipe(ipipe, opipe, len, flags);
		}
	}

	if (ret > 0) {
		fsnotify_access(in);
		fsnotify_modify(out);
	}

	return ret;
}

/*
 * tee(2) entry point: unknown flag bits give -EINVAL, a zero @len returns
 * 0, and a bad descriptor gives -EBADF; everything else is do_tee().
 */
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	if (unlikely(!len))
		return 0;

	CLASS(fd, in)(fdin);
	if (fd_empty(in))
		return -EBADF;

	CLASS(fd, out)(fdout);
	if (fd_empty(out))
		return -EBADF;

	return do_tee(fd_file(in), fd_file(out), len, flags);
}
14 15 3 6 4 2 2 5 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, * randomly fail to work with new releases, misbehave and/or generally * screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. */ #include <linux/if_arp.h> #include <linux/init.h> #include <linux/slab.h> #include <net/x25.h> LIST_HEAD(x25_route_list); DEFINE_RWLOCK(x25_route_list_lock); /* * Add a new route. 
*/ static int x25_add_route(struct x25_address *address, unsigned int sigdigits, struct net_device *dev) { struct x25_route *rt; int rc = -EINVAL; write_lock_bh(&x25_route_list_lock); list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, address, sigdigits) && rt->sigdigits == sigdigits) goto out; } rt = kmalloc_obj(*rt, GFP_ATOMIC); rc = -ENOMEM; if (!rt) goto out; strcpy(rt->address.x25_addr, "000000000000000"); memcpy(rt->address.x25_addr, address->x25_addr, sigdigits); rt->sigdigits = sigdigits; rt->dev = dev; refcount_set(&rt->refcnt, 1); list_add(&rt->node, &x25_route_list); rc = 0; out: write_unlock_bh(&x25_route_list_lock); return rc; } /** * __x25_remove_route - remove route from x25_route_list * @rt: route to remove * * Remove route from x25_route_list. If it was there. * Caller must hold x25_route_list_lock. */ static void __x25_remove_route(struct x25_route *rt) { if (rt->node.next) { list_del(&rt->node); x25_route_put(rt); } } static int x25_del_route(struct x25_address *address, unsigned int sigdigits, struct net_device *dev) { struct x25_route *rt; int rc = -EINVAL; write_lock_bh(&x25_route_list_lock); list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, address, sigdigits) && rt->sigdigits == sigdigits && rt->dev == dev) { __x25_remove_route(rt); rc = 0; break; } } write_unlock_bh(&x25_route_list_lock); return rc; } /* * A device has been removed, remove its routes. */ void x25_route_device_down(struct net_device *dev) { struct x25_route *rt; struct list_head *entry, *tmp; write_lock_bh(&x25_route_list_lock); list_for_each_safe(entry, tmp, &x25_route_list) { rt = list_entry(entry, struct x25_route, node); if (rt->dev == dev) __x25_remove_route(rt); } write_unlock_bh(&x25_route_list_lock); } /* * Check that the device given is a valid X.25 interface that is "up". 
*/ struct net_device *x25_dev_get(char *devname) { struct net_device *dev = dev_get_by_name(&init_net, devname); if (dev && (!(dev->flags & IFF_UP) || dev->type != ARPHRD_X25)) { dev_put(dev); dev = NULL; } return dev; } /** * x25_get_route - Find a route given an X.25 address. * @addr: - address to find a route for * * Find a route given an X.25 address. */ struct x25_route *x25_get_route(struct x25_address *addr) { struct x25_route *rt, *use = NULL; read_lock_bh(&x25_route_list_lock); list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, addr, rt->sigdigits)) { if (!use) use = rt; else if (rt->sigdigits > use->sigdigits) use = rt; } } if (use) x25_route_hold(use); read_unlock_bh(&x25_route_list_lock); return use; } /* * Handle the ioctls that control the routing functions. */ int x25_route_ioctl(unsigned int cmd, void __user *arg) { struct x25_route_struct rt; struct net_device *dev; int rc = -EINVAL; if (cmd != SIOCADDRT && cmd != SIOCDELRT) goto out; rc = -EFAULT; if (copy_from_user(&rt, arg, sizeof(rt))) goto out; rc = -EINVAL; if (rt.sigdigits > 15) goto out; dev = x25_dev_get(rt.device); if (!dev) goto out; if (cmd == SIOCADDRT) rc = x25_add_route(&rt.address, rt.sigdigits, dev); else rc = x25_del_route(&rt.address, rt.sigdigits, dev); dev_put(dev); out: return rc; } /* * Release all memory associated with X.25 routing structures. */ void __exit x25_route_free(void) { struct x25_route *rt; struct list_head *entry, *tmp; write_lock_bh(&x25_route_list_lock); list_for_each_safe(entry, tmp, &x25_route_list) { rt = list_entry(entry, struct x25_route, node); __x25_remove_route(rt); } write_unlock_bh(&x25_route_list_lock); }
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2014 Felix Fietkau <nbd@nbd.name> * Copyright (C) 2004 - 2009 Ivo van Doorn <IvDoorn@gmail.com> */ #ifndef _LINUX_BITFIELD_H #define _LINUX_BITFIELD_H #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/typecheck.h> #include <asm/byteorder.h> /* * Bitfield access macros * * FIELD_{GET,PREP} macros take as first parameter shifted mask * from which they extract the base mask and shift amount. * Mask must be a compilation time constant. * field_{get,prep} are variants that take a non-const mask. 
* * Example: * * #include <linux/bitfield.h> * #include <linux/bits.h> * * #define REG_FIELD_A GENMASK(6, 0) * #define REG_FIELD_B BIT(7) * #define REG_FIELD_C GENMASK(15, 8) * #define REG_FIELD_D GENMASK(31, 16) * * Get: * a = FIELD_GET(REG_FIELD_A, reg); * b = FIELD_GET(REG_FIELD_B, reg); * * Set: * reg = FIELD_PREP(REG_FIELD_A, 1) | * FIELD_PREP(REG_FIELD_B, 0) | * FIELD_PREP(REG_FIELD_C, c) | * FIELD_PREP(REG_FIELD_D, 0x40); * * Modify: * FIELD_MODIFY(REG_FIELD_C, &reg, c); */ #define __bf_shf(x) (__builtin_ffsll(x) - 1) #define __scalar_type_to_unsigned_cases(type) \ unsigned type: (unsigned type)0, \ signed type: (unsigned type)0 #define __unsigned_scalar_typeof(x) typeof( \ _Generic((x), \ char: (unsigned char)0, \ __scalar_type_to_unsigned_cases(char), \ __scalar_type_to_unsigned_cases(short), \ __scalar_type_to_unsigned_cases(int), \ __scalar_type_to_unsigned_cases(long), \ __scalar_type_to_unsigned_cases(long long), \ default: (x))) #define __bf_cast_unsigned(type, x) ((__unsigned_scalar_typeof(type))(x)) #define __BF_FIELD_CHECK_MASK(_mask, _val, _pfx) \ ({ \ BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \ _pfx "mask is not constant"); \ BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero"); \ BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? 
\ ~((_mask) >> __bf_shf(_mask)) & \ (0 + (_val)) : 0, \ _pfx "value too large for the field"); \ __BUILD_BUG_ON_NOT_POWER_OF_2((_mask) + \ (1ULL << __bf_shf(_mask))); \ }) #define __BF_FIELD_CHECK_REG(mask, reg, pfx) \ BUILD_BUG_ON_MSG(__bf_cast_unsigned(mask, mask) > \ __bf_cast_unsigned(reg, ~0ull), \ pfx "type of reg too small for mask") #define __BF_FIELD_CHECK(mask, reg, val, pfx) \ ({ \ __BF_FIELD_CHECK_MASK(mask, val, pfx); \ __BF_FIELD_CHECK_REG(mask, reg, pfx); \ }) #define __FIELD_PREP(mask, val, pfx) \ ({ \ __BF_FIELD_CHECK_MASK(mask, val, pfx); \ ((typeof(mask))(val) << __bf_shf(mask)) & (mask); \ }) #define __FIELD_GET(mask, reg, pfx) \ ({ \ __BF_FIELD_CHECK_MASK(mask, 0U, pfx); \ (typeof(mask))(((reg) & (mask)) >> __bf_shf(mask)); \ }) /** * FIELD_MAX() - produce the maximum value representable by a field * @_mask: shifted mask defining the field's length and position * * FIELD_MAX() returns the maximum value that can be held in the field * specified by @_mask. */ #define FIELD_MAX(_mask) \ ({ \ __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_MAX: "); \ (typeof(_mask))((_mask) >> __bf_shf(_mask)); \ }) /** * FIELD_FIT() - check if value fits in the field * @_mask: shifted mask defining the field's length and position * @_val: value to test against the field * * Return: true if @_val can fit inside @_mask, false if @_val is too big. */ #define FIELD_FIT(_mask, _val) \ ({ \ __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_FIT: "); \ !((((typeof(_mask))_val) << __bf_shf(_mask)) & ~(_mask)); \ }) /** * FIELD_PREP() - prepare a bitfield element * @_mask: shifted mask defining the field's length and position * @_val: value to put in the field * * FIELD_PREP() masks and shifts up the value. The result should * be combined with other fields of the bitfield using logical OR. 
*/ #define FIELD_PREP(_mask, _val) \ ({ \ __BF_FIELD_CHECK_REG(_mask, 0ULL, "FIELD_PREP: "); \ __FIELD_PREP(_mask, _val, "FIELD_PREP: "); \ }) #define __BF_CHECK_POW2(n) BUILD_BUG_ON_ZERO(((n) & ((n) - 1)) != 0) /** * FIELD_PREP_CONST() - prepare a constant bitfield element * @_mask: shifted mask defining the field's length and position * @_val: value to put in the field * * FIELD_PREP_CONST() masks and shifts up the value. The result should * be combined with other fields of the bitfield using logical OR. * * Unlike FIELD_PREP() this is a constant expression and can therefore * be used in initializers. Error checking is less comfortable for this * version, and non-constant masks cannot be used. */ #define FIELD_PREP_CONST(_mask, _val) \ ( \ /* mask must be non-zero */ \ BUILD_BUG_ON_ZERO((_mask) == 0) + \ /* check if value fits */ \ BUILD_BUG_ON_ZERO(~((_mask) >> __bf_shf(_mask)) & (_val)) + \ /* check if mask is contiguous */ \ __BF_CHECK_POW2((_mask) + (1ULL << __bf_shf(_mask))) + \ /* and create the value */ \ (((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask)) \ ) /** * FIELD_GET() - extract a bitfield element * @_mask: shifted mask defining the field's length and position * @_reg: value of entire bitfield * * FIELD_GET() extracts the field specified by @_mask from the * bitfield passed in as @_reg by masking and shifting it down. */ #define FIELD_GET(_mask, _reg) \ ({ \ __BF_FIELD_CHECK_REG(_mask, _reg, "FIELD_GET: "); \ __FIELD_GET(_mask, _reg, "FIELD_GET: "); \ }) /** * FIELD_MODIFY() - modify a bitfield element * @_mask: shifted mask defining the field's length and position * @_reg_p: pointer to the memory that should be updated * @_val: value to store in the bitfield * * FIELD_MODIFY() modifies the set of bits in @_reg_p specified by @_mask, * by replacing them with the bitfield value passed in as @_val. 
*/ #define FIELD_MODIFY(_mask, _reg_p, _val) \ ({ \ typecheck_pointer(_reg_p); \ __BF_FIELD_CHECK(_mask, *(_reg_p), _val, "FIELD_MODIFY: "); \ *(_reg_p) &= ~(_mask); \ *(_reg_p) |= (((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask)); \ }) extern void __compiletime_error("value doesn't fit into mask") __field_overflow(void); extern void __compiletime_error("bad bitfield mask") __bad_mask(void); static __always_inline u64 field_multiplier(u64 field) { if ((field | (field - 1)) & ((field | (field - 1)) + 1)) __bad_mask(); return field & -field; } static __always_inline u64 field_mask(u64 field) { return field / field_multiplier(field); } #define field_max(field) ((typeof(field))field_mask(field)) #define ____MAKE_OP(type,base,to,from) \ static __always_inline __##type __must_check type##_encode_bits(base v, base field) \ { \ if (__builtin_constant_p(v) && (v & ~field_mask(field))) \ __field_overflow(); \ return to((v & field_mask(field)) * field_multiplier(field)); \ } \ static __always_inline __##type __must_check type##_replace_bits(__##type old, \ base val, base field) \ { \ return (old & ~to(field)) | type##_encode_bits(val, field); \ } \ static __always_inline void type##p_replace_bits(__##type *p, \ base val, base field) \ { \ *p = (*p & ~to(field)) | type##_encode_bits(val, field); \ } \ static __always_inline base __must_check type##_get_bits(__##type v, base field) \ { \ return (from(v) & field)/field_multiplier(field); \ } #define __MAKE_OP(size) \ ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \ ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \ ____MAKE_OP(u##size,u##size,,) ____MAKE_OP(u8,u8,,) __MAKE_OP(16) __MAKE_OP(32) __MAKE_OP(64) #undef __MAKE_OP #undef ____MAKE_OP #define __field_prep(mask, val) \ ({ \ auto __mask = (mask); \ typeof(__mask) __val = (val); \ unsigned int __shift = BITS_PER_TYPE(__mask) <= 32 ? 
\ __ffs(__mask) : __ffs64(__mask); \ (__val << __shift) & __mask; \ }) #define __field_get(mask, reg) \ ({ \ auto __mask = (mask); \ typeof(__mask) __reg = (reg); \ unsigned int __shift = BITS_PER_TYPE(__mask) <= 32 ? \ __ffs(__mask) : __ffs64(__mask); \ (__reg & __mask) >> __shift; \ }) /** * field_prep() - prepare a bitfield element * @mask: shifted mask defining the field's length and position, must be * non-zero * @val: value to put in the field * * Return: field value masked and shifted to its final destination * * field_prep() masks and shifts up the value. The result should be * combined with other fields of the bitfield using logical OR. * Unlike FIELD_PREP(), @mask is not limited to a compile-time constant. * Typical usage patterns are a value stored in a table, or calculated by * shifting a constant by a variable number of bits. * If you want to ensure that @mask is a compile-time constant, please use * FIELD_PREP() directly instead. */ #define field_prep(mask, val) \ (__builtin_constant_p(mask) ? __FIELD_PREP(mask, val, "field_prep: ") \ : __field_prep(mask, val)) /** * field_get() - extract a bitfield element * @mask: shifted mask defining the field's length and position, must be * non-zero * @reg: value of entire bitfield * * Return: extracted field value * * field_get() extracts the field specified by @mask from the * bitfield passed in as @reg by masking and shifting it down. * Unlike FIELD_GET(), @mask is not limited to a compile-time constant. * Typical usage patterns are a value stored in a table, or calculated by * shifting a constant by a variable number of bits. * If you want to ensure that @mask is a compile-time constant, please use * FIELD_GET() directly instead. */ #define field_get(mask, reg) \ (__builtin_constant_p(mask) ? __FIELD_GET(mask, reg, "field_get: ") \ : __field_get(mask, reg)) #endif
150 150 150 150 150 150 150 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 IBM Corporation * * Author: Cedric Le Goater <clg@fr.ibm.com> */ #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> #include <linux/sysctl.h> #include <linux/stat.h> #include <linux/capability.h> #include <linux/slab.h> #include <linux/cred.h> static int msg_max_limit_min = MIN_MSGMAX; static int msg_max_limit_max = HARD_MSGMAX; static int msg_maxsize_limit_min = MIN_MSGSIZEMAX; static int msg_maxsize_limit_max = HARD_MSGSIZEMAX; static const struct ctl_table mq_sysctls[] = { { .procname = "queues_max", .data = &init_ipc_ns.mq_queues_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "msg_max", .data = &init_ipc_ns.mq_msg_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &msg_max_limit_min, .extra2 = &msg_max_limit_max, }, { .procname = "msgsize_max", .data = &init_ipc_ns.mq_msgsize_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &msg_maxsize_limit_min, .extra2 = &msg_maxsize_limit_max, }, { .procname = "msg_default", .data = &init_ipc_ns.mq_msg_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &msg_max_limit_min, .extra2 = &msg_max_limit_max, }, { .procname = "msgsize_default", .data = &init_ipc_ns.mq_msgsize_default, .maxlen = sizeof(int), .mode = 
0644, .proc_handler = proc_dointvec_minmax, .extra1 = &msg_maxsize_limit_min, .extra2 = &msg_maxsize_limit_max, }, }; static struct ctl_table_set *set_lookup(struct ctl_table_root *root) { return &current->nsproxy->ipc_ns->mq_set; } static int set_is_seen(struct ctl_table_set *set) { return &current->nsproxy->ipc_ns->mq_set == set; } static void mq_set_ownership(struct ctl_table_header *head, kuid_t *uid, kgid_t *gid) { struct ipc_namespace *ns = container_of(head->set, struct ipc_namespace, mq_set); kuid_t ns_root_uid = make_kuid(ns->user_ns, 0); kgid_t ns_root_gid = make_kgid(ns->user_ns, 0); *uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID; *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID; } static int mq_permissions(struct ctl_table_header *head, const struct ctl_table *table) { int mode = table->mode; kuid_t ns_root_uid; kgid_t ns_root_gid; mq_set_ownership(head, &ns_root_uid, &ns_root_gid); if (uid_eq(current_euid(), ns_root_uid)) mode >>= 6; else if (in_egroup_p(ns_root_gid)) mode >>= 3; mode &= 7; return (mode << 6) | (mode << 3) | mode; } static struct ctl_table_root set_root = { .lookup = set_lookup, .permissions = mq_permissions, .set_ownership = mq_set_ownership, }; bool setup_mq_sysctls(struct ipc_namespace *ns) { struct ctl_table *tbl; setup_sysctl_set(&ns->mq_set, &set_root, set_is_seen); tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL); if (tbl) { int i; for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) { if (tbl[i].data == &init_ipc_ns.mq_queues_max) tbl[i].data = &ns->mq_queues_max; else if (tbl[i].data == &init_ipc_ns.mq_msg_max) tbl[i].data = &ns->mq_msg_max; else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max) tbl[i].data = &ns->mq_msgsize_max; else if (tbl[i].data == &init_ipc_ns.mq_msg_default) tbl[i].data = &ns->mq_msg_default; else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default) tbl[i].data = &ns->mq_msgsize_default; else tbl[i].data = NULL; } ns->mq_sysctls = __register_sysctl_table(&ns->mq_set, 
"fs/mqueue", tbl, ARRAY_SIZE(mq_sysctls)); } if (!ns->mq_sysctls) { kfree(tbl); retire_sysctl_set(&ns->mq_set); return false; } return true; } void retire_mq_sysctls(struct ipc_namespace *ns) { const struct ctl_table *tbl; tbl = ns->mq_sysctls->ctl_table_arg; unregister_sysctl_table(ns->mq_sysctls); retire_sysctl_set(&ns->mq_set); kfree(tbl); }
236 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the ICMP protocol. * * Version: @(#)icmp.h 1.0.3 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_ICMP_H #define _LINUX_ICMP_H #include <linux/skbuff.h> #include <uapi/linux/icmp.h> #include <uapi/linux/errqueue.h> static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb) { return (struct icmphdr *)skb_transport_header(skb); } static inline bool icmp_is_err(int type) { switch (type) { case ICMP_DEST_UNREACH: case ICMP_SOURCE_QUENCH: case ICMP_REDIRECT: case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: return true; } return false; } void ip_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out, int thlen, int off); /* RFC 4884 */ #define ICMP_EXT_ORIG_DGRAM_MIN_LEN 128 #define ICMP_EXT_VERSION_2 2 /* ICMP Extension Object Classes */ #define ICMP_EXT_OBJ_CLASS_IIO 2 /* RFC 5837 */ /* Interface Information Object - RFC 5837 */ enum { ICMP_EXT_CTYPE_IIO_ROLE_IIF, }; #define ICMP_EXT_CTYPE_IIO_ROLE(ROLE) ((ROLE) << 6) #define ICMP_EXT_CTYPE_IIO_MTU BIT(0) #define ICMP_EXT_CTYPE_IIO_NAME BIT(1) #define ICMP_EXT_CTYPE_IIO_IPADDR BIT(2) #define ICMP_EXT_CTYPE_IIO_IFINDEX BIT(3) struct icmp_ext_iio_name_subobj { u8 len; char name[IFNAMSIZ]; }; enum { /* RFC 5837 - Incoming IP Interface Role */ ICMP_ERR_EXT_IIO_IIF, /* Add new constants above. Used by "icmp_errors_extension_mask" * sysctl. */ ICMP_ERR_EXT_COUNT, }; #endif /* _LINUX_ICMP_H */
30 81 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2005,2006,2007,2008 IBM Corporation * * Authors: * Reiner Sailer <sailer@watson.ibm.com> * Mimi Zohar <zohar@us.ibm.com> * * File: ima.h * internal Integrity Measurement Architecture (IMA) definitions */ #ifndef __LINUX_IMA_H #define __LINUX_IMA_H #include <linux/types.h> #include <linux/crypto.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/hash.h> #include <linux/tpm.h> #include <linux/audit.h> #include <crypto/hash_info.h> #include "../integrity.h" enum ima_show_type { IMA_SHOW_BINARY, IMA_SHOW_BINARY_NO_FIELD_LEN, IMA_SHOW_BINARY_OLD_STRING_FMT, IMA_SHOW_ASCII }; enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8, TPM_PCR10 = 10 }; /* digest size for IMA, fits SHA1 or MD5 */ #define IMA_DIGEST_SIZE SHA1_DIGEST_SIZE #define IMA_EVENT_NAME_LEN_MAX 255 #define IMA_HASH_BITS 10 #define IMA_MEASURE_HTABLE_SIZE (1 << IMA_HASH_BITS) #define IMA_TEMPLATE_FIELD_ID_MAX_LEN 16 #define IMA_TEMPLATE_NUM_FIELDS_MAX 15 #define IMA_TEMPLATE_IMA_NAME "ima" #define IMA_TEMPLATE_IMA_FMT "d|n" #define NR_BANKS(chip) ((chip != NULL) ? 
chip->nr_allocated_banks : 0) /* current content of the policy */ extern int ima_policy_flag; /* bitset of digests algorithms allowed in the setxattr hook */ extern atomic_t ima_setxattr_allowed_hash_algorithms; /* IMA hash algorithm description */ struct ima_algo_desc { struct crypto_shash *tfm; enum hash_algo algo; }; /* set during initialization */ extern int ima_hash_algo __ro_after_init; extern int ima_sha1_idx __ro_after_init; extern int ima_hash_algo_idx __ro_after_init; extern int ima_extra_slots __ro_after_init; extern struct ima_algo_desc *ima_algo_array __ro_after_init; extern int ima_appraise; extern struct tpm_chip *ima_tpm_chip; extern const char boot_aggregate_name[]; /* IMA event related data */ struct ima_event_data { struct ima_iint_cache *iint; struct file *file; const unsigned char *filename; struct evm_ima_xattr_data *xattr_value; int xattr_len; const struct modsig *modsig; const char *violation; const void *buf; int buf_len; }; /* IMA template field data definition */ struct ima_field_data { u8 *data; u32 len; }; /* IMA template field definition */ struct ima_template_field { const char field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN]; int (*field_init)(struct ima_event_data *event_data, struct ima_field_data *field_data); void (*field_show)(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data); }; /* IMA template descriptor definition */ struct ima_template_desc { struct list_head list; char *name; char *fmt; int num_fields; const struct ima_template_field **fields; }; struct ima_template_entry { int pcr; struct tpm_digest *digests; struct ima_template_desc *template_desc; /* template descriptor */ u32 template_data_len; struct ima_field_data template_data[]; /* template related data */ }; struct ima_queue_entry { struct hlist_node hnext; /* place in hash collision list */ struct list_head later; /* place in ima_measurements list */ struct ima_template_entry *entry; }; extern struct list_head ima_measurements; /* list of all 
measurements */ /* Some details preceding the binary serialized measurement list */ struct ima_kexec_hdr { u16 version; u16 _reserved0; u32 _reserved1; u64 buffer_size; u64 count; }; /* IMA iint action cache flags */ #define IMA_MEASURE 0x00000001 #define IMA_MEASURED 0x00000002 #define IMA_APPRAISE 0x00000004 #define IMA_APPRAISED 0x00000008 /*#define IMA_COLLECT 0x00000010 do not use this flag */ #define IMA_COLLECTED 0x00000020 #define IMA_AUDIT 0x00000040 #define IMA_AUDITED 0x00000080 #define IMA_HASH 0x00000100 #define IMA_HASHED 0x00000200 /* IMA iint policy rule cache flags */ #define IMA_NONACTION_FLAGS 0xff000000 #define IMA_DIGSIG_REQUIRED 0x01000000 #define IMA_PERMIT_DIRECTIO 0x02000000 #define IMA_NEW_FILE 0x04000000 #define IMA_FAIL_UNVERIFIABLE_SIGS 0x10000000 #define IMA_MODSIG_ALLOWED 0x20000000 #define IMA_CHECK_BLACKLIST 0x40000000 #define IMA_VERITY_REQUIRED 0x80000000 /* Exclude non-action flags which are not rule-specific. */ #define IMA_NONACTION_RULE_FLAGS (IMA_NONACTION_FLAGS & ~IMA_NEW_FILE) #define IMA_DO_MASK (IMA_MEASURE | IMA_APPRAISE | IMA_AUDIT | \ IMA_HASH | IMA_APPRAISE_SUBMASK) #define IMA_DONE_MASK (IMA_MEASURED | IMA_APPRAISED | IMA_AUDITED | \ IMA_HASHED | IMA_COLLECTED | \ IMA_APPRAISED_SUBMASK) /* IMA iint subaction appraise cache flags */ #define IMA_FILE_APPRAISE 0x00001000 #define IMA_FILE_APPRAISED 0x00002000 #define IMA_MMAP_APPRAISE 0x00004000 #define IMA_MMAP_APPRAISED 0x00008000 #define IMA_BPRM_APPRAISE 0x00010000 #define IMA_BPRM_APPRAISED 0x00020000 #define IMA_READ_APPRAISE 0x00040000 #define IMA_READ_APPRAISED 0x00080000 #define IMA_CREDS_APPRAISE 0x00100000 #define IMA_CREDS_APPRAISED 0x00200000 #define IMA_APPRAISE_SUBMASK (IMA_FILE_APPRAISE | IMA_MMAP_APPRAISE | \ IMA_BPRM_APPRAISE | IMA_READ_APPRAISE | \ IMA_CREDS_APPRAISE) #define IMA_APPRAISED_SUBMASK (IMA_FILE_APPRAISED | IMA_MMAP_APPRAISED | \ IMA_BPRM_APPRAISED | IMA_READ_APPRAISED | \ IMA_CREDS_APPRAISED) /* IMA iint cache atomic_flags */ #define 
IMA_CHANGE_XATTR 0 #define IMA_UPDATE_XATTR 1 #define IMA_CHANGE_ATTR 2 #define IMA_DIGSIG 3 #define IMA_MAY_EMIT_TOMTOU 4 #define IMA_EMITTED_OPENWRITERS 5 /* IMA integrity metadata associated with an inode */ struct ima_iint_cache { struct mutex mutex; /* protects: version, flags, digest */ struct integrity_inode_attributes real_inode; unsigned long flags; unsigned long measured_pcrs; unsigned long atomic_flags; enum integrity_status ima_file_status:4; enum integrity_status ima_mmap_status:4; enum integrity_status ima_bprm_status:4; enum integrity_status ima_read_status:4; enum integrity_status ima_creds_status:4; struct ima_digest_data *ima_hash; }; extern struct lsm_blob_sizes ima_blob_sizes; static inline struct ima_iint_cache * ima_inode_get_iint(const struct inode *inode) { struct ima_iint_cache **iint_sec; if (unlikely(!inode->i_security)) return NULL; iint_sec = inode->i_security + ima_blob_sizes.lbs_inode; return *iint_sec; } static inline void ima_inode_set_iint(const struct inode *inode, struct ima_iint_cache *iint) { struct ima_iint_cache **iint_sec; if (unlikely(!inode->i_security)) return; iint_sec = inode->i_security + ima_blob_sizes.lbs_inode; *iint_sec = iint; } struct ima_iint_cache *ima_iint_find(struct inode *inode); struct ima_iint_cache *ima_inode_get(struct inode *inode); void ima_inode_free_rcu(void *inode_security); void __init ima_iintcache_init(void); extern const int read_idmap[]; #ifdef CONFIG_HAVE_IMA_KEXEC void ima_load_kexec_buffer(void); #else static inline void ima_load_kexec_buffer(void) {} #endif /* CONFIG_HAVE_IMA_KEXEC */ #ifdef CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS void ima_post_key_create_or_update(struct key *keyring, struct key *key, const void *payload, size_t plen, unsigned long flags, bool create); #endif #ifdef CONFIG_IMA_KEXEC void ima_measure_kexec_event(const char *event_name); #else static inline void ima_measure_kexec_event(const char *event_name) {} #endif /* * The default binary_runtime_measurements list format is 
defined as the * platform native format. The canonical format is defined as little-endian. */ extern bool ima_canonical_fmt; /* Internal IMA function definitions */ int ima_init(void); int ima_fs_init(void); int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename); int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash); int ima_calc_buffer_hash(const void *buf, loff_t len, struct ima_digest_data *hash); int ima_calc_field_array_hash(struct ima_field_data *field_data, struct ima_template_entry *entry); int ima_calc_boot_aggregate(struct ima_digest_data *hash); void ima_add_violation(struct file *file, const unsigned char *filename, struct ima_iint_cache *iint, const char *op, const char *cause); int ima_init_crypto(void); void ima_putc(struct seq_file *m, void *data, int datalen); void ima_print_digest(struct seq_file *m, u8 *digest, u32 size); int template_desc_init_fields(const char *template_fmt, const struct ima_template_field ***fields, int *num_fields); struct ima_template_desc *ima_template_desc_current(void); struct ima_template_desc *ima_template_desc_buf(void); struct ima_template_desc *lookup_template_desc(const char *name); bool ima_template_has_modsig(const struct ima_template_desc *ima_template); int ima_restore_measurement_entry(struct ima_template_entry *entry); int ima_restore_measurement_list(loff_t bufsize, void *buf); int ima_measurements_show(struct seq_file *m, void *v); unsigned long ima_get_binary_runtime_size(void); int ima_init_template(void); void ima_init_template_list(void); int __init ima_init_digests(void); void __init ima_init_reboot_notifier(void); int ima_lsm_policy_change(struct notifier_block *nb, unsigned long event, void *lsm_data); /* * used to protect h_table and sha_table */ extern spinlock_t ima_queue_lock; struct ima_h_table { atomic_long_t len; /* number of stored measurements in the list */ atomic_long_t violations; 
struct hlist_head queue[IMA_MEASURE_HTABLE_SIZE]; }; extern struct ima_h_table ima_htable; static inline unsigned int ima_hash_key(u8 *digest) { /* there is no point in taking a hash of part of a digest */ return (digest[0] | digest[1] << 8) % IMA_MEASURE_HTABLE_SIZE; } #define __ima_hooks(hook) \ hook(NONE, none) \ hook(FILE_CHECK, file) \ hook(MMAP_CHECK, mmap) \ hook(MMAP_CHECK_REQPROT, mmap_reqprot) \ hook(BPRM_CHECK, bprm) \ hook(CREDS_CHECK, creds) \ hook(POST_SETATTR, post_setattr) \ hook(MODULE_CHECK, module) \ hook(FIRMWARE_CHECK, firmware) \ hook(KEXEC_KERNEL_CHECK, kexec_kernel) \ hook(KEXEC_INITRAMFS_CHECK, kexec_initramfs) \ hook(POLICY_CHECK, policy) \ hook(KEXEC_CMDLINE, kexec_cmdline) \ hook(KEY_CHECK, key) \ hook(CRITICAL_DATA, critical_data) \ hook(SETXATTR_CHECK, setxattr_check) \ hook(MAX_CHECK, none) #define __ima_hook_enumify(ENUM, str) ENUM, #define __ima_stringify(arg) (#arg) #define __ima_hook_measuring_stringify(ENUM, str) \ (__ima_stringify(measuring_ ##str)), enum ima_hooks { __ima_hooks(__ima_hook_enumify) }; static const char * const ima_hooks_measure_str[] = { __ima_hooks(__ima_hook_measuring_stringify) }; static inline const char *func_measure_str(enum ima_hooks func) { if (func >= MAX_CHECK) return ima_hooks_measure_str[NONE]; return ima_hooks_measure_str[func]; } extern const char *const func_tokens[]; struct modsig; #ifdef CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS /* * To track keys that need to be measured. 
*/ struct ima_key_entry { struct list_head list; void *payload; size_t payload_len; char *keyring_name; }; void ima_init_key_queue(void); bool ima_should_queue_key(void); bool ima_queue_key(struct key *keyring, const void *payload, size_t payload_len); void ima_process_queued_keys(void); #else static inline void ima_init_key_queue(void) {} static inline bool ima_should_queue_key(void) { return false; } static inline bool ima_queue_key(struct key *keyring, const void *payload, size_t payload_len) { return false; } static inline void ima_process_queued_keys(void) {} #endif /* CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS */ /* LIM API function definitions */ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos); int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func); int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file, void *buf, loff_t size, enum hash_algo algo, struct modsig *modsig); void ima_store_measurement(struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc); int process_buffer_measurement(struct mnt_idmap *idmap, struct inode *inode, const void *buf, int size, const char *eventname, enum ima_hooks func, int pcr, const char *func_data, bool buf_hash, u8 *digest, size_t digest_len); void ima_audit_measurement(struct ima_iint_cache *iint, const unsigned char *filename); int ima_alloc_init_template(struct ima_event_data *event_data, struct ima_template_entry **entry, struct ima_template_desc *template_desc); int ima_store_template(struct ima_template_entry *entry, int violation, struct inode *inode, const unsigned char *filename, int pcr); void ima_free_template_entry(struct 
ima_template_entry *entry); const char *ima_d_path(const struct path *path, char **pathbuf, char *filename); /* IMA policy related functions */ int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos); void ima_init_policy(void); void ima_update_policy(void); void ima_update_policy_flags(void); ssize_t ima_parse_add_rule(char *); void ima_delete_rules(void); int ima_check_policy(void); void *ima_policy_start(struct seq_file *m, loff_t *pos); void *ima_policy_next(struct seq_file *m, void *v, loff_t *pos); void ima_policy_stop(struct seq_file *m, void *v); int ima_policy_show(struct seq_file *m, void *v); /* Appraise integrity measurements */ #define IMA_APPRAISE_ENFORCE 0x01 #define IMA_APPRAISE_FIX 0x02 #define IMA_APPRAISE_LOG 0x04 #define IMA_APPRAISE_MODULES 0x08 #define IMA_APPRAISE_FIRMWARE 0x10 #define IMA_APPRAISE_POLICY 0x20 #define IMA_APPRAISE_KEXEC 0x40 #ifdef CONFIG_IMA_APPRAISE int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr); int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, bool bprm_is_check); int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func); void ima_update_xattr(struct ima_iint_cache *iint, struct file *file); enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func); enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len); int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len); void __init init_ima_appraise_lsm(const struct lsm_id *lsmid); #else static inline int 
ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { return 0; } static inline int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, bool bprm_is_check) { return INTEGRITY_UNKNOWN; } static inline int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { return 0; } static inline void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { } static inline enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { return INTEGRITY_UNKNOWN; } static inline enum hash_algo ima_get_hash_algo(struct evm_ima_xattr_data *xattr_value, int xattr_len) { return ima_hash_algo; } static inline int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { return 0; } static inline void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { } #endif /* CONFIG_IMA_APPRAISE */ #ifdef CONFIG_IMA_APPRAISE_MODSIG int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig); void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size); int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, const u8 **digest, u32 *digest_size); int ima_get_raw_modsig(const struct modsig *modsig, const void **data, u32 *data_len); void ima_free_modsig(struct modsig *modsig); #else static inline int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len, struct modsig **modsig) { return -EOPNOTSUPP; } static inline void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size) { } static inline int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo, const u8 **digest, u32 *digest_size) { return -EOPNOTSUPP; } static inline int ima_get_raw_modsig(const struct 
modsig *modsig, const void **data, u32 *data_len) { return -EOPNOTSUPP; } static inline void ima_free_modsig(struct modsig *modsig) { } #endif /* CONFIG_IMA_APPRAISE_MODSIG */ /* LSM based policy rules require audit */ #ifdef CONFIG_IMA_LSM_RULES #define ima_filter_rule_init security_audit_rule_init #define ima_filter_rule_free security_audit_rule_free #define ima_filter_rule_match security_audit_rule_match #else static inline int ima_filter_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, gfp_t gfp) { return -EINVAL; } static inline void ima_filter_rule_free(void *lsmrule) { } static inline int ima_filter_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *lsmrule) { return -EINVAL; } #endif /* CONFIG_IMA_LSM_RULES */ #ifdef CONFIG_IMA_READ_POLICY #define POLICY_FILE_FLAGS (S_IWUSR | S_IRUSR) #else #define POLICY_FILE_FLAGS S_IWUSR #endif /* CONFIG_IMA_READ_POLICY */ #endif /* __LINUX_IMA_H */
5 2 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2011, 2012 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_ipv6/ip6t_NPT.h> #include <linux/netfilter/x_tables.h> static int ip6t_npt_checkentry(const struct xt_tgchk_param *par) { struct ip6t_npt_tginfo *npt = par->targinfo; struct in6_addr pfx; __wsum src_sum, dst_sum; if (npt->src_pfx_len > 64 || npt->dst_pfx_len > 64) return -EINVAL; /* Ensure that LSB of prefix is zero */ ipv6_addr_prefix(&pfx, &npt->src_pfx.in6, npt->src_pfx_len); if (!ipv6_addr_equal(&pfx, &npt->src_pfx.in6)) return -EINVAL; ipv6_addr_prefix(&pfx, &npt->dst_pfx.in6, npt->dst_pfx_len); if (!ipv6_addr_equal(&pfx, &npt->dst_pfx.in6)) return -EINVAL; src_sum = csum_partial(&npt->src_pfx.in6, sizeof(npt->src_pfx.in6), 0); dst_sum = csum_partial(&npt->dst_pfx.in6, sizeof(npt->dst_pfx.in6), 0); npt->adjustment = ~csum_fold(csum_sub(src_sum, dst_sum)); return 0; } static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt, struct in6_addr *addr) { unsigned int pfx_len; unsigned int i, idx; __be32 mask; __sum16 sum; pfx_len = max(npt->src_pfx_len, npt->dst_pfx_len); for (i = 0; i < pfx_len; i += 32) { if 
(pfx_len - i >= 32) mask = 0; else mask = htonl((1 << (i - pfx_len + 32)) - 1); idx = i / 32; addr->s6_addr32[idx] &= mask; addr->s6_addr32[idx] |= ~mask & npt->dst_pfx.in6.s6_addr32[idx]; } if (pfx_len <= 48) idx = 3; else { for (idx = 4; idx < ARRAY_SIZE(addr->s6_addr16); idx++) { if ((__force __sum16)addr->s6_addr16[idx] != CSUM_MANGLED_0) break; } if (idx == ARRAY_SIZE(addr->s6_addr16)) return false; } sum = ~csum_fold(csum_add(csum_unfold((__force __sum16)addr->s6_addr16[idx]), csum_unfold(npt->adjustment))); if (sum == CSUM_MANGLED_0) sum = 0; *(__force __sum16 *)&addr->s6_addr16[idx] = sum; return true; } static struct ipv6hdr *icmpv6_bounced_ipv6hdr(struct sk_buff *skb, struct ipv6hdr *_bounced_hdr) { if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) return NULL; if (!icmpv6_is_err(icmp6_hdr(skb)->icmp6_type)) return NULL; return skb_header_pointer(skb, skb_transport_offset(skb) + sizeof(struct icmp6hdr), sizeof(struct ipv6hdr), _bounced_hdr); } static unsigned int ip6t_snpt_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_npt_tginfo *npt = par->targinfo; struct ipv6hdr _bounced_hdr; struct ipv6hdr *bounced_hdr; struct in6_addr bounced_pfx; if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->saddr)) { icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, saddr)); return NF_DROP; } /* rewrite dst addr of bounced packet which was sent to dst range */ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr); if (bounced_hdr) { ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->daddr, npt->src_pfx_len); if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0) ip6t_npt_map_pfx(npt, &bounced_hdr->daddr); } return XT_CONTINUE; } static unsigned int ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_npt_tginfo *npt = par->targinfo; struct ipv6hdr _bounced_hdr; struct ipv6hdr *bounced_hdr; struct in6_addr bounced_pfx; if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->daddr)) { icmpv6_send(skb, 
ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, daddr)); return NF_DROP; } /* rewrite src addr of bounced packet which was sent from dst range */ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr); if (bounced_hdr) { ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->saddr, npt->src_pfx_len); if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0) ip6t_npt_map_pfx(npt, &bounced_hdr->saddr); } return XT_CONTINUE; } static struct xt_target ip6t_npt_target_reg[] __read_mostly = { { .name = "SNPT", .table = "mangle", .target = ip6t_snpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), .usersize = offsetof(struct ip6t_npt_tginfo, adjustment), .checkentry = ip6t_npt_checkentry, .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_POST_ROUTING), .me = THIS_MODULE, }, { .name = "DNPT", .table = "mangle", .target = ip6t_dnpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), .usersize = offsetof(struct ip6t_npt_tginfo, adjustment), .checkentry = ip6t_npt_checkentry, .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, }; static int __init ip6t_npt_init(void) { return xt_register_targets(ip6t_npt_target_reg, ARRAY_SIZE(ip6t_npt_target_reg)); } static void __exit ip6t_npt_exit(void) { xt_unregister_targets(ip6t_npt_target_reg, ARRAY_SIZE(ip6t_npt_target_reg)); } module_init(ip6t_npt_init); module_exit(ip6t_npt_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv6-to-IPv6 Network Prefix Translation (RFC 6296)"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS("ip6t_SNPT"); MODULE_ALIAS("ip6t_DNPT");
2 2 62 54 12 30 2 28 12 9 3 3 9 74 2 2 62 68 2 4 3 59 63 156 2 157 13 1 12 7 3 189 1 17 171 13 158 1 15 41 1 34 6 22 18 6 34 22 18 18 5 1 1 3 67 130 5 556 561 14 2 6 2 2 10 2 30 1 4 96 1 93 7 94 465 70 36 110 24 397 508 403 345 22 34 13 397 160 345 322 400 504 433 431 432 67 41 466 463 465 12 1 12 7 496 17 482 482 461 27 484 481 233 149 83 2 84 63 20 92 91 82 12 91 92 129 93 45 21 13 31 1 113 3 6 88 27 4 47 96 13 10 96 19 96 95 13 4 93 8 90 21 73 112 504 507 15 15 15 507 2 499 507 504 16 428 65 431 5 430 67 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 
399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 
899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * af_alg: User-space algorithm interface
 *
 * This file provides the user-space API for algorithms.
 *
 * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <linux/atomic.h>
#include <crypto/if_alg.h>
#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/key.h>
#include <linux/key-type.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/security.h>
#include <linux/string.h>
#include <keys/user-type.h>
#include <keys/trusted-type.h>
#include <keys/encrypted-type.h>

/* One node of the alg_types list; wraps a registered af_alg_type. */
struct alg_type_list {
	const struct af_alg_type *type;
	struct list_head list;
};

static struct proto alg_proto = {
	.name = "ALG",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct alg_sock),
};

/* Registered types; every walk or change of the list holds alg_types_sem. */
static LIST_HEAD(alg_types);
static DECLARE_RWSEM(alg_types_sem);

/*
 * Find the registered type called @name and take a reference on its owning
 * module.  Returns the type, or ERR_PTR(-ENOENT) when no type has that name
 * or the module reference could not be taken.
 */
static const struct af_alg_type *alg_get_type(const char *name)
{
	const struct af_alg_type *type = ERR_PTR(-ENOENT);
	struct alg_type_list *node;

	down_read(&alg_types_sem);
	list_for_each_entry(node, &alg_types, list) {
		if (strcmp(node->type->name, name))
			continue;

		/* first name match decides, even if the module is going away */
		if (try_module_get(node->type->owner))
			type = node->type;
		break;
	}
	up_read(&alg_types_sem);

	return type;
}

/*
 * Add @type to the list of known types and set the owner of its proto_ops
 * (and of ops_nokey, when provided) to this module.
 *
 * Return: 0 on success, -EEXIST if a type with the same name is already
 * registered, -ENOMEM if the list node cannot be allocated.
 */
int af_alg_register_type(const struct af_alg_type *type)
{
	struct alg_type_list *node;
	int err = -EEXIST;

	down_write(&alg_types_sem);
	list_for_each_entry(node, &alg_types, list) {
		if (!strcmp(node->type->name, type->name))
			goto unlock;
	}

	node = kmalloc_obj(*node);
	err = -ENOMEM;
	if (!node)
		goto unlock;

	type->ops->owner = THIS_MODULE;
	if (type->ops_nokey)
		type->ops_nokey->owner = THIS_MODULE;
	node->type = type;
	list_add(&node->list, &alg_types);
	err = 0;

unlock:
	up_write(&alg_types_sem);

	return err;
}
EXPORT_SYMBOL_GPL(af_alg_register_type);

/*
 * Remove the first registered type whose name equals @type->name and free
 * its list node.
 *
 * Return: 0 on success, -ENOENT if no such type is registered.
 */
int af_alg_unregister_type(const struct af_alg_type *type)
{
	struct alg_type_list *node;
	int err = -ENOENT;

	down_write(&alg_types_sem);
	list_for_each_entry(node, &alg_types, list) {
		if (strcmp(node->type->name, type->name))
			continue;

		list_del(&node->list);
		kfree(node);
		err = 0;
		break;
	}
	up_write(&alg_types_sem);

	return err;
}
EXPORT_SYMBOL_GPL(af_alg_unregister_type);

/*
 * Release a type's private context and drop the module reference taken by
 * alg_get_type().  A NULL @type is a no-op.
 */
static void alg_do_release(const struct af_alg_type *type, void *private)
{
	if (!type)
		return;

	type->release(private);
	module_put(type->owner);
}

/* Socket release: drop the socket's reference on its sock, if any. */
int af_alg_release(struct socket *sock)
{
	if (sock->sk) {
		sock_put(sock->sk);
		sock->sk = NULL;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(af_alg_release);

/*
 * Drop the references child socket @sk holds on its parent: the parent's
 * nokey_refcnt is decremented when the child's own nokey_refcnt is non-zero,
 * then the parent's refcnt is decremented and the parent sock is put when
 * that count reaches zero.
 */
void af_alg_release_parent(struct sock *sk)
{
	struct alg_sock *ask = alg_sk(sk);
	unsigned int nokey = atomic_read(&ask->nokey_refcnt);

	/* from here on sk/ask refer to the parent */
	sk = ask->parent;
	ask = alg_sk(sk);

	if (nokey)
		atomic_dec(&ask->nokey_refcnt);

	if (atomic_dec_and_test(&ask->refcnt))
		sock_put(sk);
}
EXPORT_SYMBOL_GPL(af_alg_release_parent);

/*
 * bind(): attach an algorithm type and a bound algorithm context to the
 * socket.
 *
 * The address names the type (salg_type) and the algorithm (salg_name); the
 * type is looked up, requesting module "algif-<type>" once if it is not yet
 * registered, and its bind() callback creates the private context.
 *
 * Return: 0 on success; -EINVAL for a connected socket, a too-short address
 * or disallowed feature/mask bits; -EBUSY if the socket has a non-zero
 * refcnt; otherwise the error from the type lookup or the type's bind().
 */
static int alg_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
{
	const u32 allowed = CRYPTO_ALG_KERN_DRIVER_ONLY;
	struct sock *sk = sock->sk;
	struct alg_sock *ask = alg_sk(sk);
	struct sockaddr_alg_new *sa = (void *)uaddr;
	const struct af_alg_type *type;
	void *private;
	int err;

	if (sock->state == SS_CONNECTED)
		return -EINVAL;

	BUILD_BUG_ON(offsetof(struct sockaddr_alg_new, salg_name) !=
		     offsetof(struct sockaddr_alg, salg_name));
	BUILD_BUG_ON(offsetof(struct sockaddr_alg, salg_name) != sizeof(*sa));

	/* need the fixed part plus at least one byte of algorithm name */
	if (addr_len < sizeof(*sa) + 1)
		return -EINVAL;

	/* If caller uses non-allowed flag, return error. */
	if ((sa->salg_feat & ~allowed) || (sa->salg_mask & ~allowed))
		return -EINVAL;

	/* force NUL termination of both user-supplied strings */
	sa->salg_type[sizeof(sa->salg_type) - 1] = 0;
	sa->salg_name[addr_len - sizeof(*sa) - 1] = 0;

	type = alg_get_type(sa->salg_type);
	if (PTR_ERR(type) == -ENOENT) {
		request_module("algif-%s", sa->salg_type);
		type = alg_get_type(sa->salg_type);
	}

	if (IS_ERR(type))
		return PTR_ERR(type);

	private = type->bind(sa->salg_name, sa->salg_feat, sa->salg_mask);
	if (IS_ERR(private)) {
		module_put(type->owner);
		return PTR_ERR(private);
	}

	err = -EBUSY;
	lock_sock(sk);
	if (atomic_read(&ask->refcnt))
		goto unlock;

	/*
	 * Install the new binding; after the swap, type/private hold the
	 * previous binding (if any), which is released below.
	 */
	swap(ask->type, type);
	swap(ask->private, private);

	err = 0;

unlock:
	release_sock(sk);

	/* releases the old binding on success, the new one on -EBUSY */
	alg_do_release(type, private);

	return err;
}

/*
 * Copy a @keylen byte key from @ukey into a socket-accounted buffer and pass
 * it to the type's setkey().  The buffer is zeroed and freed before
 * returning.
 *
 * Return: -ENOMEM, -EFAULT, or the result of type->setkey().
 */
static int alg_setkey(struct sock *sk, sockptr_t ukey, unsigned int keylen)
{
	struct alg_sock *ask = alg_sk(sk);
	const struct af_alg_type *type = ask->type;
	u8 *key;
	int err;

	key = sock_kmalloc(sk, keylen, GFP_KERNEL);
	if (!key)
		return -ENOMEM;

	err = -EFAULT;
	if (copy_from_sockptr(key, ukey, keylen))
		goto out;

	err = type->setkey(ask->private, key, keylen);

out:
	sock_kzfree_s(sk, key, keylen);

	return err;
}

#ifdef CONFIG_KEYS

/*
 * Return the payload data of a "user"/"logon" key and store its length in
 * @datalen, or ERR_PTR(-EKEYREVOKED) when the key has no usable payload.
 */
static const u8 *key_data_ptr_user(const struct key *key,
				   unsigned int *datalen)
{
	const struct user_key_payload *ukp;

	ukp = user_key_payload_locked(key);
	if (IS_ERR_OR_NULL(ukp))
		return ERR_PTR(-EKEYREVOKED);

	*datalen = key->datalen;

	return ukp->data;
}

/*
 * Return the decrypted data of an "encrypted" key and store its length in
 * @datalen, or ERR_PTR(-EKEYREVOKED) when the key has no usable payload.
 */
static const u8 *key_data_ptr_encrypted(const struct key *key,
					unsigned int *datalen)
{
	const struct encrypted_key_payload *ekp;

	ekp = dereference_key_locked(key);
	if (IS_ERR_OR_NULL(ekp))
		return ERR_PTR(-EKEYREVOKED);

	*datalen = ekp->decrypted_datalen;

	return ekp->decrypted_data;
}

/*
 * Return the key material of a "trusted" key and store its length in
 * @datalen, or ERR_PTR(-EKEYREVOKED) when the key has no usable payload.
 */
static const u8 *key_data_ptr_trusted(const struct key *key,
				      unsigned int *datalen)
{
	const struct trusted_key_payload *tkp;

	tkp = dereference_key_locked(key);
	if (IS_ERR_OR_NULL(tkp))
		return ERR_PTR(-EKEYREVOKED);

	*datalen = tkp->key_len;

	return tkp->key;
}

/*
 * Resolve @serial to a key, requiring KEY_NEED_SEARCH permission.  Returns
 * the key or an ERR_PTR; the caller releases a returned key with key_put().
 */
static struct key *lookup_key(key_serial_t serial)
{
	key_ref_t key_ref;

	key_ref = lookup_user_key(serial, 0, KEY_NEED_SEARCH);
	if (IS_ERR(key_ref))
		return ERR_CAST(key_ref);

	return key_ref_to_ptr(key_ref);
}

/*
 * Set the algorithm key from a keyring key identified by serial number.
 *
 * @optval must hold exactly one key_serial_t.  Keys of type "user", "logon",
 * "encrypted" and "trusted" are accepted (the latter two only when their
 * support is reachable); the payload is copied under the key's semaphore
 * into a socket-accounted buffer, handed to the type's setkey(), then zeroed
 * and freed.
 *
 * Return: -EINVAL for a wrong @optlen, -EFAULT if the serial cannot be
 * copied, the lookup error, -ENOPROTOOPT for an unsupported key type,
 * -EKEYREVOKED for a key without payload, -ENOMEM, or the result of
 * type->setkey().
 */
static int alg_setkey_by_key_serial(struct alg_sock *ask, sockptr_t optval,
				    unsigned int optlen)
{
	const struct af_alg_type *type = ask->type;
	u8 *key_data = NULL;
	unsigned int key_datalen;
	key_serial_t serial;
	struct key *key;
	const u8 *ret;
	int err;

	if (optlen != sizeof(serial))
		return -EINVAL;

	if (copy_from_sockptr(&serial, optval, optlen))
		return -EFAULT;

	key = lookup_key(serial);
	if (IS_ERR(key))
		return PTR_ERR(key);

	down_read(&key->sem);

	/* default for key types not handled below */
	ret = ERR_PTR(-ENOPROTOOPT);
	if (!strcmp(key->type->name, "user") ||
	    !strcmp(key->type->name, "logon")) {
		ret = key_data_ptr_user(key, &key_datalen);
	} else if (IS_REACHABLE(CONFIG_ENCRYPTED_KEYS) &&
			   !strcmp(key->type->name, "encrypted")) {
		ret = key_data_ptr_encrypted(key, &key_datalen);
	} else if (IS_REACHABLE(CONFIG_TRUSTED_KEYS) &&
			   !strcmp(key->type->name, "trusted")) {
		ret = key_data_ptr_trusted(key, &key_datalen);
	}

	if (IS_ERR(ret)) {
		up_read(&key->sem);
		key_put(key);
		return PTR_ERR(ret);
	}

	key_data = sock_kmalloc(&ask->sk, key_datalen, GFP_KERNEL);
	if (!key_data) {
		up_read(&key->sem);
		key_put(key);
		return -ENOMEM;
	}

	/* copy while key->sem is still held, then let go of the key */
	memcpy(key_data, ret, key_datalen);

	up_read(&key->sem);
	key_put(key);

	err = type->setkey(ask->private, key_data, key_datalen);

	sock_kzfree_s(&ask->sk, key_data, key_datalen);

	return err;
}

#else

/* Without CONFIG_KEYS, setting a key by serial is not available. */
static inline int alg_setkey_by_key_serial(struct alg_sock *ask,
					   sockptr_t optval,
					   unsigned int optlen)
{
	return -ENOPROTOOPT;
}

#endif

/*
 * setsockopt() for SOL_ALG options (key, key by serial, AEAD authsize, DRBG
 * entropy), dispatched to the bound type's callbacks under the socket lock.
 *
 * Return: -EBUSY when refcnt differs from nokey_refcnt; -ENOPROTOOPT for a
 * wrong level, an unbound socket, an unknown option, a connected socket or
 * a type lacking the callback; otherwise the callback's result.
 */
static int alg_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct alg_sock *ask = alg_sk(sk);
	const struct af_alg_type *type;
	int err = -EBUSY;

	lock_sock(sk);
	if (atomic_read(&ask->refcnt) != atomic_read(&ask->nokey_refcnt))
		goto unlock;

	type = ask->type;

	/* err stays -ENOPROTOOPT for every early exit below */
	err = -ENOPROTOOPT;
	if (level != SOL_ALG || !type)
		goto unlock;

	switch (optname) {
	case ALG_SET_KEY:
	case ALG_SET_KEY_BY_KEY_SERIAL:
		if (sock->state == SS_CONNECTED)
			goto unlock;
		if (!type->setkey)
			goto unlock;

		if (optname == ALG_SET_KEY_BY_KEY_SERIAL)
			err = alg_setkey_by_key_serial(ask, optval, optlen);
		else
			err = alg_setkey(sk, optval, optlen);
		break;
	case ALG_SET_AEAD_AUTHSIZE:
		if (sock->state == SS_CONNECTED)
			goto unlock;
		if (!type->setauthsize)
			goto unlock;
		/* the authsize is carried in optlen; optval is not read */
		err = type->setauthsize(ask->private, optlen);
		break;
	case ALG_SET_DRBG_ENTROPY:
		if (sock->state == SS_CONNECTED)
			goto unlock;
		if (!type->setentropy)
			goto unlock;

		err = type->setentropy(ask->private, optval, optlen);
	}

unlock:
	release_sock(sk);

	return err;
}

/*
 * Create a child (operation) socket for the bound parent @sk.
 *
 * A new sock is allocated and attached to @newsock, then the type's accept()
 * initialises it.  If accept() reports -ENOKEY and the type provides
 * accept_nokey(), that is used instead and the child gets the ops_nokey
 * operations.  On success the child records its parent and type, and the
 * parent's refcnt (plus nokey_refcnt for a no-key child) is raised; the
 * first child also takes a hold on the parent sock.
 *
 * Return: 0 on success, -EINVAL if the parent is not bound, -ENOMEM, or the
 * error from the type's accept callback.
 */
int af_alg_accept(struct sock *sk, struct socket *newsock,
		  struct proto_accept_arg *arg)
{
	struct alg_sock *ask = alg_sk(sk);
	const struct af_alg_type *type;
	struct sock *sk2;
	unsigned int nokey;
	int err;

	lock_sock(sk);
	type = ask->type;

	err = -EINVAL;
	if (!type)
		goto unlock;

	sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, arg->kern);
	err = -ENOMEM;
	if (!sk2)
		goto unlock;

	sock_init_data(newsock, sk2);
	security_sock_graft(sk2, newsock);
	security_sk_clone(sk, sk2);

	/*
	 * newsock->ops assigned here to allow type->accept call to override
	 * them when required.
	 */
	newsock->ops = type->ops;
	err = type->accept(ask->private, sk2);

	nokey = err == -ENOKEY;
	if (nokey && type->accept_nokey)
		err = type->accept_nokey(ask->private, sk2);

	if (err)
		goto unlock;

	/* the first child pins the parent sock */
	if (atomic_inc_return_relaxed(&ask->refcnt) == 1)
		sock_hold(sk);
	if (nokey) {
		atomic_inc(&ask->nokey_refcnt);
		atomic_set(&alg_sk(sk2)->nokey_refcnt, 1);
	}
	alg_sk(sk2)->parent = sk;
	alg_sk(sk2)->type = type;

	newsock->state = SS_CONNECTED;

	if (nokey)
		newsock->ops = type->ops_nokey;

	err = 0;

unlock:
	release_sock(sk);

	return err;
}
EXPORT_SYMBOL_GPL(af_alg_accept);

/* proto_ops accept handler: forwards to af_alg_accept() on the parent sock. */
static int alg_accept(struct socket *sock, struct socket *newsock,
		      struct proto_accept_arg *arg)
{
	return af_alg_accept(sock->sk, newsock, arg);
}

/*
 * Operations of the parent socket: only bind, setsockopt, accept and release
 * are implemented; everything else uses the sock_no_*() stubs.
 */
static const struct proto_ops alg_proto_ops = {
	.family = PF_ALG,
	.owner = THIS_MODULE,

	.connect = sock_no_connect,
	.socketpair = sock_no_socketpair,
	.getname = sock_no_getname,
	.ioctl = sock_no_ioctl,
	.listen = sock_no_listen,
	.shutdown = sock_no_shutdown,
	.mmap = sock_no_mmap,
	.sendmsg = sock_no_sendmsg,
	.recvmsg = sock_no_recvmsg,

	.bind = alg_bind,
	.release = af_alg_release,
	.setsockopt = alg_setsockopt,
	.accept = alg_accept,
};

/* sk_destruct hook of the parent sock: release its type binding, if any. */
static void alg_sock_destruct(struct sock *sk)
{
	struct alg_sock *ask = alg_sk(sk);

	alg_do_release(ask->type, ask->private);
}

/*
 * socket(PF_ALG, ...) handler.  Only SOCK_SEQPACKET with protocol 0 is
 * accepted.
 *
 * Return: 0 on success, -ESOCKTNOSUPPORT, -EPROTONOSUPPORT or -ENOMEM.
 */
static int alg_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	int err;

	if (sock->type != SOCK_SEQPACKET)
		return -ESOCKTNOSUPPORT;
	if (protocol != 0)
		return -EPROTONOSUPPORT;

	err = -ENOMEM;
	sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto, kern);
	if (!sk)
		goto out;

	sock->ops = &alg_proto_ops;
	sock_init_data(sock, sk);

	sk->sk_destruct = alg_sock_destruct;

	return 0;
out:
	return err;
}

static const struct net_proto_family alg_family = {
	.family = PF_ALG,
	.create = alg_create,
	.owner = THIS_MODULE,
};

/*
 * Chain @sgl_new behind @sgl_prev: clear the end marker on the last used
 * entry of @sgl_prev and turn the entry after it into a chain link to
 * @sgl_new's table.
 */
static void af_alg_link_sg(struct af_alg_sgl *sgl_prev,
			   struct af_alg_sgl *sgl_new)
{
	sg_unmark_end(sgl_prev->sgt.sgl + sgl_prev->sgt.nents - 1);
	sg_chain(sgl_prev->sgt.sgl, sgl_prev->sgt.nents + 1, sgl_new->sgt.sgl);
}

/*
 * Release the scatterlist held by @sgl: unpin its pages when need_unpin is
 * set, free the table unless it is the embedded sgl->sgl array, and clear
 * the table pointer so a repeated call does nothing.
 */
void af_alg_free_sg(struct af_alg_sgl *sgl)
{
	int i;

	if (sgl->sgt.sgl) {
		if (sgl->need_unpin)
			for (i = 0; i < sgl->sgt.nents; i++)
				unpin_user_page(sg_page(&sgl->sgt.sgl[i]));
		if (sgl->sgt.sgl != sgl->sgl)
			kvfree(sgl->sgt.sgl);
		sgl->sgt.sgl = NULL;
	}
}
EXPORT_SYMBOL_GPL(af_alg_free_sg);

/*
 * Parse the SOL_ALG control messages of @msg into @con (IV, operation, AEAD
 * associated-data length).  Control messages of other levels are skipped.
 *
 * Return: 0 on success, -EINVAL for a malformed header, a payload shorter
 * than the option requires, or an unknown SOL_ALG message type.
 */
static int af_alg_cmsg_send(struct msghdr *msg, struct af_alg_control *con)
{
	struct cmsghdr *cmsg;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_ALG)
			continue;

		switch (cmsg->cmsg_type) {
		case ALG_SET_IV:
			/* first make sure the fixed header holding ivlen fits */
			if (cmsg->cmsg_len < CMSG_LEN(sizeof(*con->iv)))
				return -EINVAL;
			con->iv = (void *)CMSG_DATA(cmsg);
			/* then that the IV bytes it announces fit as well */
			if (cmsg->cmsg_len < CMSG_LEN(con->iv->ivlen +
						      sizeof(*con->iv)))
				return -EINVAL;
			break;

		case ALG_SET_OP:
			if (cmsg->cmsg_len < CMSG_LEN(sizeof(u32)))
				return -EINVAL;
			con->op = *(u32 *)CMSG_DATA(cmsg);
			break;

		case ALG_SET_AEAD_ASSOCLEN:
			if (cmsg->cmsg_len < CMSG_LEN(sizeof(u32)))
				return -EINVAL;
			con->aead_assoclen = *(u32 *)CMSG_DATA(cmsg);
			break;

		default:
			return -EINVAL;
		}
	}

	return 0;
}

/**
 * af_alg_alloc_tsgl - allocate the TX SGL
 *
 * A new table is allocated only when the list is empty or its last table
 * already holds MAX_SGL_ENTS entries; a new table is chained behind the
 * previous one.
 *
 * @sk: socket of connection to user space
 * Return: 0 upon success, < 0 upon error
 */
static int af_alg_alloc_tsgl(struct sock *sk)
{
	struct alg_sock *ask = alg_sk(sk);
	struct af_alg_ctx *ctx = ask->private;
	struct af_alg_tsgl *sgl;
	struct scatterlist *sg = NULL;

	/* sgl is only meaningful (and only used) when the list is non-empty */
	sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl, list);
	if (!list_empty(&ctx->tsgl_list))
		sg = sgl->sg;

	if (!sg || sgl->cur >= MAX_SGL_ENTS) {
		/* one extra entry is reserved for the chain link */
		sgl = sock_kmalloc(sk,
				   struct_size(sgl, sg, (MAX_SGL_ENTS + 1)),
				   GFP_KERNEL);
		if (!sgl)
			return -ENOMEM;

		sg_init_table(sgl->sg, MAX_SGL_ENTS + 1);
		sgl->cur = 0;

		if (sg)
			sg_chain(sg, MAX_SGL_ENTS + 1, sgl->sg);

		list_add_tail(&sgl->list, &ctx->tsgl_list);
	}

	return 0;
}

/**
 * af_alg_count_tsgl - Count number of TX SG entries
 *
 * The counting starts from the beginning of the SGL to @bytes. If
 * an @offset is provided, the counting of the SG entries starts at the @offset.
 *
 * @sk: socket of connection to user space
 * @bytes: Count the number of SG entries holding given number of bytes.
 * @offset: Start the counting of SG entries from the given offset.
 * Return: Number of TX SG entries found given the constraints
 */
unsigned int af_alg_count_tsgl(struct sock *sk, size_t bytes, size_t offset)
{
	const struct alg_sock *ask = alg_sk(sk);
	const struct af_alg_ctx *ctx = ask->private;
	const struct af_alg_tsgl *sgl;
	unsigned int i;
	unsigned int sgl_count = 0;

	if (!bytes)
		return 0;

	list_for_each_entry(sgl, &ctx->tsgl_list, list) {
		const struct scatterlist *sg = sgl->sg;

		for (i = 0; i < sgl->cur; i++) {
			size_t bytes_count;

			/* Skip offset */
			if (offset >= sg[i].length) {
				offset -= sg[i].length;
				/* skipped entries still consume @bytes */
				bytes -= sg[i].length;
				continue;
			}

			bytes_count = sg[i].length - offset;

			offset = 0;
			sgl_count++;

			/* If we have seen requested number of bytes, stop */
			if (bytes_count >= bytes)
				return sgl_count;

			bytes -= bytes_count;
		}
	}

	return sgl_count;
}
EXPORT_SYMBOL_GPL(af_alg_count_tsgl);

/**
 * af_alg_pull_tsgl - Release the specified buffers from TX SGL
 *
 * If @dst is non-null, reassign the pages to @dst. The caller must release
 * the pages. If @dst_offset is given only reassign the pages to @dst starting
 * at the @dst_offset (byte). The caller must ensure that @dst is large
 * enough (e.g. by using af_alg_count_tsgl with the same offset).
 *
 * @sk: socket of connection to user space
 * @used: Number of bytes to pull from TX SGL
 * @dst: If non-NULL, buffer is reassigned to dst SGL instead of releasing. The
 *	 caller must release the buffers in dst.
 * @dst_offset: Reassign the TX SGL from given offset. All buffers before
 *	        reaching the offset are released.
 */
void af_alg_pull_tsgl(struct sock *sk, size_t used, struct scatterlist *dst,
		      size_t dst_offset)
{
	struct alg_sock *ask = alg_sk(sk);
	struct af_alg_ctx *ctx = ask->private;
	struct af_alg_tsgl *sgl;
	struct scatterlist *sg;
	unsigned int i, j = 0;

	while (!list_empty(&ctx->tsgl_list)) {
		sgl = list_first_entry(&ctx->tsgl_list, struct af_alg_tsgl,
				       list);
		sg = sgl->sg;

		for (i = 0; i < sgl->cur; i++) {
			size_t plen = min_t(size_t, used, sg[i].length);
			struct page *page = sg_page(sg + i);

			/* entry already consumed by an earlier pull */
			if (!page)
				continue;

			/*
			 * Assumption: caller created af_alg_count_tsgl(len)
			 * SG entries in dst.
			 */
			if (dst) {
				if (dst_offset >= plen) {
					/* discard page before offset */
					dst_offset -= plen;
				} else {
					/* reassign page to dst after offset */
					get_page(page);
					sg_set_page(dst + j, page,
						    plen - dst_offset,
						    sg[i].offset + dst_offset);
					dst_offset = 0;
					j++;
				}
			}

			sg[i].length -= plen;
			sg[i].offset += plen;

			used -= plen;
			ctx->used -= plen;

			/* entry only partly consumed: @used is exhausted */
			if (sg[i].length)
				return;

			put_page(page);
			sg_assign_page(sg + i, NULL);
		}

		/* every entry of this table is consumed; drop the table */
		list_del(&sgl->list);
		sock_kfree_s(sk, sgl, struct_size(sgl, sg, MAX_SGL_ENTS + 1));
	}

	/* reached only once the whole TX SGL list has been drained */
	if (!ctx->used)
		ctx->merge = 0;
	ctx->init = ctx->more;
}
EXPORT_SYMBOL_GPL(af_alg_pull_tsgl);

/**
 * af_alg_free_areq_sgls - Release TX and RX SGLs of the request
 *
 * Every RX SGL on the request's list is uncharged from ctx->rcvused and
 * released; list nodes other than the embedded first_rsgl are freed. The
 * pages of the request's TX SGL are put and the TX SGL array is freed.
 *
 * @areq: Request holding the TX and RX SGL
 */
static void af_alg_free_areq_sgls(struct af_alg_async_req *areq)
{
	struct sock *sk = areq->sk;
	struct alg_sock *ask = alg_sk(sk);
	struct af_alg_ctx *ctx = ask->private;
	struct af_alg_rsgl *rsgl, *tmp;
	struct scatterlist *tsgl;
	struct scatterlist *sg;
	unsigned int i;

	list_for_each_entry_safe(rsgl, tmp, &areq->rsgl_list, list) {
		atomic_sub(rsgl->sg_num_bytes, &ctx->rcvused);
		af_alg_free_sg(&rsgl->sgl);
		list_del(&rsgl->list);
		/* first_rsgl is embedded in the request, not allocated */
		if (rsgl != &areq->first_rsgl)
			sock_kfree_s(sk, rsgl, sizeof(*rsgl));
	}

	tsgl = areq->tsgl;
	if (tsgl) {
		for_each_sg(tsgl, sg, areq->tsgl_entries, i) {
			if (!sg_page(sg))
				continue;
			put_page(sg_page(sg));
		}

		sock_kfree_s(sk, tsgl, areq->tsgl_entries * sizeof(*tsgl));
	}
}

/**
 *
af_alg_wait_for_wmem - wait for availability of writable memory * * @sk: socket of connection to user space * @flags: If MSG_DONTWAIT is set, then only report if function would sleep * Return: 0 when writable memory is available, < 0 upon error */ static int af_alg_wait_for_wmem(struct sock *sk, unsigned int flags) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int err = -ERESTARTSYS; long timeout; if (flags & MSG_DONTWAIT) return -EAGAIN; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); add_wait_queue(sk_sleep(sk), &wait); for (;;) { if (signal_pending(current)) break; timeout = MAX_SCHEDULE_TIMEOUT; if (sk_wait_event(sk, &timeout, af_alg_writable(sk), &wait)) { err = 0; break; } } remove_wait_queue(sk_sleep(sk), &wait); return err; } /** * af_alg_wmem_wakeup - wakeup caller when writable memory is available * * @sk: socket of connection to user space */ void af_alg_wmem_wakeup(struct sock *sk) { struct socket_wq *wq; if (!af_alg_writable(sk)) return; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup); /** * af_alg_wait_for_data - wait for availability of TX data * * @sk: socket of connection to user space * @flags: If MSG_DONTWAIT is set, then only report if function would sleep * @min: Set to minimum request size if partial requests are allowed. 
* Return: 0 when writable memory is available, < 0 upon error */ int af_alg_wait_for_data(struct sock *sk, unsigned flags, unsigned min) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; long timeout; int err = -ERESTARTSYS; if (flags & MSG_DONTWAIT) return -EAGAIN; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); add_wait_queue(sk_sleep(sk), &wait); for (;;) { if (signal_pending(current)) break; timeout = MAX_SCHEDULE_TIMEOUT; if (sk_wait_event(sk, &timeout, ctx->init && (!ctx->more || (min && ctx->used >= min)), &wait)) { err = 0; break; } } remove_wait_queue(sk_sleep(sk), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); return err; } EXPORT_SYMBOL_GPL(af_alg_wait_for_data); /** * af_alg_data_wakeup - wakeup caller when new data can be sent to kernel * * @sk: socket of connection to user space */ static void af_alg_data_wakeup(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; struct socket_wq *wq; if (!ctx->used) return; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } /** * af_alg_sendmsg - implementation of sendmsg system call handler * * The sendmsg system call handler obtains the user data and stores it * in ctx->tsgl_list. This implies allocation of the required numbers of * struct af_alg_tsgl. * * In addition, the ctx is filled with the information sent via CMSG. 
* * @sock: socket of connection to user space * @msg: message from user space * @size: size of message from user space * @ivsize: the size of the IV for the cipher operation to verify that the * user-space-provided IV has the right size * Return: the number of copied data upon success, < 0 upon error */ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, unsigned int ivsize) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; struct af_alg_tsgl *sgl; struct af_alg_control con = {}; long copied = 0; bool enc = false; bool init = false; int err = 0; if (msg->msg_controllen) { err = af_alg_cmsg_send(msg, &con); if (err) return err; init = true; switch (con.op) { case ALG_OP_ENCRYPT: enc = true; break; case ALG_OP_DECRYPT: enc = false; break; default: return -EINVAL; } if (con.iv && con.iv->ivlen != ivsize) return -EINVAL; } lock_sock(sk); if (ctx->write) { release_sock(sk); return -EBUSY; } ctx->write = true; if (ctx->init && !ctx->more) { if (ctx->used) { err = -EINVAL; goto unlock; } pr_info_once( "%s sent an empty control message without MSG_MORE.\n", current->comm); } ctx->init = true; if (init) { ctx->enc = enc; if (con.iv) memcpy(ctx->iv, con.iv->iv, ivsize); ctx->aead_assoclen = con.aead_assoclen; } while (size) { struct scatterlist *sg; size_t len = size; ssize_t plen; /* use the existing memory in an allocated page */ if (ctx->merge && !(msg->msg_flags & MSG_SPLICE_PAGES)) { sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl, list); sg = sgl->sg + sgl->cur - 1; len = min_t(size_t, len, PAGE_SIZE - sg->offset - sg->length); err = memcpy_from_msg(page_address(sg_page(sg)) + sg->offset + sg->length, msg, len); if (err) goto unlock; sg->length += len; ctx->merge = (sg->offset + sg->length) & (PAGE_SIZE - 1); ctx->used += len; copied += len; size -= len; continue; } ctx->merge = 0; if (!af_alg_writable(sk)) { err = af_alg_wait_for_wmem(sk, msg->msg_flags); if (err) goto unlock; } /* 
allocate a new page */ len = min_t(unsigned long, len, af_alg_sndbuf(sk)); err = af_alg_alloc_tsgl(sk); if (err) goto unlock; sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl, list); sg = sgl->sg; if (sgl->cur) sg_unmark_end(sg + sgl->cur - 1); if (msg->msg_flags & MSG_SPLICE_PAGES) { struct sg_table sgtable = { .sgl = sg, .nents = sgl->cur, .orig_nents = sgl->cur, }; plen = extract_iter_to_sg(&msg->msg_iter, len, &sgtable, MAX_SGL_ENTS - sgl->cur, 0); if (plen < 0) { err = plen; goto unlock; } for (; sgl->cur < sgtable.nents; sgl->cur++) get_page(sg_page(&sg[sgl->cur])); len -= plen; ctx->used += plen; copied += plen; size -= plen; } else { do { struct page *pg; unsigned int i = sgl->cur; plen = min_t(size_t, len, PAGE_SIZE); pg = alloc_page(GFP_KERNEL); if (!pg) { err = -ENOMEM; goto unlock; } sg_assign_page(sg + i, pg); err = memcpy_from_msg( page_address(sg_page(sg + i)), msg, plen); if (err) { __free_page(sg_page(sg + i)); sg_assign_page(sg + i, NULL); goto unlock; } sg[i].length = plen; len -= plen; ctx->used += plen; copied += plen; size -= plen; sgl->cur++; } while (len && sgl->cur < MAX_SGL_ENTS); ctx->merge = plen & (PAGE_SIZE - 1); } if (!size) sg_mark_end(sg + sgl->cur - 1); } err = 0; ctx->more = msg->msg_flags & MSG_MORE; unlock: af_alg_data_wakeup(sk); ctx->write = false; release_sock(sk); return copied ?: err; } EXPORT_SYMBOL_GPL(af_alg_sendmsg); /** * af_alg_free_resources - release resources required for crypto request * @areq: Request holding the TX and RX SGL */ void af_alg_free_resources(struct af_alg_async_req *areq) { struct sock *sk = areq->sk; struct af_alg_ctx *ctx; af_alg_free_areq_sgls(areq); sock_kfree_s(sk, areq, areq->areqlen); ctx = alg_sk(sk)->private; ctx->inflight = false; } EXPORT_SYMBOL_GPL(af_alg_free_resources); /** * af_alg_async_cb - AIO callback handler * @data: async request completion data * @err: if non-zero, error result to be returned via ki_complete(); * otherwise return the AIO output length via 
ki_complete(). * * This handler cleans up the struct af_alg_async_req upon completion of the * AIO operation. * * The number of bytes to be generated with the AIO operation must be set * in areq->outlen before the AIO callback handler is invoked. */ void af_alg_async_cb(void *data, int err) { struct af_alg_async_req *areq = data; struct sock *sk = areq->sk; struct kiocb *iocb = areq->iocb; unsigned int resultlen; /* Buffer size written by crypto operation. */ resultlen = areq->outlen; af_alg_free_resources(areq); sock_put(sk); iocb->ki_complete(iocb, err ? err : (int)resultlen); } EXPORT_SYMBOL_GPL(af_alg_async_cb); /** * af_alg_poll - poll system call handler * @file: file pointer * @sock: socket to poll * @wait: poll_table */ __poll_t af_alg_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; __poll_t mask; sock_poll_wait(file, sock, wait); mask = 0; if (!ctx->more || ctx->used) mask |= EPOLLIN | EPOLLRDNORM; if (af_alg_writable(sk)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; return mask; } EXPORT_SYMBOL_GPL(af_alg_poll); /** * af_alg_alloc_areq - allocate struct af_alg_async_req * * @sk: socket of connection to user space * @areqlen: size of struct af_alg_async_req + crypto_*_reqsize * Return: allocated data structure or ERR_PTR upon error */ struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk, unsigned int areqlen) { struct af_alg_ctx *ctx = alg_sk(sk)->private; struct af_alg_async_req *areq; /* Only one AIO request can be in flight. 
*/ if (ctx->inflight) return ERR_PTR(-EBUSY); areq = sock_kmalloc(sk, areqlen, GFP_KERNEL); if (unlikely(!areq)) return ERR_PTR(-ENOMEM); memset(areq, 0, areqlen); ctx->inflight = true; areq->areqlen = areqlen; areq->sk = sk; areq->first_rsgl.sgl.sgt.sgl = areq->first_rsgl.sgl.sgl; INIT_LIST_HEAD(&areq->rsgl_list); return areq; } EXPORT_SYMBOL_GPL(af_alg_alloc_areq); /** * af_alg_get_rsgl - create the RX SGL for the output data from the crypto * operation * * @sk: socket of connection to user space * @msg: user space message * @flags: flags used to invoke recvmsg with * @areq: instance of the cryptographic request that will hold the RX SGL * @maxsize: maximum number of bytes to be pulled from user space * @outlen: number of bytes in the RX SGL * Return: 0 on success, < 0 upon error */ int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags, struct af_alg_async_req *areq, size_t maxsize, size_t *outlen) { struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; size_t len = 0; while (maxsize > len && msg_data_left(msg)) { struct af_alg_rsgl *rsgl; ssize_t err; size_t seglen; /* limit the amount of readable buffers */ if (!af_alg_readable(sk)) break; seglen = min_t(size_t, (maxsize - len), msg_data_left(msg)); if (list_empty(&areq->rsgl_list)) { rsgl = &areq->first_rsgl; } else { rsgl = sock_kmalloc(sk, sizeof(*rsgl), GFP_KERNEL); if (unlikely(!rsgl)) return -ENOMEM; } rsgl->sgl.need_unpin = iov_iter_extract_will_pin(&msg->msg_iter); rsgl->sgl.sgt.sgl = rsgl->sgl.sgl; rsgl->sgl.sgt.nents = 0; rsgl->sgl.sgt.orig_nents = 0; list_add_tail(&rsgl->list, &areq->rsgl_list); sg_init_table(rsgl->sgl.sgt.sgl, ALG_MAX_PAGES); err = extract_iter_to_sg(&msg->msg_iter, seglen, &rsgl->sgl.sgt, ALG_MAX_PAGES, 0); if (err < 0) { rsgl->sg_num_bytes = 0; return err; } sg_mark_end(rsgl->sgl.sgt.sgl + rsgl->sgl.sgt.nents - 1); /* chain the new scatterlist with previous one */ if (areq->last_rsgl) af_alg_link_sg(&areq->last_rsgl->sgl, &rsgl->sgl); 
areq->last_rsgl = rsgl; len += err; atomic_add(err, &ctx->rcvused); rsgl->sg_num_bytes = err; } *outlen = len; return 0; } EXPORT_SYMBOL_GPL(af_alg_get_rsgl); static int __init af_alg_init(void) { int err = proto_register(&alg_proto, 0); if (err) goto out; err = sock_register(&alg_family); if (err != 0) goto out_unregister_proto; out: return err; out_unregister_proto: proto_unregister(&alg_proto); goto out; } static void __exit af_alg_exit(void) { sock_unregister(PF_ALG); proto_unregister(&alg_proto); } module_init(af_alg_init); module_exit(af_alg_exit); MODULE_DESCRIPTION("Crypto userspace interface"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(AF_ALG);
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 // SPDX-License-Identifier: GPL-2.0 /* * IEEE 802.15.4 PAN management * * Copyright (C) 2023 Qorvo US, Inc * Authors: * - David Girault <david.girault@qorvo.com> * - Miquel Raynal <miquel.raynal@bootlin.com> */ #include <linux/kernel.h> #include <net/cfg802154.h> #include <net/af_ieee802154.h> /* Checks whether a device address matches one from the PAN list. * This helper is meant to be used only during PAN management, when we expect * extended addresses to be used. */ static bool cfg802154_pan_device_is_matching(struct ieee802154_pan_device *pan_dev, struct ieee802154_addr *ext_dev) { if (!pan_dev || !ext_dev) return false; if (ext_dev->mode == IEEE802154_ADDR_SHORT) return false; return pan_dev->extended_addr == ext_dev->extended_addr; } bool cfg802154_device_is_associated(struct wpan_dev *wpan_dev) { bool is_assoc; mutex_lock(&wpan_dev->association_lock); is_assoc = !list_empty(&wpan_dev->children) || wpan_dev->parent; mutex_unlock(&wpan_dev->association_lock); return is_assoc; } bool cfg802154_device_is_parent(struct wpan_dev *wpan_dev, struct ieee802154_addr *target) { lockdep_assert_held(&wpan_dev->association_lock); return cfg802154_pan_device_is_matching(wpan_dev->parent, target); } EXPORT_SYMBOL_GPL(cfg802154_device_is_parent); struct ieee802154_pan_device * cfg802154_device_is_child(struct wpan_dev *wpan_dev, struct ieee802154_addr *target) { struct ieee802154_pan_device *child; lockdep_assert_held(&wpan_dev->association_lock); list_for_each_entry(child, &wpan_dev->children, node) if (cfg802154_pan_device_is_matching(child, target)) return child; return NULL; } EXPORT_SYMBOL_GPL(cfg802154_device_is_child); __le16 
cfg802154_get_free_short_addr(struct wpan_dev *wpan_dev) { struct ieee802154_pan_device *child; __le16 addr; lockdep_assert_held(&wpan_dev->association_lock); do { get_random_bytes(&addr, 2); if (addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST) || addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC)) continue; if (wpan_dev->short_addr == addr) continue; if (wpan_dev->parent && wpan_dev->parent->short_addr == addr) continue; list_for_each_entry(child, &wpan_dev->children, node) if (child->short_addr == addr) continue; break; } while (1); return addr; } EXPORT_SYMBOL_GPL(cfg802154_get_free_short_addr); unsigned int cfg802154_set_max_associations(struct wpan_dev *wpan_dev, unsigned int max) { unsigned int old_max; lockdep_assert_held(&wpan_dev->association_lock); old_max = wpan_dev->max_associations; wpan_dev->max_associations = max; return old_max; } EXPORT_SYMBOL_GPL(cfg802154_set_max_associations);
47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Runtime locking correctness validator
 *
 *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * see Documentation/locking/lockdep-design.rst for more details.
 */
#ifndef __LINUX_LOCKDEP_H
#define __LINUX_LOCKDEP_H

#include <linux/lockdep_types.h>
#include <linux/smp.h>
#include <asm/percpu.h>

struct task_struct;

#ifdef CONFIG_LOCKDEP

#include <linux/linkage.h>
#include <linux/list.h>
#include <linux/debug_locks.h>
#include <linux/stacktrace.h>

/* Copy a lockdep_map by value, then reset the copy's class cache. */
static inline void lockdep_copy_map(struct lockdep_map *to,
				    struct lockdep_map *from)
{
	int i;

	*to = *from;
	/*
	 * Since the class cache can be modified concurrently we could observe
	 * half pointers (64bit arch using 32bit copy insns). Therefore clear
	 * the caches and take the performance hit.
	 *
	 * XXX it doesn't work well with lockdep_set_class_and_subclass(), since
	 *     that relies on cache abuse.
	 */
	for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
		to->class_cache[i] = NULL;
}

/*
 * Every lock has a list of other locks that were taken after it.
 * We only grow the list, never remove from it:
 */
struct lock_list {
	struct list_head		entry;
	struct lock_class		*class;
	struct lock_class		*links_to;
	const struct lock_trace		*trace;
	u16				distance;
	/* bitmap of different dependencies from head to this */
	u8				dep;
	/* used by BFS to record whether "prev -> this" only has -(*R)-> */
	u8				only_xr;

	/*
	 * The parent field is used to implement breadth-first search, and the
	 * bit 0 is reused to indicate if the lock has been accessed in BFS.
	 */
	struct lock_list		*parent;
};

/**
 * struct lock_chain - lock dependency chain record
 *
 * @irq_context: the same as irq_context in held_lock below
 * @depth:       the number of held locks in this chain
 * @base:        the index in chain_hlocks for this chain
 * @entry:       the collided lock chains in lock_chain hash list
 * @chain_key:   the hash key of this lock_chain
 */
struct lock_chain {
	/* see BUILD_BUG_ON()s in add_chain_cache() */
	unsigned int			irq_context :  2,
					depth       :  6,
					base	    : 24;
	/* 4 byte hole */
	struct hlist_node		entry;
	u64				chain_key;
};

/*
 * Initialization, self-test and debugging-output methods:
 */
extern void lockdep_init(void);
extern void lockdep_reset(void);
extern void lockdep_reset_lock(struct lockdep_map *lock);
extern void lockdep_free_key_range(void *start, unsigned long size);
extern asmlinkage void lockdep_sys_exit(void);
extern void lockdep_set_selftest_task(struct task_struct *task);

extern void lockdep_init_task(struct task_struct *task);

/*
 * Split the recursion counter in two to readily detect 'off' vs recursion.
 */
#define LOCKDEP_RECURSION_BITS	16
#define LOCKDEP_OFF		(1U << LOCKDEP_RECURSION_BITS)
#define LOCKDEP_RECURSION_MASK	(LOCKDEP_OFF - 1)

/*
 * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due
 * to header dependencies.
 */

#define lockdep_off()					\
do {							\
	current->lockdep_recursion += LOCKDEP_OFF;	\
} while (0)

#define lockdep_on()					\
do {							\
	current->lockdep_recursion -= LOCKDEP_OFF;	\
} while (0)

extern void lockdep_register_key(struct lock_class_key *key);
extern void lockdep_unregister_key(struct lock_class_key *key);

/*
 * These methods are used by specific locking variants (spinlocks,
 * rwlocks, mutexes and rwsems) to pass init/acquire/release events
 * to lockdep:
 */

extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
	struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type);

/* lockdep_init_map_type() with the lock type fixed to LD_LOCK_NORMAL. */
static inline void
lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
		       struct lock_class_key *key, int subclass, u8 inner, u8 outer)
{
	lockdep_init_map_type(lock, name, key, subclass, inner, outer, LD_LOCK_NORMAL);
}

/* lockdep_init_map_waits() with the outer wait type fixed to LD_WAIT_INV. */
static inline void
lockdep_init_map_wait(struct lockdep_map *lock, const char *name,
		      struct lock_class_key *key, int subclass, u8 inner)
{
	lockdep_init_map_waits(lock, name, key, subclass, inner, LD_WAIT_INV);
}

/* lockdep_init_map_wait() with the inner wait type fixed to LD_WAIT_INV too. */
static inline void lockdep_init_map(struct lockdep_map *lock, const char *name,
			     struct lock_class_key *key, int subclass)
{
	lockdep_init_map_wait(lock, name, key, subclass, LD_WAIT_INV);
}

/*
 * Reinitialize a lock key - for cases where there is special locking or
 * special initialization of locks so that the validator gets the scope
 * of dependencies wrong: they are either too broad (they need a class-split)
 * or they are too narrow (they suffer from a false class-split):
 */
#define lockdep_set_class(lock, key)				\
	lockdep_init_map_type(&(lock)->dep_map, #key, key, 0,	\
			      (lock)->dep_map.wait_type_inner,	\
			      (lock)->dep_map.wait_type_outer,	\
			      (lock)->dep_map.lock_type)

#define lockdep_set_class_and_name(lock, key, name)		\
	lockdep_init_map_type(&(lock)->dep_map, name, key, 0,	\
			      (lock)->dep_map.wait_type_inner,	\
			      (lock)->dep_map.wait_type_outer,	\
			      (lock)->dep_map.lock_type)

#define lockdep_set_class_and_subclass(lock, key, sub)		\
	lockdep_init_map_type(&(lock)->dep_map, #key, key, sub,	\
			      (lock)->dep_map.wait_type_inner,	\
			      (lock)->dep_map.wait_type_outer,	\
			      (lock)->dep_map.lock_type)

#define lockdep_set_subclass(lock, sub)					\
	lockdep_init_map_type(&(lock)->dep_map, (lock)->dep_map.name, (lock)->dep_map.key, sub,\
			      (lock)->dep_map.wait_type_inner,		\
			      (lock)->dep_map.wait_type_outer,		\
			      (lock)->dep_map.lock_type)

/**
 * lockdep_set_novalidate_class: disable checking of lock ordering on a given
 * lock
 * @lock: Lock to mark
 *
 * Lockdep will still record that this lock has been taken, and print held
 * instances when dumping locks
 */
#define lockdep_set_novalidate_class(lock) \
	lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock)

/**
 * lockdep_set_notrack_class: disable lockdep tracking of a given lock entirely
 * @lock: Lock to mark
 *
 * Bigger hammer than lockdep_set_novalidate_class: so far just for bcachefs,
 * which takes more locks than lockdep is able to track (48).
 */
#define lockdep_set_notrack_class(lock) \
	lockdep_set_class_and_name(lock, &__lockdep_no_track__, #lock)

/*
 * Compare locking classes
 */
#define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key)

/* Returns non-zero when @lock was initialised with exactly @key. */
static inline int lockdep_match_key(struct lockdep_map *lock,
				    struct lock_class_key *key)
{
	return lock->key == key;
}

/*
 * Acquire a lock.
 *
 * Values for "read":
 *
 *   0: exclusive (write) acquire
 *   1: read-acquire (no recursion allowed)
 *   2: read-acquire with same-instance recursion allowed
 *
 * Values for check:
 *
 *   0: simple checks (freeing, held-at-exit-time, etc.)
 *   1: full validation
 */
extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
			 int trylock, int read, int check,
			 struct lockdep_map *nest_lock, unsigned long ip);

extern void lock_release(struct lockdep_map *lock, unsigned long ip);

extern void lock_sync(struct lockdep_map *lock, unsigned int subclass,
		      int read, int check, struct lockdep_map *nest_lock,
		      unsigned long ip);

/* lock_is_held_type() returns */
#define LOCK_STATE_UNKNOWN	-1
#define LOCK_STATE_NOT_HELD	0
#define LOCK_STATE_HELD		1

/*
 * Same "read" as for lock_acquire(), except -1 means any.
 */
extern int lock_is_held_type(const struct lockdep_map *lock, int read);

/* lock_is_held_type() with read == -1, i.e. held in any mode. */
static inline int lock_is_held(const struct lockdep_map *lock)
{
	return lock_is_held_type(lock, -1);
}

#define lockdep_is_held(lock)		lock_is_held(&(lock)->dep_map)
#define lockdep_is_held_type(lock, r)	lock_is_held_type(&(lock)->dep_map, (r))

extern void lock_set_class(struct lockdep_map *lock, const char *name,
			   struct lock_class_key *key, unsigned int subclass,
			   unsigned long ip);

#define lock_set_novalidate_class(l, n, i) \
	lock_set_class(l, n, &__lockdep_no_validate__, 0, i)

/* lock_set_class() keeping the lock's current name and key. */
static inline void lock_set_subclass(struct lockdep_map *lock,
		unsigned int subclass, unsigned long ip)
{
	lock_set_class(lock, lock->name, lock->key, subclass, ip);
}

extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);

#define NIL_COOKIE (struct pin_cookie){ .val = 0U, }

extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);

#define lockdep_depth(tsk)	(debug_locks ? (tsk)->lockdep_depth : 0)

#define lockdep_assert(cond)		\
	do { WARN_ON(debug_locks && !(cond)); } while (0)

#define lockdep_assert_once(cond)	\
	do { WARN_ON_ONCE(debug_locks && !(cond)); } while (0)

#define lockdep_assert_held(l)		\
	do { lockdep_assert(lockdep_is_held(l) != LOCK_STATE_NOT_HELD); __assume_ctx_lock(l); } while (0)

#define lockdep_assert_not_held(l)	\
	lockdep_assert(lockdep_is_held(l) != LOCK_STATE_HELD)

#define lockdep_assert_held_write(l)	\
	do { lockdep_assert(lockdep_is_held_type(l, 0)); __assume_ctx_lock(l); } while (0)

#define lockdep_assert_held_read(l)	\
	do { lockdep_assert(lockdep_is_held_type(l, 1)); __assume_shared_ctx_lock(l); } while (0)

#define lockdep_assert_held_once(l)		\
	lockdep_assert_once(lockdep_is_held(l) != LOCK_STATE_NOT_HELD)

#define lockdep_assert_none_held_once()		\
	lockdep_assert_once(!current->lockdep_depth)

#define lockdep_recursing(tsk)	((tsk)->lockdep_recursion)

#define lockdep_pin_lock(l)	lock_pin_lock(&(l)->dep_map)
#define lockdep_repin_lock(l,c)	lock_repin_lock(&(l)->dep_map, (c))
#define lockdep_unpin_lock(l,c)	lock_unpin_lock(&(l)->dep_map, (c))

/*
 * Must use lock_map_acquire_try() with override maps to avoid
 * lockdep thinking they participate in the block chain.
 */
#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type)	\
	struct lockdep_map _name = {			\
		.name = #_name "-wait-type-override",	\
		.wait_type_inner = _wait_type,		\
		.lock_type = LD_LOCK_WAIT_OVERRIDE, }

#else /* !CONFIG_LOCKDEP */

/* Without CONFIG_LOCKDEP everything below is an empty stub. */
static inline void lockdep_init_task(struct task_struct *task)
{
}

static inline void lockdep_off(void)
{
}

static inline void lockdep_on(void)
{
}

static inline void lockdep_set_selftest_task(struct task_struct *task)
{
}

# define lock_acquire(l, s, t, r, c, n, i)	do { } while (0)
# define lock_release(l, i)			do { } while (0)
# define lock_downgrade(l, i)			do { } while (0)
# define lock_set_class(l, n, key, s, i)	do { (void)(key); } while (0)
# define lock_set_novalidate_class(l, n, i)	do { } while (0)
# define lock_set_subclass(l, s, i)		do { } while (0)
# define lockdep_init()				do { } while (0)
# define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \
		do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \
		do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map_wait(lock, name, key, sub, inner) \
		do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map(lock, name, key, sub) \
		do { (void)(name); (void)(key); } while (0)
# define lockdep_set_class(lock, key)		do { (void)(key); } while (0)
# define lockdep_set_class_and_name(lock, key, name) \
		do { (void)(key); (void)(name); } while (0)
#define lockdep_set_class_and_subclass(lock, key, sub) \
		do { (void)(key); } while (0)
#define lockdep_set_subclass(lock, sub)		do { } while (0)

#define lockdep_set_novalidate_class(lock) do { } while (0)
#define lockdep_set_notrack_class(lock) do { } while (0)

/*
 * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP
 * case since the result is not well defined and the caller should rather
 * #ifdef the call himself.
 */

# define lockdep_reset()		do { debug_locks = 1; } while (0)
# define lockdep_free_key_range(start, size)	do { } while (0)
# define lockdep_sys_exit() 			do { } while (0)

static inline void lockdep_register_key(struct lock_class_key *key)
{
}

static inline void lockdep_unregister_key(struct lock_class_key *key)
{
}

#define lockdep_depth(tsk)	(0)

/*
 * Dummy forward declarations, allow users to write less ifdef-y code
 * and depend on dead code elimination.
 */
extern int lock_is_held(const void *);
extern int lockdep_is_held(const void *);
#define lockdep_is_held_type(l, r)		(1)

#define lockdep_assert(c)			do { } while (0)
#define lockdep_assert_once(c)			do { } while (0)

#define lockdep_assert_held(l)			__assume_ctx_lock(l)
#define lockdep_assert_not_held(l)		do { (void)(l); } while (0)
#define lockdep_assert_held_write(l)		__assume_ctx_lock(l)
#define lockdep_assert_held_read(l)		__assume_shared_ctx_lock(l)
#define lockdep_assert_held_once(l)		do { (void)(l); } while (0)
#define lockdep_assert_none_held_once()	do { } while (0)

#define lockdep_recursing(tsk)			(0)

#define NIL_COOKIE (struct pin_cookie){ }

#define lockdep_pin_lock(l)			({ struct pin_cookie cookie = { }; cookie; })
#define lockdep_repin_lock(l, c)		do { (void)(l); (void)(c); } while (0)
#define lockdep_unpin_lock(l, c)		do { (void)(l); (void)(c); } while (0)

#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type)	\
	struct lockdep_map __maybe_unused _name = {}

#endif /* !LOCKDEP */

#ifdef CONFIG_PROVE_LOCKING
void lockdep_set_lock_cmp_fn(struct lockdep_map *, lock_cmp_fn, lock_print_fn);

#define lock_set_cmp_fn(lock, ...)	lockdep_set_lock_cmp_fn(&(lock)->dep_map, __VA_ARGS__)
#else
#define lock_set_cmp_fn(lock, ...)	do { } while (0)
#endif

enum xhlock_context_t {
	XHLOCK_HARD,
	XHLOCK_SOFT,
	XHLOCK_CTX_NR,
};

/*
 * To initialize a lockdep_map statically use this macro.
 * Note that _name must not be NULL.
 */
#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
	{ .name = (_name), .key = (void *)(_key), }

static inline void lockdep_invariant_state(bool force) {}
static inline void lockdep_free_task(struct task_struct *task) {}

#ifdef CONFIG_LOCK_STAT

extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
extern void lock_acquired(struct lockdep_map *lock, unsigned long ip);

#define LOCK_CONTENDED(_lock, try, lock)			\
do {								\
	if (!try(_lock)) {					\
		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
		lock(_lock);					\
	}							\
	lock_acquired(&(_lock)->dep_map, _RET_IP_);			\
} while (0)

#define LOCK_CONTENDED_RETURN(_lock, try, lock)			\
({								\
	int ____err = 0;					\
	if (!try(_lock)) {					\
		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
		____err = lock(_lock);				\
	}							\
	if (!____err)						\
		lock_acquired(&(_lock)->dep_map, _RET_IP_);	\
	____err;						\
})

#else /* CONFIG_LOCK_STAT */

#define lock_contended(lockdep_map, ip) do {} while (0)
#define lock_acquired(lockdep_map, ip) do {} while (0)

#define LOCK_CONTENDED(_lock, try, lock) \
	lock(_lock)

#define LOCK_CONTENDED_RETURN(_lock, try, lock) \
	lock(_lock)

#endif /* CONFIG_LOCK_STAT */

#ifdef CONFIG_PROVE_LOCKING
extern void print_irqtrace_events(struct task_struct *curr);
#else
static inline void print_irqtrace_events(struct task_struct *curr)
{
}
#endif

/* Variable used to make lockdep treat read_lock() as recursive in selftests */
#ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS
extern unsigned int force_read_lock_recursive;
#else /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */
#define force_read_lock_recursive 0
#endif /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */

#ifdef CONFIG_LOCKDEP
extern bool read_lock_is_recursive(void);
#else /* CONFIG_LOCKDEP */
/* If !LOCKDEP, the value is meaningless */
#define read_lock_is_recursive() 0
#endif

/*
 * For trivial one-depth nesting of a lock-class, the following
 * global define can be used. (Subsystems with multiple levels
 * of nesting should define their own lock-nesting subclasses.)
 */
#define SINGLE_DEPTH_NESTING			1

/*
 * Map the dependency ops to NOP or to real lockdep ops, depending
 * on the per lock-class debug mode:
 */

#define lock_acquire_exclusive(l, s, t, n, i)		lock_acquire(l, s, t, 0, 1, n, i)
#define lock_acquire_shared(l, s, t, n, i)		lock_acquire(l, s, t, 1, 1, n, i)
#define lock_acquire_shared_recursive(l, s, t, n, i)	lock_acquire(l, s, t, 2, 1, n, i)

#define spin_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
#define spin_acquire_nest(l, s, t, n, i)	lock_acquire_exclusive(l, s, t, n, i)
#define spin_release(l, i)			lock_release(l, i)

#define rwlock_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
#define rwlock_acquire_read(l, s, t, i)					\
do {									\
	if (read_lock_is_recursive())					\
		lock_acquire_shared_recursive(l, s, t, NULL, i);	\
	else								\
		lock_acquire_shared(l, s, t, NULL, i);			\
} while (0)

#define rwlock_release(l, i)			lock_release(l, i)

#define seqcount_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
#define seqcount_acquire_read(l, s, t, i)	lock_acquire_shared_recursive(l, s, t, NULL, i)
#define seqcount_release(l, i)			lock_release(l, i)

#define mutex_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
#define mutex_acquire_nest(l, s, t, n, i)	lock_acquire_exclusive(l, s, t, n, i)
#define mutex_release(l, i)			lock_release(l, i)

#define rwsem_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
#define rwsem_acquire_nest(l, s, t, n, i)	lock_acquire_exclusive(l, s, t, n, i)
#define rwsem_acquire_read(l, s, t, i)		lock_acquire_shared(l, s, t, NULL, i)
#define rwsem_release(l, i)			lock_release(l, i)

#define lock_map_acquire(l)			lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_try(l)			lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_)
#define lock_map_acquire_read(l)		lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_tryread(l)		lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
#define lock_map_release(l)			lock_release(l, _THIS_IP_)
/* lock_sync() on a bare lockdep_map, with _THIS_IP_ as the ip. */
#define lock_map_sync(l)			lock_sync(l, 0, 0, 1, NULL, _THIS_IP_)

#ifdef CONFIG_PROVE_LOCKING
/*
 * might_lock*(): acquire and immediately release the lock's dep_map so
 * lockdep sees the acquisition without the lock actually being taken.
 * typecheck() rejects arguments whose dep_map is not a struct lockdep_map.
 */
# define might_lock(lock)						\
do {									\
	typecheck(struct lockdep_map *, &(lock)->dep_map);		\
	lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_);	\
	lock_release(&(lock)->dep_map, _THIS_IP_);			\
} while (0)
# define might_lock_read(lock)						\
do {									\
	typecheck(struct lockdep_map *, &(lock)->dep_map);		\
	lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_);	\
	lock_release(&(lock)->dep_map, _THIS_IP_);			\
} while (0)
/*
 * NOTE(review): the fourth lock_acquire() argument here is 1, the same
 * value might_lock_read() passes and not the 0 used by might_lock() -
 * confirm that this is intended.
 */
# define might_lock_nested(lock, subclass)				\
do {									\
	typecheck(struct lockdep_map *, &(lock)->dep_map);		\
	lock_acquire(&(lock)->dep_map, subclass, 0, 1, 1, NULL,		\
		     _THIS_IP_);					\
	lock_release(&(lock)->dep_map, _THIS_IP_);			\
} while (0)

DECLARE_PER_CPU(int, hardirqs_enabled);
DECLARE_PER_CPU(int, hardirq_context);
DECLARE_PER_CPU(unsigned int, lockdep_recursion);

/*
 * Gate for the assertions below: they only fire while debug_locks is
 * set and this CPU's lockdep_recursion counter reads zero.
 */
#define __lockdep_enabled	(debug_locks && !this_cpu_read(lockdep_recursion))

/* Warn once if this CPU's hardirqs_enabled reads zero. */
#define lockdep_assert_irqs_enabled()					\
do {									\
	WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirqs_enabled)); \
} while (0)

/* Warn once if this CPU's hardirqs_enabled reads non-zero. */
#define lockdep_assert_irqs_disabled()					\
do {									\
	WARN_ON_ONCE(__lockdep_enabled && this_cpu_read(hardirqs_enabled)); \
} while (0)

/* Warn once if this CPU's hardirq_context reads zero. */
#define lockdep_assert_in_irq()						\
do {									\
	WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \
} while (0)

/* Warn once if hardirq_context is set or hardirqs_enabled is clear. */
#define lockdep_assert_no_hardirq()					\
do {									\
	WARN_ON_ONCE(__lockdep_enabled && (this_cpu_read(hardirq_context) || \
					   !this_cpu_read(hardirqs_enabled))); \
} while (0)

/*
 * The two preemption assertions are compiled to a constant-false
 * condition unless CONFIG_PREEMPT_COUNT is enabled.  "Enabled" requires
 * preempt_count() == 0 and hardirqs_enabled set; "disabled" requires at
 * least one of those not to hold.
 */
#define lockdep_assert_preemption_enabled()				\
do {									\
	WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)	&&		\
		     __lockdep_enabled			&&		\
		     (preempt_count() != 0		||		\
		      !this_cpu_read(hardirqs_enabled)));		\
} while (0)

#define lockdep_assert_preemption_disabled()				\
do {									\
	WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)	&&		\
		     __lockdep_enabled			&&		\
		     (preempt_count() == 0		&&		\
		      this_cpu_read(hardirqs_enabled)));		\
} while (0)

/*
 * Acceptable for protecting per-CPU resources accessed from BH.
 * Much like in_softirq() - semantics are ambiguous, use carefully.
 */
#define lockdep_assert_in_softirq()					\
do {									\
	WARN_ON_ONCE(__lockdep_enabled			&&		\
		     (!in_softirq() || in_hardirq() || in_nmi()));	\
} while (0)

extern void lockdep_assert_in_softirq_func(void);

#else
/* !CONFIG_PROVE_LOCKING: every check above becomes an empty statement. */
# define might_lock(lock) do { } while (0)
# define might_lock_read(lock) do { } while (0)
# define might_lock_nested(lock, subclass) do { } while (0)

# define lockdep_assert_irqs_enabled() do { } while (0)
# define lockdep_assert_irqs_disabled() do { } while (0)
# define lockdep_assert_in_irq() do { } while (0)
# define lockdep_assert_no_hardirq() do { } while (0)

# define lockdep_assert_preemption_enabled() do { } while (0)
# define lockdep_assert_preemption_disabled() do { } while (0)
# define lockdep_assert_in_softirq() do { } while (0)
# define lockdep_assert_in_softirq_func() do { } while (0)
#endif

#ifdef CONFIG_PROVE_RAW_LOCK_NESTING

/*
 * Warn once when running in hardirq context (per
 * lockdep_hardirq_context()) while neither current->hardirq_threaded nor
 * current->irq_config is set.  Gated on debug_locks and on
 * current->lockdep_recursion being zero.
 */
# define lockdep_assert_RT_in_threaded_ctx() do {			\
		WARN_ONCE(debug_locks && !current->lockdep_recursion &&	\
			  lockdep_hardirq_context() &&			\
			  !(current->hardirq_threaded || current->irq_config),	\
			  "Not in threaded context on PREEMPT_RT as expected\n");	\
} while (0)

#else

# define lockdep_assert_RT_in_threaded_ctx() do { } while (0)

#endif

/* Declared for CONFIG_LOCKDEP builds; an empty inline stub otherwise. */
#ifdef CONFIG_LOCKDEP
void lockdep_rcu_suspicious(const char *file, const int line, const char *s);
#else
static inline void
lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
}
#endif

#endif /* __LINUX_LOCKDEP_H */
35 5 11 35 35 20 36 36 36 36 20 1 1 1 1 17 18 26 26 36 13 1 32 32 24 2 32 32 32 29 29 28 29 34 1 33 10 10 2 2 3 15 13 8 1 1 1 5 5 5 272 8 265 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, Intel Corporation. 
* * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> struct multiq_sched_data { u16 bands; u16 max_bands; u16 curband; struct tcf_proto __rcu *filter_list; struct tcf_block *block; struct Qdisc **queues; }; static struct Qdisc * multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct multiq_sched_data *q = qdisc_priv(sch); u32 band; struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(q->filter_list); int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif band = skb_get_queue_mapping(skb); if (band >= q->bands) return q->queues[0]; return q->queues[band]; } static int multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct Qdisc *qdisc; int ret; qdisc = multiq_classify(skb, sch, &ret); #ifdef CONFIG_NET_CLS_ACT if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } #endif ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); return ret; } static struct sk_buff *multiq_dequeue(struct Qdisc *sch) { struct multiq_sched_data *q = qdisc_priv(sch); struct Qdisc *qdisc; struct sk_buff *skb; int band; for (band = 0; band < q->bands; band++) { /* cycle through bands to ensure fairness */ q->curband++; if (q->curband >= q->bands) q->curband = 0; /* Check that target subqueue is available before * pulling an skb to avoid head-of-line 
blocking. */ if (!netif_xmit_stopped( netdev_get_tx_queue(qdisc_dev(sch), q->curband))) { qdisc = q->queues[q->curband]; skb = qdisc->dequeue(qdisc); if (skb) { qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } } } return NULL; } static struct sk_buff *multiq_peek(struct Qdisc *sch) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned int curband = q->curband; struct Qdisc *qdisc; struct sk_buff *skb; int band; for (band = 0; band < q->bands; band++) { /* cycle through bands to ensure fairness */ curband++; if (curband >= q->bands) curband = 0; /* Check that target subqueue is available before * pulling an skb to avoid head-of-line blocking. */ if (!netif_xmit_stopped( netdev_get_tx_queue(qdisc_dev(sch), curband))) { qdisc = q->queues[curband]; skb = qdisc->ops->peek(qdisc); if (skb) return skb; } } return NULL; } static void multiq_reset(struct Qdisc *sch) { u16 band; struct multiq_sched_data *q = qdisc_priv(sch); for (band = 0; band < q->bands; band++) qdisc_reset(q->queues[band]); q->curband = 0; } static void multiq_destroy(struct Qdisc *sch) { int band; struct multiq_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); for (band = 0; band < q->bands; band++) qdisc_put(q->queues[band]); kfree(q->queues); } static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); struct tc_multiq_qopt *qopt; struct Qdisc **removed; int i, n_removed = 0; if (!netif_is_multiqueue(qdisc_dev(sch))) return -EOPNOTSUPP; if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); qopt->bands = qdisc_dev(sch)->real_num_tx_queues; removed = kmalloc(sizeof(*removed) * (q->max_bands - qopt->bands), GFP_KERNEL); if (!removed) return -ENOMEM; sch_tree_lock(sch); q->bands = qopt->bands; for (i = q->bands; i < q->max_bands; i++) { if (q->queues[i] != &noop_qdisc) { struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; qdisc_purge_queue(child); removed[n_removed++] = 
child; } } sch_tree_unlock(sch); for (i = 0; i < n_removed; i++) qdisc_put(removed[i]); kfree(removed); for (i = 0; i < q->bands; i++) { if (q->queues[i] == &noop_qdisc) { struct Qdisc *child, *old; child = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, i + 1), extack); if (child) { sch_tree_lock(sch); old = q->queues[i]; q->queues[i] = child; if (child != &noop_qdisc) qdisc_hash_add(child, true); if (old != &noop_qdisc) qdisc_purge_queue(old); sch_tree_unlock(sch); qdisc_put(old); } } } return 0; } static int multiq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); int i, err; q->queues = NULL; if (!opt) return -EINVAL; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; q->max_bands = qdisc_dev(sch)->num_tx_queues; q->queues = kzalloc_objs(struct Qdisc *, q->max_bands); if (!q->queues) return -ENOBUFS; for (i = 0; i < q->max_bands; i++) q->queues[i] = &noop_qdisc; return multiq_tune(sch, opt, extack); } static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_multiq_qopt opt; opt.bands = q->bands; opt.max_bands = q->max_bands; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->queues[band]); return 0; } static struct Qdisc * multiq_leaf(struct Qdisc *sch, unsigned long arg) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; return q->queues[band]; } static unsigned long multiq_find(struct Qdisc *sch, u32 classid) { struct 
multiq_sched_data *q = qdisc_priv(sch); unsigned long band = TC_H_MIN(classid); if (band - 1 >= q->bands) return 0; return band; } static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return multiq_find(sch, classid); } static void multiq_unbind(struct Qdisc *q, unsigned long cl) { } static int multiq_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct multiq_sched_data *q = qdisc_priv(sch); tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = q->queues[cl - 1]->handle; return 0; } static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct multiq_sched_data *q = qdisc_priv(sch); struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; return 0; } static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct multiq_sched_data *q = qdisc_priv(sch); int band; if (arg->stop) return; for (band = 0; band < q->bands; band++) { if (!tc_qdisc_stats_dump(sch, band + 1, arg)) break; } } static struct tcf_block *multiq_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static const struct Qdisc_class_ops multiq_class_ops = { .graft = multiq_graft, .leaf = multiq_leaf, .find = multiq_find, .walk = multiq_walk, .tcf_block = multiq_tcf_block, .bind_tcf = multiq_bind, .unbind_tcf = multiq_unbind, .dump = multiq_dump_class, .dump_stats = multiq_dump_class_stats, }; static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &multiq_class_ops, .id = "multiq", .priv_size = sizeof(struct multiq_sched_data), .enqueue = multiq_enqueue, .dequeue = multiq_dequeue, .peek = multiq_peek, .init = multiq_init, .reset = multiq_reset, .destroy = multiq_destroy, .change = multiq_tune, .dump = multiq_dump, .owner = 
THIS_MODULE, }; MODULE_ALIAS_NET_SCH("multiq"); static int __init multiq_module_init(void) { return register_qdisc(&multiq_qdisc_ops); } static void __exit multiq_module_exit(void) { unregister_qdisc(&multiq_qdisc_ops); } module_init(multiq_module_init) module_exit(multiq_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Multi queue to hardware queue mapping qdisc");
258 840 840 838 840 840 798 267 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * vma.h * * Core VMA manipulation API implemented in vma.c. */ #ifndef __MM_VMA_H #define __MM_VMA_H /* * VMA lock generalization */ struct vma_prepare { struct vm_area_struct *vma; struct vm_area_struct *adj_next; struct file *file; struct address_space *mapping; struct anon_vma *anon_vma; struct vm_area_struct *insert; struct vm_area_struct *remove; struct vm_area_struct *remove2; bool skip_vma_uprobe :1; }; struct unlink_vma_file_batch { int count; struct vm_area_struct *vmas[8]; }; /* * vma munmap operation */ struct vma_munmap_struct { struct vma_iterator *vmi; struct vm_area_struct *vma; /* The first vma to munmap */ struct vm_area_struct *prev; /* vma before the munmap area */ struct vm_area_struct *next; /* vma after the munmap area */ struct list_head *uf; /* Userfaultfd list_head */ unsigned long start; /* Aligned start addr (inclusive) */ unsigned long end; /* Aligned end addr (exclusive) */ unsigned long unmap_start; /* Unmap PTE start */ unsigned long unmap_end; /* Unmap PTE end */ int vma_count; /* Number of vmas that will be removed */ bool unlock; /* Unlock after the munmap */ bool clear_ptes; /* If there are outstanding PTE to be cleared */ /* 2 byte hole */ unsigned long nr_pages; /* Number of pages 
being removed */ unsigned long locked_vm; /* Number of locked pages */ unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */ unsigned long exec_vm; unsigned long stack_vm; unsigned long data_vm; }; enum vma_merge_state { VMA_MERGE_START, VMA_MERGE_ERROR_NOMEM, VMA_MERGE_NOMERGE, VMA_MERGE_SUCCESS, }; /* * Describes a VMA merge operation and is threaded throughout it. * * Any of the fields may be mutated by the merge operation, so no guarantees are * made to the contents of this structure after a merge operation has completed. */ struct vma_merge_struct { struct mm_struct *mm; struct vma_iterator *vmi; /* * Adjacent VMAs, any of which may be NULL if not present: * * |------|--------|------| * | prev | middle | next | * |------|--------|------| * * middle may not yet exist in the case of a proposed new VMA being * merged, or it may be an existing VMA. * * next may be assigned by the caller. */ struct vm_area_struct *prev; struct vm_area_struct *middle; struct vm_area_struct *next; /* This is the VMA we ultimately target to become the merged VMA. */ struct vm_area_struct *target; /* * Initially, the start, end, pgoff fields are provided by the caller * and describe the proposed new VMA range, whether modifying an * existing VMA (which will be 'middle'), or adding a new one. * * During the merge process these fields are updated to describe the new * range _including those VMAs which will be merged_. */ unsigned long start; unsigned long end; pgoff_t pgoff; vm_flags_t vm_flags; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; struct vm_userfaultfd_ctx uffd_ctx; struct anon_vma_name *anon_name; enum vma_merge_state state; /* If copied from (i.e. mremap()'d) the VMA from which we are copying. */ struct vm_area_struct *copied_from; /* Flags which callers can use to modify merge behaviour: */ /* * If we can expand, simply do so. We know there is nothing to merge to * the right. Does not reset state upon failure to merge. 
The VMA * iterator is assumed to be positioned at the previous VMA, rather than * at the gap. */ bool just_expand :1; /* * If a merge is possible, but an OOM error occurs, give up and don't * execute the merge, returning NULL. */ bool give_up_on_oom :1; /* * If set, skip uprobe_mmap upon merged vma. */ bool skip_vma_uprobe :1; /* Internal flags set during merge process: */ /* * Internal flag indicating the merge increases vmg->middle->vm_start * (and thereby, vmg->prev->vm_end). */ bool __adjust_middle_start :1; /* * Internal flag indicating the merge decreases vmg->next->vm_start * (and thereby, vmg->middle->vm_end). */ bool __adjust_next_start :1; /* * Internal flag used during the merge operation to indicate we will * remove vmg->middle. */ bool __remove_middle :1; /* * Internal flag used during the merge operation to indicate we will * remove vmg->next. */ bool __remove_next :1; }; struct unmap_desc { struct ma_state *mas; /* the maple state point to the first vma */ struct vm_area_struct *first; /* The first vma */ unsigned long pg_start; /* The first pagetable address to free (floor) */ unsigned long pg_end; /* The last pagetable address to free (ceiling) */ unsigned long vma_start; /* The min vma address */ unsigned long vma_end; /* The max vma address */ unsigned long tree_end; /* Maximum for the vma tree search */ unsigned long tree_reset; /* Where to reset the vma tree walk */ bool mm_wr_locked; /* If the mmap write lock is held */ }; /* * unmap_all_init() - Initialize unmap_desc to remove all vmas, point the * pg_start and pg_end to a safe location. 
*/ static inline void unmap_all_init(struct unmap_desc *unmap, struct vma_iterator *vmi, struct vm_area_struct *vma) { unmap->mas = &vmi->mas; unmap->first = vma; unmap->pg_start = FIRST_USER_ADDRESS; unmap->pg_end = USER_PGTABLES_CEILING; unmap->vma_start = 0; unmap->vma_end = ULONG_MAX; unmap->tree_end = ULONG_MAX; unmap->tree_reset = vma->vm_end; unmap->mm_wr_locked = false; } /* * unmap_pgtable_init() - Initialize unmap_desc to remove all page tables within * the user range. * * ARM can have mappings outside of vmas. * See: e2cdef8c847b4 ("[PATCH] freepgt: free_pgtables from FIRST_USER_ADDRESS") * * ARM LPAE uses page table mappings beyond the USER_PGTABLES_CEILING * See: CONFIG_ARM_LPAE in arch/arm/include/asm/pgtable.h */ static inline void unmap_pgtable_init(struct unmap_desc *unmap, struct vma_iterator *vmi) { vma_iter_set(vmi, unmap->tree_reset); unmap->vma_start = FIRST_USER_ADDRESS; unmap->vma_end = USER_PGTABLES_CEILING; unmap->tree_end = USER_PGTABLES_CEILING; } #define UNMAP_STATE(name, _vmi, _vma, _vma_start, _vma_end, _prev, _next) \ struct unmap_desc name = { \ .mas = &(_vmi)->mas, \ .first = _vma, \ .pg_start = _prev ? ((struct vm_area_struct *)_prev)->vm_end : \ FIRST_USER_ADDRESS, \ .pg_end = _next ? ((struct vm_area_struct *)_next)->vm_start : \ USER_PGTABLES_CEILING, \ .vma_start = _vma_start, \ .vma_end = _vma_end, \ .tree_end = _next ? \ ((struct vm_area_struct *)_next)->vm_start : \ USER_PGTABLES_CEILING, \ .tree_reset = _vma->vm_end, \ .mm_wr_locked = true, \ } static inline bool vmg_nomem(struct vma_merge_struct *vmg) { return vmg->state == VMA_MERGE_ERROR_NOMEM; } /* Assumes addr >= vma->vm_start. 
*/ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr) { return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } #define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ .vm_flags = vm_flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ struct vma_merge_struct name = { \ .mm = vma_->vm_mm, \ .vmi = vmi_, \ .prev = prev_, \ .middle = vma_, \ .next = NULL, \ .start = start_, \ .end = end_, \ .vm_flags = vma_->vm_flags, \ .pgoff = vma_pgoff_offset(vma_, start_), \ .file = vma_->vm_file, \ .anon_vma = vma_->anon_vma, \ .policy = vma_policy(vma_), \ .uffd_ctx = vma_->vm_userfaultfd_ctx, \ .anon_name = anon_vma_name(vma_), \ .state = VMA_MERGE_START, \ } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm); #else #define validate_mm(mm) do { } while (0) #endif __must_check int vma_expand(struct vma_merge_struct *vmg); __must_check int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff); static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) { if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_gfp(&vmi->mas, vma, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; vma_mark_attached(vma); return 0; } /* * Temporary helper function for stacked mmap handlers which specify * f_op->mmap() but which might have an underlying file system which implements * f_op->mmap_prepare(). 
*/ static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc) { /* * Since we're invoking .mmap_prepare() despite having a partially * established VMA, we must take care to handle setting fields * correctly. */ /* Mutable fields. Populated with initial state. */ vma->vm_pgoff = desc->pgoff; if (desc->vm_file != vma->vm_file) vma_set_file(vma, desc->vm_file); vma->flags = desc->vma_flags; vma->vm_page_prot = desc->page_prot; /* User-defined fields. */ vma->vm_ops = desc->vm_ops; vma->vm_private_data = desc->private_data; } int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock); int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); void remove_vma(struct vm_area_struct *vma); void unmap_region(struct unmap_desc *unmap); /** * vma_modify_flags() - Perform any necessary split/merge in preparation for * setting VMA flags to *@vm_flags in the range @start to @end contained within * @vma. * @vmi: Valid VMA iterator positioned at @vma. * @prev: The VMA immediately prior to @vma or NULL if @vma is the first. * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. * @vm_flags_ptr: A pointer to the VMA flags that the @start to @end range is * about to be set to. On merge, this will be updated to include sticky flags. * * IMPORTANT: The actual modification being requested here is NOT applied, * rather the VMA is perhaps split, perhaps merged to accommodate the change, * and the caller is expected to perform the actual modification. 
* * In order to account for sticky VMA flags, the @vm_flags_ptr parameter points * to the requested flags which are then updated so the caller, should they * overwrite any existing flags, correctly retains these. * * Returns: A VMA which contains the range @start to @end ready to have its * flags altered to *@vm_flags. */ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, vm_flags_t *vm_flags_ptr); /** * vma_modify_name() - Perform any necessary split/merge in preparation for * setting anonymous VMA name to @new_name in the range @start to @end contained * within @vma. * @vmi: Valid VMA iterator positioned at @vma. * @prev: The VMA immediately prior to @vma or NULL if @vma is the first. * @vma: The VMA containing the range @start to @end to be updated. * @start: The start of the range to update. May be offset within @vma. * @end: The exclusive end of the range to update, may be offset within @vma. * @new_name: The anonymous VMA name that the @start to @end range is about to * be set to. * * IMPORTANT: The actual modification being requested here is NOT applied, * rather the VMA is perhaps split, perhaps merged to accommodate the change, * and the caller is expected to perform the actual modification. * * Returns: A VMA which contains the range @start to @end ready to have its * anonymous VMA name changed to @new_name. */ __must_check struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct anon_vma_name *new_name); /** * vma_modify_policy() - Perform any necessary split/merge in preparation for * setting NUMA policy to @new_pol in the range @start to @end contained * within @vma. * @vmi: Valid VMA iterator positioned at @vma. * @prev: The VMA immediately prior to @vma or NULL if @vma is the first. 
* @vma: The VMA containing the range @start to @end to be updated.
 * @start: The start of the range to update. May be offset within @vma.
 * @end: The exclusive end of the range to update, may be offset within @vma.
 * @new_pol: The NUMA policy that the @start to @end range is about to be set
 * to.
 *
 * IMPORTANT: The actual modification being requested here is NOT applied,
 * rather the VMA is perhaps split, perhaps merged to accommodate the change,
 * and the caller is expected to perform the actual modification.
 *
 * Returns: A VMA which contains the range @start to @end ready to have its
 * NUMA policy changed to @new_pol.
 */
__must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi,
		struct vm_area_struct *prev, struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct mempolicy *new_pol);

/**
 * vma_modify_flags_uffd() - Perform any necessary split/merge in preparation for
 * setting VMA flags to @vm_flags and UFFD context to @new_ctx in the range
 * @start to @end contained within @vma.
 * @vmi: Valid VMA iterator positioned at @vma.
 * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
 * @vma: The VMA containing the range @start to @end to be updated.
 * @start: The start of the range to update. May be offset within @vma.
 * @end: The exclusive end of the range to update, may be offset within @vma.
 * @vm_flags: The VMA flags that the @start to @end range is about to be set to.
 * @new_ctx: The userfaultfd context that the @start to @end range is about to
 * be set to.
 * @give_up_on_oom: If an out of memory condition occurs on merge, simply give
 * up on it and treat the merge as best-effort.
 *
 * IMPORTANT: The actual modification being requested here is NOT applied,
 * rather the VMA is perhaps split, perhaps merged to accommodate the change,
 * and the caller is expected to perform the actual modification.
*
 * Returns: A VMA which contains the range @start to @end ready to have its VMA
 * flags changed to @vm_flags and its userfaultfd context changed to @new_ctx.
 */
__must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi,
		struct vm_area_struct *prev, struct vm_area_struct *vma,
		unsigned long start, unsigned long end, vm_flags_t vm_flags,
		struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom);

__must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg);

__must_check struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
		struct vm_area_struct *vma, unsigned long delta);

void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb);

void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb);

void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
		struct vm_area_struct *vma);

struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
		unsigned long addr, unsigned long len, pgoff_t pgoff,
		bool *need_rmap_locks);

struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma);

bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);

int mm_take_all_locks(struct mm_struct *mm);
void mm_drop_all_locks(struct mm_struct *mm);

unsigned long mmap_region(struct file *file, unsigned long addr,
		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
		struct list_head *uf);

int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
		unsigned long addr, unsigned long request, unsigned long flags);

unsigned long unmapped_area(struct vm_unmapped_area_info *info);
unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);

/*
 * Shared mappings defer to vma_wants_writenotify(); every other mapping
 * answers true exactly when VM_WRITE is set.
 */
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
	/*
	 * We want to check manually if we can change individual PTEs writable
	 * if we can't do that automatically for all PTEs in a mapping. For
	 * private mappings, that's always the case when we have write
	 * permissions as we properly have to handle COW.
	 */
	if (vma->vm_flags & VM_SHARED)
		return vma_wants_writenotify(vma, vma->vm_page_prot);
	return !!(vma->vm_flags & VM_WRITE);
}

#ifdef CONFIG_MMU
/*
 * Combine @oldprot with the protection derived from @vm_flags by
 * vm_get_page_prot(), using pgprot_modify().
 */
static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, vm_flags_t vm_flags)
{
	return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
}
#endif

/* Wrapper around mas_prev() on the iterator's maple state; @min is passed through. */
static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
		unsigned long min)
{
	return mas_prev(&vmi->mas, min);
}

/*
 * These three helpers classify VMAs for virtual memory accounting.
 */

/*
 * Executable code area - executable, not writable, not stack
 */
static inline bool is_exec_mapping(vm_flags_t flags)
{
	return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
}

/*
 * Stack area (including shadow stacks)
 *
 * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
 * do_mmap() forbids all other combinations.
 */
static inline bool is_stack_mapping(vm_flags_t flags)
{
	return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
}

/*
 * Data area - private, writable, not stack
 */
static inline bool is_data_mapping(vm_flags_t flags)
{
	return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
}

/*
 * Point the iterator at the range @index .. @last - 1, i.e. @last is treated
 * as an exclusive end.
 */
static inline void vma_iter_config(struct vma_iterator *vmi,
		unsigned long index, unsigned long last)
{
	__mas_set_range(&vmi->mas, index, last - 1);
}

/* Reset the iterator's maple state via mas_reset(). */
static inline void vma_iter_reset(struct vma_iterator *vmi)
{
	mas_reset(&vmi->mas);
}

/* Wrapper around mas_prev_range(); @min is passed through unchanged. */
static inline
struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min)
{
	return mas_prev_range(&vmi->mas, min);
}

/* Wrapper around mas_next_range(); @max is passed through unchanged. */
static inline
struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max)
{
	return mas_next_range(&vmi->mas, max);
}

/*
 * Wrapper around mas_empty_area(). @max is exclusive here: the maple tree
 * call receives @max - 1.
 */
static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
				       unsigned long max, unsigned long size)
{
	return mas_empty_area(&vmi->mas, min, max - 1, size);
}

/*
 * Wrapper around mas_empty_area_rev(). @max is exclusive here: the maple tree
 * call receives @max - 1.
 */
static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
					unsigned long max, unsigned long size)
{
	return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
}

/*
 * VMA Iterator functions shared between nommu and mmap
 */

/* Preallocate for storing @vma through the iterator, using GFP_KERNEL. */
static inline int vma_iter_prealloc(struct vma_iterator *vmi,
		struct vm_area_struct *vma)
{
	return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
}

/* Store NULL over the iterator's current range using mas_store_prealloc(). */
static inline void vma_iter_clear(struct vma_iterator *vmi)
{
	mas_store_prealloc(&vmi->mas, NULL);
}

/* Return whatever mas_walk() finds for the iterator's current state. */
static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
{
	return mas_walk(&vmi->mas);
}

/* Store a VMA with preallocated memory */
static inline void vma_iter_store_overwrite(struct vma_iterator *vmi,
					    struct vm_area_struct *vma)
{
	vma_assert_attached(vma);

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
	/* Debug builds only: warn when an active iterator starts past @vma's start. */
	if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
			vmi->mas.index > vma->vm_start)) {
		pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
			vmi->mas.index, vma->vm_start, vma->vm_start,
			vma->vm_end, vmi->mas.index, vmi->mas.last);
	}
	/* Debug builds only: warn when an active iterator ends before @vma's start. */
	if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
			vmi->mas.last < vma->vm_start)) {
		pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
			vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
			vmi->mas.index, vmi->mas.last);
	}
#endif

	/*
	 * An active iterator whose current range does not contain
	 * vma->vm_start is invalidated before being re-pointed below.
	 */
	if (vmi->mas.status != ma_start &&
	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
		vma_iter_invalidate(vmi);

	/* The stored range is vm_start .. vm_end - 1 (vm_end is exclusive). */
	__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
	mas_store_prealloc(&vmi->mas, vma);
}

/* Mark @vma attached first, then store it exactly as vma_iter_store_overwrite() does. */
static inline void vma_iter_store_new(struct vma_iterator *vmi,
				      struct vm_area_struct *vma)
{
	vma_mark_attached(vma);
	vma_iter_store_overwrite(vmi, vma);
}

/* Start of the iterator's current range (mas.index). */
static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
{
	return vmi->mas.index;
}

/* Exclusive end of the iterator's current range (mas.last + 1). */
static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
{
	return vmi->mas.last + 1;
}

/* Wrapper around mas_prev_range() with 0 passed as the second argument. */
static inline
struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
{
	return mas_prev_range(&vmi->mas, 0);
}

/*
* Retrieve the next VMA and rewind the iterator to end of the previous VMA, or * if no previous VMA, to index 0. */ static inline struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, struct vm_area_struct **pprev) { struct vm_area_struct *next = vma_next(vmi); struct vm_area_struct *prev = vma_prev(vmi); /* * Consider the case where no previous VMA exists. We advance to the * next VMA, skipping any gap, then rewind to the start of the range. * * If we were to unconditionally advance to the next range we'd wind up * at the next VMA again, so we check to ensure there is a previous VMA * to skip over. */ if (prev) vma_iter_next_range(vmi); if (pprev) *pprev = prev; return next; } #ifdef CONFIG_64BIT static inline bool vma_is_sealed(struct vm_area_struct *vma) { return (vma->vm_flags & VM_SEALED); } #else static inline bool vma_is_sealed(struct vm_area_struct *vma) { return false; } #endif #if defined(CONFIG_STACK_GROWSUP) int expand_upwards(struct vm_area_struct *vma, unsigned long address); #endif int expand_downwards(struct vm_area_struct *vma, unsigned long address); int __vm_munmap(unsigned long start, size_t len, bool unlock); int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma); /* vma_init.h, shared between CONFIG_MMU and nommu. */ void __init vma_state_init(void); struct vm_area_struct *vm_area_alloc(struct mm_struct *mm); struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig); void vm_area_free(struct vm_area_struct *vma); /* vma_exec.c */ #ifdef CONFIG_MMU int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, unsigned long *top_mem_p); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); #endif #endif /* __MM_VMA_H */
165 165 156 142 141 15 13 2 4 1 153 153 155 155 10 10 10 3 8 9 4 4 4 5 5 9 1 1 1 1 1 8 8 5 3 8 7 3 3 3 5 5 5 2 20 17 9 5 4 1 2 22 22 5 14 5 3 4 1 1 10 10 2 1 8 8 47 47 1 2 17 17 9 8 4 13 9 8 20 10 1 14 9 5 14 4 4 8 3 4 2 30 8 21 56 3 6 2 56 50 9 89 89 20 5 3 59 10 28 12 22 40 40 36 3 40 11 46 2 5 49 11 4 6 7 32 49 18 15 34 26 27 6 4 30 40 11 7 7 6 1 1 3 1 2 3 2 1 1 1 1 1 5 1 1 1 5 5 5 1 4 125 125 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 
428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 
928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * "Ping" sockets * * Based on ipv4/udp.c code. * * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6), * Pavel Kankovsky (for Linux 2.4.32) * * Pavel gave all rights to bugs to Vasiliy, * none of the bugs are Pavel's now. 
*/

#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/export.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>
#include <net/ping.h>
#include <net/udp.h>
#include <net/route.h>
#include <net/inet_common.h>
#include <net/checksum.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <linux/in6.h>
#include <linux/icmpv6.h>
#include <net/addrconf.h>
#include <net/ipv6.h>
#include <net/transp_v6.h>
#endif

/*
 * Hash table of ping sockets, bucketed by ident. ping_get_port() and
 * ping_unhash() modify it while holding @lock; ping_lookup() walks it with
 * the RCU list iterator.
 */
struct ping_table {
	struct hlist_head	hash[PING_HTABLE_SIZE];
	spinlock_t		lock;
};

static struct ping_table ping_table;
struct pingv6_ops pingv6_ops;
EXPORT_IPV6_MOD_GPL(pingv6_ops);

/* Bucket index for @num: (num + net_hash_mix(net)) reduced by @mask. */
static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
{
	u32 res = (num + net_hash_mix(net)) & mask;

	pr_debug("hash(%u) = %u\n", num, res);
	return res;
}

/* Return the bucket of @table that @num hashes to within @net. */
static inline struct hlist_head *ping_hashslot(struct ping_table *table,
					       struct net *net, unsigned int num)
{
	return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
}

/*
 * Assign an ident ("port") to @sk and insert the socket into ping_table if it
 * is not hashed yet.
 *
 * @ident == 0: candidates are tried starting at net->ipv4.ping_port_rover + 1,
 * skipping 0, until one is found that no socket of the same netns in the
 * bucket uses; the rover is updated to the chosen value.
 * @ident != 0: fails if a different socket of the same netns already holds
 * that ident, unless both sockets have sk_reuse set.
 *
 * The whole operation runs under ping_table.lock.
 * Returns 0 on success, -EADDRINUSE otherwise.
 */
int ping_get_port(struct sock *sk, unsigned short ident)
{
	struct net *net = sock_net(sk);
	struct inet_sock *isk, *isk2;
	struct hlist_head *hlist;
	struct sock *sk2 = NULL;

	isk = inet_sk(sk);
	spin_lock(&ping_table.lock);
	if (ident == 0) {
		/* u16 arithmetic: the candidate wraps around after 0xffff. */
		u16 result = net->ipv4.ping_port_rover + 1;
		u32 i;

		/* At most 2^16 attempts, i.e. one full pass over the ident space. */
		for (i = 0; i < (1L << 16); i++, result++) {
			if (!result)
				continue; /* avoid zero */
			hlist = ping_hashslot(&ping_table, net, result);
			sk_for_each(sk2, hlist) {
				if (!net_eq(sock_net(sk2), net))
					continue;
				isk2 = inet_sk(sk2);

				if (isk2->inet_num == result)
					goto next_port;
			}

			/* found */
			net->ipv4.ping_port_rover = ident = result;
			break;
next_port:
			;
		}
		/* The loop ran to completion without a break: nothing was free. */
		if (i >= (1L << 16))
			goto fail;
	} else {
		hlist = ping_hashslot(&ping_table, net, ident);
		sk_for_each(sk2, hlist) {
			if (!net_eq(sock_net(sk2), net))
				continue;
			isk2 = inet_sk(sk2);

			/* BUG? Why is this reuse and not reuseaddr? ping.c
			 * doesn't turn off SO_REUSEADDR, and it doesn't expect
			 * that other ping processes can steal its packets.
			 */
			if ((isk2->inet_num == ident) &&
			    (sk2 != sk) &&
			    (!sk2->sk_reuse || !sk->sk_reuse))
				goto fail;
		}
	}

	pr_debug("found port/ident = %d\n", ident);
	isk->inet_num = ident;
	/* hlist is the bucket of the ident chosen above. */
	if (sk_unhashed(sk)) {
		pr_debug("was not hashed\n");
		sk_add_node_rcu(sk, hlist);
		sock_set_flag(sk, SOCK_RCU_FREE);
		sock_prot_inuse_add(net, sk->sk_prot, 1);
	}
	spin_unlock(&ping_table.lock);
	return 0;

fail:
	spin_unlock(&ping_table.lock);
	return -EADDRINUSE;
}
EXPORT_IPV6_MOD_GPL(ping_get_port);

/*
 * Remove @sk from ping_table under ping_table.lock. Only if the socket was
 * actually hashed are its ident and source port cleared and the protocol
 * in-use counter decremented.
 */
void ping_unhash(struct sock *sk)
{
	struct inet_sock *isk = inet_sk(sk);

	pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
	spin_lock(&ping_table.lock);
	if (sk_del_node_init_rcu(sk)) {
		/* WRITE_ONCE pairs with the READ_ONCE of inet_num in ping_lookup(). */
		WRITE_ONCE(isk->inet_num, 0);
		isk->inet_sport = 0;
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	}
	spin_unlock(&ping_table.lock);
}
EXPORT_IPV6_MOD_GPL(ping_unhash);

/*
 * Find the ping socket in @net that @skb with ident @ident belongs to.
 *
 * A socket matches when it is in the same netns, has inet_num == @ident, has
 * the address family corresponding to skb->protocol, is either bound to the
 * wildcard address or to the packet's destination address, and - if bound to
 * a device - that device equals the packet's dif or sdif.
 *
 * Returns the first match, or NULL if there is none or the skb is neither
 * IPv4 nor (with CONFIG_IPV6) IPv6. No reference is taken on the socket in
 * this function.
 */
/* Called under rcu_read_lock() */
static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
{
	struct hlist_head *hslot = ping_hashslot(&ping_table, net, ident);
	struct sock *sk = NULL;
	struct inet_sock *isk;
	int dif, sdif;

	if (skb->protocol == htons(ETH_P_IP)) {
		dif = inet_iif(skb);
		sdif = inet_sdif(skb);
		pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
			 (int)ident, &ip_hdr(skb)->daddr, dif);
#if IS_ENABLED(CONFIG_IPV6)
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		dif = inet6_iif(skb);
		sdif = inet6_sdif(skb);
		pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n",
			 (int)ident, &ipv6_hdr(skb)->daddr, dif);
#endif
	} else {
		return NULL;
	}

	sk_for_each_rcu(sk, hslot) {
		int bound_dev_if;

		if (!net_eq(sock_net(sk), net))
			continue;
		isk = inet_sk(sk);

		pr_debug("iterate\n");
		if (READ_ONCE(isk->inet_num) != ident)
			continue;

		bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
		if (skb->protocol == htons(ETH_P_IP) &&
		    sk->sk_family == AF_INET) {
			__be32 rcv_saddr = READ_ONCE(isk->inet_rcv_saddr);

			pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk,
				 ident, &rcv_saddr,
				 bound_dev_if);

			/* A zero rcv_saddr is a wildcard bind and matches any daddr. */
			if (rcv_saddr && rcv_saddr != ip_hdr(skb)->daddr)
				continue;
#if IS_ENABLED(CONFIG_IPV6)
		} else if (skb->protocol == htons(ETH_P_IPV6) &&
			   sk->sk_family == AF_INET6) {

			pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk,
				 ident,
				 &sk->sk_v6_rcv_saddr,
				 bound_dev_if);

			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
			    !ipv6_addr_equal(&sk->sk_v6_rcv_saddr,
					     &ipv6_hdr(skb)->daddr))
				continue;
#endif
		} else {
			/* Socket family does not match the packet's protocol. */
			continue;
		}

		if (bound_dev_if && bound_dev_if != dif &&
		    bound_dev_if != sdif)
			continue;

		goto exit;
	}

	/* Fell off the end of the bucket without a match. */
	sk = NULL;
exit:

	return sk;
}

/*
 * Read the ping group range of @net into *@low and *@high. The read is
 * repeated until read_seqretry() reports no retry is needed, so both values
 * come from the same update.
 */
static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
					  kgid_t *high)
{
	kgid_t *data = net->ipv4.ping_group_range.range;
	unsigned int seq;

	do {
		seq = read_seqbegin(&net->ipv4.ping_group_range.lock);

		*low = data[0];
		*high = data[1];
	} while (read_seqretry(&net->ipv4.ping_group_range.lock, seq));
}

/*
 * Permission check run when a ping socket is initialised.
 *
 * Returns 0 if the caller's effective GID, or any group returned by
 * get_current_groups(), lies within the netns ping group range (inclusive on
 * both ends); -EACCES otherwise. AF_INET6 sockets additionally get
 * sk_ipv6only set, regardless of the outcome.
 */
int ping_init_sock(struct sock *sk)
{
	struct net *net = sock_net(sk);
	kgid_t group = current_egid();
	struct group_info *group_info;
	int i;
	kgid_t low, high;
	int ret = 0;

	if (sk->sk_family == AF_INET6)
		sk->sk_ipv6only = 1;

	inet_get_ping_group_range_net(net, &low, &high);
	if (gid_lte(low, group) && gid_lte(group, high))
		return 0;

	/* get_current_groups() is balanced by put_group_info() on every path below. */
	group_info = get_current_groups();
	for (i = 0; i < group_info->ngroups; i++) {
		kgid_t gid = group_info->gid[i];

		if (gid_lte(low, gid) && gid_lte(gid, high))
			goto out_release_group;
	}

	ret = -EACCES;

out_release_group:
	put_group_info(group_info);
	return ret;
}
EXPORT_IPV6_MOD_GPL(ping_init_sock);

/* Log the socket's state and release it via sk_common_release(); @timeout is unused. */
void ping_close(struct sock *sk, long timeout)
{
	pr_debug("ping_close(sk=%p,sk->num=%u)\n",
		 inet_sk(sk), inet_sk(sk)->inet_num);
	pr_debug("isk->refcnt = %d\n", refcount_read(&sk->sk_refcnt));

	sk_common_release(sk);
}
EXPORT_IPV6_MOD_GPL(ping_close);

/*
 * Validate @addr_len and then run BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK() on
 * the address. Returns -EINVAL for a too-short address, otherwise the result
 * of the BPF hook.
 */
static int ping_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
			    int addr_len)
{
	/* This check is replicated from __ip4_datagram_connect() and
	 * intended to prevent BPF program called below from accessing bytes
	 * that are out of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
}

/*
 * Returns 0 if the address may be bound, otherwise:
 *   -EINVAL        address too short, or (IPv6) neither any/unicast, or a
 *                  scoped address without sin6_scope_id
 *   -EAFNOSUPPORT  address family does not fit the socket family
 *   -EADDRNOTAVAIL multicast/broadcast (IPv4), or a non-local address while
 *                  non-local bind is not permitted
 *   -ENODEV        (IPv6) the scope id or bound device does not resolve
 */
/* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
				struct sockaddr_unsized *uaddr, int addr_len)
{
	struct net *net = sock_net(sk);
	if (sk->sk_family == AF_INET) {
		struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
		u32 tb_id = RT_TABLE_LOCAL;
		int chk_addr_ret;

		if (addr_len < sizeof(*addr))
			return -EINVAL;

		/* AF_UNSPEC is tolerated only together with INADDR_ANY. */
		if (addr->sin_family != AF_INET &&
		    !(addr->sin_family == AF_UNSPEC &&
		      addr->sin_addr.s_addr == htonl(INADDR_ANY)))
			return -EAFNOSUPPORT;

		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
			 sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));

		/* The wildcard address needs no further validation. */
		if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
			return 0;

		/* Use the l3mdev table if the lookup yields one, else keep RT_TABLE_LOCAL. */
		tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
		chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);

		if (chk_addr_ret == RTN_MULTICAST ||
		    chk_addr_ret == RTN_BROADCAST ||
		    (chk_addr_ret != RTN_LOCAL &&
		     !inet_can_nonlocal_bind(net, isk)))
			return -EADDRNOTAVAIL;

#if IS_ENABLED(CONFIG_IPV6)
	} else if (sk->sk_family == AF_INET6) {
		struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr;
		int addr_type, scoped, has_addr;
		struct net_device *dev = NULL;

		if (addr_len < sizeof(*addr))
			return -EINVAL;

		if (addr->sin6_family != AF_INET6)
			return -EAFNOSUPPORT;

		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n",
			 sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port));

		addr_type = ipv6_addr_type(&addr->sin6_addr);
		scoped = __ipv6_addr_needs_scope_id(addr_type);
		if ((addr_type != IPV6_ADDR_ANY &&
		     !(addr_type & IPV6_ADDR_UNICAST)) ||
		    (scoped && !addr->sin6_scope_id))
			return -EINVAL;

		/* dev is only used inside this RCU read-side section. */
		rcu_read_lock();
		if (addr->sin6_scope_id) {
			dev = dev_get_by_index_rcu(net, addr->sin6_scope_id);
			if (!dev) {
				rcu_read_unlock();
				return -ENODEV;
			}
		}

		/* No scope id given: fall back to the device the socket is bound to. */
		if (!dev && sk->sk_bound_dev_if) {
			dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
			if (!dev) {
				rcu_read_unlock();
				return -ENODEV;
			}
		}
		has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev,
						    scoped);
		rcu_read_unlock();

		if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr ||
		      addr_type == IPV6_ADDR_ANY))
			return -EADDRNOTAVAIL;

		/* This is the sk_bound_dev_if modification mentioned above. */
		if (scoped)
			sk->sk_bound_dev_if = addr->sin6_scope_id;
#endif
	} else {
		return -EAFNOSUPPORT;
	}
	return 0;
}

/*
 * Record the bound source address from @saddr in @sk.
 * AF_INET: sets inet_saddr and inet_rcv_saddr.
 * AF_INET6 (with CONFIG_IPV6): sets np->saddr and sk_v6_rcv_saddr.
 * Any other sa_family leaves the socket untouched.
 */
static void ping_set_saddr(struct sock *sk, struct sockaddr_unsized *saddr)
{
	if (saddr->sa_family == AF_INET) {
		struct inet_sock *isk = inet_sk(sk);
		struct sockaddr_in *addr = (struct sockaddr_in *) saddr;

		isk->inet_saddr = addr->sin_addr.s_addr;
		/* WRITE_ONCE pairs with the READ_ONCE of inet_rcv_saddr in ping_lookup(). */
		WRITE_ONCE(isk->inet_rcv_saddr, addr->sin_addr.s_addr);
#if IS_ENABLED(CONFIG_IPV6)
	} else if (saddr->sa_family == AF_INET6) {
		struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr;
		struct ipv6_pinfo *np = inet6_sk(sk);
		sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr;
#endif
	}
}

/*
We need our own bind because there are no privileged id's == local ports. * Moreover, we don't allow binding to multi- and broadcast addresses. */ int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *isk = inet_sk(sk); unsigned short snum; int err; int dif = sk->sk_bound_dev_if; err = ping_check_bind_addr(sk, isk, uaddr, addr_len); if (err) return err; lock_sock(sk); err = -EINVAL; if (isk->inet_num != 0) goto out; err = -EADDRINUSE; snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port); if (ping_get_port(sk, snum) != 0) { /* Restore possibly modified sk->sk_bound_dev_if by ping_check_bind_addr(). */ sk->sk_bound_dev_if = dif; goto out; } ping_set_saddr(sk, uaddr); pr_debug("after bind(): num = %hu, dif = %d\n", isk->inet_num, sk->sk_bound_dev_if); err = 0; if (sk->sk_family == AF_INET && isk->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; #endif if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; isk->inet_sport = htons(isk->inet_num); isk->inet_daddr = 0; isk->inet_dport = 0; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr)); #endif sk_dst_reset(sk); out: release_sock(sk); pr_debug("ping_v4_bind -> %d\n", err); return err; } EXPORT_IPV6_MOD_GPL(ping_bind); /* * Is this a supported type of ICMP message? */ static inline int ping_supported(int family, int type, int code) { return (family == AF_INET && type == ICMP_ECHO && code == 0) || (family == AF_INET && type == ICMP_EXT_ECHO && code == 0) || (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0) || (family == AF_INET6 && type == ICMPV6_EXT_ECHO_REQUEST && code == 0); } /* * This routine is called by the ICMP module when it gets some * sort of error condition. 
*/

void ping_err(struct sk_buff *skb, int offset, u32 info)
{
	int family;
	struct icmphdr *icmph;
	struct inet_sock *inet_sock;
	int type;
	int code;
	struct net *net = dev_net(skb->dev);
	struct sock *sk;
	int harderr;
	int err;

	/*
	 * type/code come from the skb's own ICMP header, while icmph points
	 * at skb->data + offset.
	 * NOTE(review): icmph is presumably the echo header quoted inside the
	 * error message - confirm against the callers.
	 */
	if (skb->protocol == htons(ETH_P_IP)) {
		family = AF_INET;
		type = icmp_hdr(skb)->type;
		code = icmp_hdr(skb)->code;
		icmph = (struct icmphdr *)(skb->data + offset);
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		family = AF_INET6;
		type = icmp6_hdr(skb)->icmp6_type;
		code = icmp6_hdr(skb)->icmp6_code;
		icmph = (struct icmphdr *) (skb->data + offset);
	} else {
		BUG();
	}

	/* We assume the packet has already been checked by icmp_unreach */

	if (!ping_supported(family, icmph->type, icmph->code))
		return;

	pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n",
		 skb->protocol, type, code, ntohs(icmph->un.echo.id),
		 ntohs(icmph->un.echo.sequence));

	/* The echo id at icmph selects the socket the error is reported to. */
	sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
	if (!sk) {
		pr_debug("no socket, dropping\n");
		return;	/* No socket for error */
	}
	pr_debug("err on socket %p\n", sk);

	err = 0;
	harderr = 0;
	inet_sock = inet_sk(sk);

	/* Translate the ICMP type/code into an errno and a "hard error" flag. */
	if (skb->protocol == htons(ETH_P_IP)) {
		switch (type) {
		default:
		case ICMP_TIME_EXCEEDED:
			err = EHOSTUNREACH;
			break;
		case ICMP_SOURCE_QUENCH:
			/* This is not a real error but ping wants to see it.
			 * Report it with some fake errno.
			 */
			err = EREMOTEIO;
			break;
		case ICMP_PARAMETERPROB:
			err = EPROTO;
			harderr = 1;
			break;
		case ICMP_DEST_UNREACH:
			if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
				ipv4_sk_update_pmtu(skb, sk, info);
				if (READ_ONCE(inet_sock->pmtudisc) != IP_PMTUDISC_DONT) {
					err = EMSGSIZE;
					harderr = 1;
					break;
				}
				/* With IP_PMTUDISC_DONT nothing is reported to the socket. */
				goto out;
			}
			err = EHOSTUNREACH;
			if (code <= NR_ICMP_UNREACH) {
				harderr = icmp_err_convert[code].fatal;
				err = icmp_err_convert[code].errno;
			}
			break;
		case ICMP_REDIRECT:
			/* See ICMP_SOURCE_QUENCH */
			ipv4_sk_redirect(skb, sk);
			err = EREMOTEIO;
			break;
		}
#if IS_ENABLED(CONFIG_IPV6)
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		harderr = pingv6_ops.icmpv6_err_convert(type, code, &err);
#endif
	}

	/*
	 *      RFC1122: OK.  Passes ICMP errors back to application, as per
	 *	4.1.3.3.
	 */
	if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) ||
	    (family == AF_INET6 && !inet6_test_bit(RECVERR6, sk))) {
		/*
		 * Without RECVERR only hard errors on an established
		 * (TCP_ESTABLISHED state) socket are reported.
		 */
		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
			goto out;
	} else {
		if (family == AF_INET) {
			ip_icmp_error(sk, skb, err, 0 /* no remote port */,
				      info, (u8 *)icmph);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			pingv6_ops.ipv6_icmp_error(sk, skb, err, 0,
						   info, (u8 *)icmph);
#endif
		}
	}
	/* Latch the error on the socket and signal it. */
	sk->sk_err = err;
	sk_error_report(sk);
out:
	return;
}
EXPORT_IPV6_MOD_GPL(ping_err);

/*
 *	Copy and checksum an ICMP Echo packet from user space into a buffer
 *	starting from the payload.
 *
 *	@from is the struct pingfakehdr describing the message; @fraglen bytes
 *	are copied from its msg_iter into @to while updating pfh->wcheck.
 *	@offset is not used. Returns 0 on success, -EFAULT if the copy fails.
 */

int ping_getfrag(void *from, char *to,
		 int offset, int fraglen, int odd, struct sk_buff *skb)
{
	struct pingfakehdr *pfh = from;

	if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck,
					  &pfh->msg->msg_iter))
		return -EFAULT;

#if IS_ENABLED(CONFIG_IPV6)
	/* For IPv6, checksum each skb as we go along, as expected by
	 * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in
	 * wcheck, it will be finalized in ping_v4_push_pending_frames.
	 */
	if (pfh->family == AF_INET6) {
		skb->csum = csum_block_add(skb->csum, pfh->wcheck, odd);
		skb->ip_summed = CHECKSUM_NONE;
		pfh->wcheck = 0;
	}
#endif

	return 0;
}
EXPORT_IPV6_MOD_GPL(ping_getfrag);

/*
 * Finish an IPv4 echo: fold the ICMP header in pfh->icmph into the checksum
 * accumulated in pfh->wcheck, copy the completed header into the first skb on
 * the write queue and hand the queue to ip_push_pending_frames().
 * Returns 0 when nothing is queued, otherwise the push result.
 */
static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
				       struct flowi4 *fl4)
{
	struct sk_buff *skb = skb_peek(&sk->sk_write_queue);

	if (!skb)
		return 0;
	pfh->wcheck = csum_partial((char *)&pfh->icmph,
		sizeof(struct icmphdr), pfh->wcheck);
	pfh->icmph.checksum = csum_fold(pfh->wcheck);
	memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr));
	skb->ip_summed = CHECKSUM_NONE;
	return ip_push_pending_frames(sk, fl4);
}

/*
 * Family-independent sendmsg validation. Copies the first @icmph_len bytes of
 * the message into @user_icmph (consuming them from @msg) and checks that the
 * type/code pair is one ping_supported() accepts.
 *
 * Returns 0 on success, or
 *   -EMSGSIZE   @len exceeds 0xFFFF
 *   -EINVAL     @len is shorter than @icmph_len, or the type/code is unsupported
 *   -EOPNOTSUPP MSG_OOB was requested
 *   -EFAULT     the header could not be copied from @msg
 */
int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
			void *user_icmph, size_t icmph_len)
{
	u8 type, code;

	if (len > 0xFFFF)
		return -EMSGSIZE;

	/* Must have at least a full ICMP header. */
	if (len < icmph_len)
		return -EINVAL;

	/*
	 *	Check the flags.
	 */

	/* Mirror BSD error message compatibility */
	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	/*
	 *	Fetch the ICMP header provided by the userland.
	 *	iovec is modified! The ICMP header is consumed.
	 */
	if (memcpy_from_msg(user_icmph, msg, icmph_len))
		return -EFAULT;

	if (family == AF_INET) {
		type = ((struct icmphdr *) user_icmph)->type;
		code = ((struct icmphdr *) user_icmph)->code;
#if IS_ENABLED(CONFIG_IPV6)
	} else if (family == AF_INET6) {
		type = ((struct icmp6hdr *) user_icmph)->icmp6_type;
		code = ((struct icmp6hdr *) user_icmph)->icmp6_code;
#endif
	} else {
		BUG();
	}

	if (!ping_supported(family, type, code))
		return -EINVAL;

	return 0;
}
EXPORT_IPV6_MOD_GPL(ping_common_sendmsg);

/*
 * IPv4 sendmsg for ping sockets: validate the message, resolve destination
 * and IP options, obtain a route, then build and push the echo under the
 * socket lock. The echo id placed in the header is the socket's inet_sport;
 * type, code and sequence come from the user-supplied header.
 *
 * Returns @len on success or a negative errno.
 */
static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data,
			IP_OPTIONS_DATA_FIXED_SIZE);
	struct net *net = sock_net(sk);
	struct flowi4 fl4;
	struct inet_sock *inet = inet_sk(sk);
	struct ipcm_cookie ipc;
	struct icmphdr user_icmph;
	struct pingfakehdr pfh;
	struct rtable *rt = NULL;
	int free = 0;
	__be32 saddr, daddr, faddr;
	u8 scope;
	int err;

	pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);

	err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph,
				  sizeof(user_icmph));
	if (err)
		return err;

	/*
	 *	Get and verify the address.
	 */

	if (msg->msg_name) {
		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
		if (msg->msg_namelen < sizeof(*usin))
			return -EINVAL;
		if (usin->sin_family != AF_INET)
			return -EAFNOSUPPORT;
		daddr = usin->sin_addr.s_addr;
		/* no remote port */
	} else {
		/* No explicit destination: only valid on a connected socket. */
		if (sk->sk_state != TCP_ESTABLISHED)
			return -EDESTADDRREQ;
		daddr = inet->inet_daddr;
		/* no remote port */
	}

	ipcm_init_sk(&ipc, inet);

	if (msg->msg_controllen) {
		err = ip_cmsg_send(sk, msg, &ipc, false);
		if (unlikely(err)) {
			kfree(ipc.opt);
			return err;
		}
		/* Options that came from cmsg are freed at out_free. */
		if (ipc.opt)
			free = 1;
	}
	if (!ipc.opt) {
		struct ip_options_rcu *inet_opt;

		/* Fall back to a stack copy of the socket's options (free stays 0). */
		rcu_read_lock();
		inet_opt = rcu_dereference(inet->inet_opt);
		if (inet_opt) {
			memcpy(opt_copy, inet_opt,
			       sizeof(*inet_opt) + inet_opt->opt.optlen);
			ipc.opt = opt_copy;
		}
		rcu_read_unlock();
	}

	saddr = ipc.addr;
	ipc.addr = faddr = daddr;

	/* With the srr option set, the flow is routed to opt.faddr instead of daddr. */
	if (ipc.opt && ipc.opt->opt.srr) {
		if (!daddr) {
			err = -EINVAL;
			goto out_free;
		}
		faddr = ipc.opt->opt.faddr;
	}
	scope = ip_sendmsg_scope(inet, &ipc, msg);

	if (ipv4_is_multicast(daddr)) {
		if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
			ipc.oif = READ_ONCE(inet->mc_index);
		if (!saddr)
			saddr = READ_ONCE(inet->mc_addr);
	} else if (!ipc.oif)
		ipc.oif = READ_ONCE(inet->uc_index);

	flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark,
			   ipc.tos & INET_DSCP_MASK, scope,
			   sk->sk_protocol, inet_sk_flowi_flags(sk), faddr,
			   saddr, 0, 0, sk_uid(sk));

	fl4.fl4_icmp_type = user_icmph.type;
	fl4.fl4_icmp_code = user_icmph.code;

	security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4));
	rt = ip_route_output_flow(net, &fl4, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* rt is cleared before jumping to out, where ip_rt_put(rt) runs. */
		rt = NULL;
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		goto out;
	}

	/* Broadcast routes require SOCK_BROADCAST on the socket. */
	err = -EACCES;
	if ((rt->rt_flags & RTCF_BROADCAST) &&
	    !sock_flag(sk, SOCK_BROADCAST))
		goto out;

	if (msg->msg_flags & MSG_CONFIRM)
		goto do_confirm;
back_from_confirm:

	if (!ipc.addr)
		ipc.addr = fl4.daddr;

	lock_sock(sk);

	pfh.icmph.type = user_icmph.type; /* already checked */
	pfh.icmph.code = user_icmph.code; /* ditto */
	pfh.icmph.checksum = 0;
	pfh.icmph.un.echo.id = inet->inet_sport;
	pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
	pfh.msg = msg;
	pfh.wcheck = 0;
	pfh.family = AF_INET;

	err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len,
			     sizeof(struct icmphdr), &ipc, &rt,
			     msg->msg_flags);
	if (err)
		ip_flush_pending_frames(sk);
	else
		err = ping_v4_push_pending_frames(sk, &pfh, &fl4);
	release_sock(sk);

out:
	ip_rt_put(rt);
out_free:
	if (free)
		kfree(ipc.opt);
	if (!err)
		return len;
	return err;

do_confirm:
	if (msg->msg_flags & MSG_PROBE)
		dst_confirm_neigh(&rt->dst, &fl4.daddr);
	/* A zero-length MSG_PROBE sends nothing; everything else resumes the send path. */
	if (!(msg->msg_flags & MSG_PROBE) || len)
		goto back_from_confirm;
	err = 0;
	goto out;
}

/*
 * Receive one datagram from a ping socket.
 *
 * MSG_OOB is rejected with -EOPNOTSUPP; MSG_ERRQUEUE is delegated to
 * inet_recv_error(). Otherwise one skb is dequeued, up to @len bytes are
 * copied to @msg (MSG_TRUNC is set when the datagram is longer), and the
 * source address and control messages are filled in according to the socket
 * family.
 *
 * Returns the number of bytes copied or a negative errno.
 */
int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags)
{
	struct inet_sock *isk = inet_sk(sk);
	int family = sk->sk_family;
	struct sk_buff *skb;
	int copied, err;

	pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk,
		 READ_ONCE(isk->inet_num));

	err = -EOPNOTSUPP;
	if (flags & MSG_OOB)
		goto out;

	if (flags & MSG_ERRQUEUE)
		return inet_recv_error(sk, msg, len);

	skb = skb_recv_datagram(sk, flags, &err);
	if (!skb)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}

	/* Don't bother checking the checksum */
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto done;

	sock_recv_timestamp(msg, sk, skb);

	/* Copy the address and add cmsg data. */
	if (family == AF_INET) {
		DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);

		if (sin) {
			sin->sin_family = AF_INET;
			sin->sin_port = 0 /* skb->h.uh->source */;
			sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
			msg->msg_namelen = sizeof(*sin);
		}

		if (inet_cmsg_flags(isk))
			ip_cmsg_recv(msg, skb);

#if IS_ENABLED(CONFIG_IPV6)
	} else if (family == AF_INET6) {
		struct ipv6hdr *ip6 = ipv6_hdr(skb);
		DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);

		if (sin6) {
			sin6->sin6_family = AF_INET6;
			sin6->sin6_port = 0;
			sin6->sin6_addr = ip6->saddr;
			sin6->sin6_flowinfo = 0;
			if (inet6_test_bit(SNDFLOW, sk))
				sin6->sin6_flowinfo = ip6_flowinfo(ip6);
			sin6->sin6_scope_id =
				ipv6_iface_scope_id(&sin6->sin6_addr,
						    inet6_iif(skb));
			msg->msg_namelen = sizeof(*sin6);
		}

		if (inet6_sk(sk)->rxopt.all)
			pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb);
		/* Protocol-specific ancillary data depends on the skb's actual protocol. */
		if (skb->protocol == htons(ETH_P_IPV6) &&
		    inet6_sk(sk)->rxopt.all)
			pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb);
		else if (skb->protocol == htons(ETH_P_IP) &&
			 inet_cmsg_flags(isk))
			ip_cmsg_recv(msg, skb);
#endif
	} else {
		BUG();
	}

	err = copied;

done:
	skb_free_datagram(sk, skb);
out:
	pr_debug("ping_recvmsg -> %d\n", err);
	return err;
}
EXPORT_IPV6_MOD_GPL(ping_recvmsg);

/*
 * Queue @skb on @sk's receive queue. On failure the skb is dropped with the
 * reason reported by sock_queue_rcv_skb_reason() and that reason is returned;
 * on success SKB_NOT_DROPPED_YET is returned.
 */
static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk,
						 struct sk_buff *skb)
{
	enum skb_drop_reason reason;

	pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
		 inet_sk(sk), inet_sk(sk)->inet_num, skb);
	if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
		sk_skb_reason_drop(sk, skb, reason);
		pr_debug("ping_queue_rcv_skb -> failed\n");
		return reason;
	}

	return SKB_NOT_DROPPED_YET;
}

/*
 * Wrapper around __ping_queue_rcv_skb(): returns -1 when it reports a
 * non-zero drop reason, 0 otherwise.
 */
int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	return __ping_queue_rcv_skb(sk, skb) ? -1 : 0;
}
EXPORT_IPV6_MOD_GPL(ping_queue_rcv_skb);

/*
 *	All we need to do is get the socket.
*/ enum skb_drop_reason ping_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); struct sock *sk; /* We assume the packet has already been checked by icmp_rcv */ pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); /* Push ICMP header back */ skb_push(skb, skb->data - (u8 *)icmph); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (sk) return __ping_queue_rcv_skb(sk, skb); kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET); return SKB_DROP_REASON_NO_SOCKET; } EXPORT_IPV6_MOD_GPL(ping_rcv); struct proto ping_prot = { .name = "PING", .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, .pre_connect = ping_pre_connect, .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = ping_v4_sendmsg, .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .release_cb = ip4_datagram_release_cb, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct inet_sock), }; EXPORT_IPV6_MOD(ping_prot); #ifdef CONFIG_PROC_FS static struct sock *ping_get_first(struct seq_file *seq, int start) { struct sock *sk; struct ping_iter_state *state = seq->private; struct net *net = seq_file_net(seq); for (state->bucket = start; state->bucket < PING_HTABLE_SIZE; ++state->bucket) { struct hlist_head *hslot; hslot = &ping_table.hash[state->bucket]; if (hlist_empty(hslot)) continue; sk_for_each(sk, hslot) { if (net_eq(sock_net(sk), net) && sk->sk_family == state->family) goto found; } } sk = NULL; found: return sk; } static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk) { struct ping_iter_state *state = seq->private; struct net *net = seq_file_net(seq); do { sk = sk_next(sk); } while (sk && (!net_eq(sock_net(sk), net))); if (!sk) return ping_get_first(seq, state->bucket + 1); return sk; } static struct sock 
*ping_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = ping_get_first(seq, 0); if (sk) while (pos && (sk = ping_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) __acquires(ping_table.lock) { struct ping_iter_state *state = seq->private; state->bucket = 0; state->family = family; spin_lock(&ping_table.lock); return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } EXPORT_IPV6_MOD_GPL(ping_seq_start); static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos) { return ping_seq_start(seq, pos, AF_INET); } void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = ping_get_idx(seq, 0); else sk = ping_get_next(seq, v); ++*pos; return sk; } EXPORT_IPV6_MOD_GPL(ping_seq_next); void ping_seq_stop(struct seq_file *seq, void *v) __releases(ping_table.lock) { spin_unlock(&ping_table.lock); } EXPORT_IPV6_MOD_GPL(ping_seq_stop); static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp)); } static int ping_v4_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct ping_iter_state *state = seq->private; ping_v4_format_sock(v, seq, state->bucket); } seq_pad(seq, '\n'); return 0; } static const struct seq_operations 
ping_v4_seq_ops = { .start = ping_v4_seq_start, .show = ping_v4_seq_show, .next = ping_seq_next, .stop = ping_seq_stop, }; static int __net_init ping_v4_proc_init_net(struct net *net) { if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops, sizeof(struct ping_iter_state))) return -ENOMEM; net->ipv4.ping_port_rover = get_random_u16(); return 0; } static void __net_exit ping_v4_proc_exit_net(struct net *net) { remove_proc_entry("icmp", net->proc_net); } static struct pernet_operations ping_v4_net_ops = { .init = ping_v4_proc_init_net, .exit = ping_v4_proc_exit_net, }; int __init ping_proc_init(void) { return register_pernet_subsys(&ping_v4_net_ops); } void ping_proc_exit(void) { unregister_pernet_subsys(&ping_v4_net_ops); } #endif void __init ping_init(void) { int i; for (i = 0; i < PING_HTABLE_SIZE; i++) INIT_HLIST_HEAD(&ping_table.hash[i]); spin_lock_init(&ping_table.lock); }
43 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal AFS stuff * * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/pagemap.h> #include <linux/rxrpc.h> #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/fscache.h> #include <linux/backing-dev.h> #include <linux/uuid.h> #include <linux/mm_types.h> #include <linux/dns_resolver.h> #include <crypto/krb5.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "afs.h" #include "afs_vl.h" #define AFS_CELL_MAX_ADDRS 15 struct pagevec; struct afs_call; struct afs_vnode; struct afs_server_probe; /* * Partial file-locking emulation mode. (The problem being that AFS3 only * allows whole-file locks and no upgrading/downgrading). 
*/
enum afs_flock_mode {
	afs_flock_mode_unset,	/* NOTE(review): presumably "no mode chosen yet" - confirm */
	afs_flock_mode_local,	/* Local locking only */
	afs_flock_mode_openafs,	/* Don't get server lock for a partial lock */
	afs_flock_mode_strict,	/* Always get a server lock for a partial lock */
	afs_flock_mode_write,	/* Get an exclusive server lock for a partial lock */
};

/*
 * Parameters describing the volume to be mounted and how to mount it (see
 * the per-member notes).
 */
struct afs_fs_context {
	bool			force;		/* T to force cell type */
	bool			autocell;	/* T if set auto mount operation */
	bool			dyn_root;	/* T if dynamic root */
	bool			no_cell;	/* T if the source is "none" (for dynroot) */
	enum afs_flock_mode	flock_mode;	/* Partial file-locking emulation mode */
	afs_voltype_t		type;		/* type of volume requested */
	unsigned int		volnamesz;	/* size of volume name */
	const char		*volname;	/* name of volume to mount */
	struct afs_net		*net;		/* the AFS net namespace stuff */
	struct afs_cell		*cell;		/* cell in which to find volume */
	struct afs_volume	*volume;	/* volume record */
	struct key		*key;		/* key to use for secure mounting */
};

/*
 * State of a call (the type of afs_call::state).  Per the member notes, the
 * CL_ values are used on the client side and the SV_ values on the server
 * side.
 */
enum afs_call_state {
	AFS_CALL_CL_REQUESTING,		/* Client: Request is being sent */
	AFS_CALL_CL_AWAIT_REPLY,	/* Client: Awaiting reply */
	AFS_CALL_CL_PROC_REPLY,		/* Client: rxrpc call complete; processing reply */
	AFS_CALL_SV_AWAIT_OP_ID,	/* Server: Awaiting op ID */
	AFS_CALL_SV_AWAIT_REQUEST,	/* Server: Awaiting request data */
	AFS_CALL_SV_REPLYING,		/* Server: Replying */
	AFS_CALL_SV_AWAIT_ACK,		/* Server: Awaiting final ACK */
	AFS_CALL_COMPLETE,		/* Completed or failed */
};

/*
 * Address preferences.
*/
struct afs_addr_preference {
	union {
		struct in_addr	ipv4_addr;	/* AF_INET address to compare against */
		struct in6_addr	ipv6_addr;	/* AF_INET6 address to compare against */
	};
	sa_family_t		family;		/* Which address to use */
	u16			prio;		/* Priority */
	u8			subnet_mask;	/* How many bits to compare */
};

/*
 * RCU-freeable table of address preferences; prefs[] has room for max_prefs
 * entries.
 */
struct afs_addr_preference_list {
	struct rcu_head		rcu;
	u16			version;	/* Incremented when prefs list changes */
	u8			ipv6_off;	/* Offset of IPv6 addresses */
	u8			nr;		/* Number of addresses in total */
	u8			max_prefs;	/* Number of prefs allocated */
	struct afs_addr_preference prefs[] __counted_by(max_prefs);
};

/*
 * One entry of afs_addr_list::addrs[].
 */
struct afs_address {
	struct rxrpc_peer	*peer;
	short			last_error;	/* Last error from this address */
	u16			prio;		/* Address priority */
};

/*
 * List of server addresses.
 */
struct afs_addr_list {
	struct rcu_head		rcu;
	refcount_t		usage;
	u32			version;	/* Version */
	unsigned int		debug_id;
	unsigned int		addr_pref_version; /* Version of address preference list */
	unsigned char		max_addrs;
	unsigned char		nr_addrs;
	unsigned char		preferred;	/* Preferred address */
	unsigned char		nr_ipv4;	/* Number of IPv4 addresses */
	enum dns_record_source	source:8;
	enum dns_lookup_status	status:8;
	unsigned long		probe_failed;	/* Mask of addrs that failed locally/ICMP */
	unsigned long		responded;	/* Mask of addrs that responded */
	struct afs_address	addrs[] __counted_by(max_addrs);
	/* One bit per address in the unsigned long masks above. */
#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
};

/*
 * a record of an in-progress RxRPC call
 */
struct afs_call {
	const struct afs_call_type *type;	/* type of call */
	wait_queue_head_t	waitq;		/* processes awaiting completion */
	struct work_struct	async_work;	/* async I/O processor */
	struct work_struct	work;		/* actual work processor */
	struct work_struct	free_work;	/* Deferred free processor */
	struct rxrpc_call	*rxcall;	/* RxRPC call handle */
	struct rxrpc_peer	*peer;		/* Remote endpoint */
	struct key		*key;		/* security for this call */
	struct afs_net		*net;		/* The network namespace */
	struct afs_server	*server;	/* The fileserver record if fs op (pins ref) */
	struct afs_vlserver	*vlserver;	/* The vlserver record if vl op */
	void			*request;	/* request data (first part) */
	size_t			iov_len;	/* Size of *iter to be used */
	struct iov_iter		def_iter;	/* Default buffer/data iterator */
	struct iov_iter		*write_iter;	/* Iterator defining write to be made */
	struct iov_iter		*iter;		/* Iterator currently in use */
	union {	/* Convenience for ->def_iter */
		struct kvec	kvec[1];
		struct bio_vec	bvec[1];
	};
	void			*buffer;	/* reply receive buffer */
	union {
		struct afs_endpoint_state *probe;
		struct afs_addr_list	*vl_probe;
		struct afs_addr_list	*ret_alist;
		struct afs_vldb_entry	*ret_vldb;
		char			*ret_str;
	};
	struct afs_fid		fid;		/* Primary vnode ID (or all zeroes) */
	/* NOTE(review): no ->probe_alist member is visible in this struct - confirm */
	unsigned char		probe_index;	/* Address in ->probe_alist */
	struct afs_operation	*op;
	unsigned int		server_index;
	refcount_t		ref;
	enum afs_call_state	state;
	spinlock_t		state_lock;
	int			error;		/* error code */
	u32			abort_code;	/* Remote abort ID or 0 */
	unsigned long long	remaining;	/* How much is left to receive */
	unsigned int		max_lifespan;	/* Maximum lifespan in secs to set if not 0 */
	unsigned		request_size;	/* size of request data */
	unsigned		reply_max;	/* maximum size of reply */
	unsigned		count2;		/* count used in unmarshalling */
	unsigned char		unmarshall;	/* unmarshalling phase */
	bool			drop_ref;	/* T if need to drop ref for incoming call */
	bool			need_attention;	/* T if RxRPC poked us */
	bool			async;		/* T if asynchronous */
	bool			upgrade;	/* T to request service upgrade */
	bool			intr;		/* T if interruptible */
	bool			unmarshalling_error; /* T if an unmarshalling error occurred */
	bool			responded;	/* Got a response from the call (may be abort) */
	u8			security_ix;	/* Security class */
	u16			service_id;	/* Actual service ID (after upgrade) */
	unsigned int		debug_id;	/* Trace ID */
	u32			enctype;	/* Security encoding type */
	u32			operation_ID;	/* operation ID for an incoming call */
	u32			count;		/* count for use in unmarshalling */
	union {					/* place to extract temporary data */
		struct {
			__be32	tmp_u;
			__be32	tmp;
		} __attribute__((packed));
		__be64		tmp64;
	};
	ktime_t			issue_time;	/* Time of issue of operation */
};

/*
 * Per-call-type name, operation number and handler functions.
 */
struct afs_call_type {
	const char *name;
	unsigned int op; /* Really enum afs_fs_operation */

	/* deliver request or reply data to a call
	 * - returning an error will cause the call to be aborted
	 */
	int (*deliver)(struct afs_call *call);

	/* clean up a call */
	void (*destructor)(struct afs_call *call);

	/* Async receive processing function */
	void (*async_rx)(struct work_struct *work);

	/* Work function */
	void (*work)(struct work_struct *work);

	/* Call done function (gets called immediately on success or failure) */
	void (*done)(struct afs_call *call);

	/* Handle a call being immediately cancelled. */
	void (*immediate_cancel)(struct afs_call *call);
};

/*
 * Key available for writeback on a file.
 */
struct afs_wb_key {
	refcount_t		usage;
	struct key		*key;
	struct list_head	vnode_link;	/* Link in vnode->wb_keys */
};

/*
 * AFS open file information record.  Pointed to by file->private_data.
 */
struct afs_file {
	struct key		*key;		/* The key this file was opened with */
	struct afs_wb_key	*wb;		/* Writeback key record for this file */
};

/*
 * Return the key stored in the afs_file that file->private_data points to.
 * private_data is dereferenced without a NULL check.
 */
static inline struct key *afs_file_key(struct file *file)
{
	struct afs_file *af = file->private_data;

	return af->key;
}

/*
 * AFS superblock private data
 * - there's one superblock per volume
 */
struct afs_super_info {
	struct net		*net_ns;	/* Network namespace */
	struct afs_cell		*cell;		/* The cell in which the volume resides */
	struct afs_volume	*volume;	/* volume record */
	enum afs_flock_mode	flock_mode:8;	/* File locking emulation mode */
	bool			dyn_root;	/* True if dynamic root */
};

/*
 * Return sb->s_fs_info as an afs_super_info pointer.
 */
static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
{
	return sb->s_fs_info;
}

extern struct file_system_type afs_fs_type;

/*
 * Set of substitutes for @sys.
*/
struct afs_sysnames {
#define AFS_NR_SYSNAME 16
	char			*subs[AFS_NR_SYSNAME];	/* Up to AFS_NR_SYSNAME substitution strings */
	refcount_t		usage;
	unsigned short		nr;
	char			blank[1];
};

/*
 * AFS network namespace record.
 */
struct afs_net {
	struct net		*net;		/* Backpointer to the owning net namespace */
	struct afs_uuid		uuid;
	bool			live;		/* F if this namespace is being removed */

	/* AF_RXRPC I/O stuff */
	struct socket		*socket;
	struct afs_call		*spare_incoming_call;
	struct work_struct	charge_preallocation_work;
	struct work_struct	rx_oob_work;
	struct mutex		socket_mutex;
	atomic_t		nr_outstanding_calls;
	atomic_t		nr_superblocks;

	/* Cell database */
	struct rb_root		cells;
	struct idr		cells_dyn_ino;	/* cell->dynroot_ino mapping */
	struct afs_cell __rcu	*ws_cell;
	atomic_t		cells_outstanding;
	struct rw_semaphore	cells_lock;
	struct mutex		cells_alias_lock;

	struct mutex		proc_cells_lock;
	struct hlist_head	proc_cells;

	/* Known servers.  Theoretically each fileserver can only be in one
	 * cell, but in practice, people create aliases and subsets and there's
	 * no easy way to distinguish them.
	 */
	seqlock_t		fs_lock;	/* For fs_probe_*, fs_proc */
	struct list_head	fs_probe_fast;	/* List of afs_server to probe at 30s intervals */
	struct list_head	fs_probe_slow;	/* List of afs_server to probe at 5m intervals */
	struct hlist_head	fs_proc;	/* procfs servers list */

	struct key		*fs_cm_token_key; /* Key for creating CM tokens */
	struct work_struct	fs_prober;
	struct timer_list	fs_probe_timer;
	atomic_t		servers_outstanding;

	/* File locking renewal management */
	struct mutex		lock_manager_mutex;

	/* Misc */
	struct super_block	*dynroot_sb;	/* Dynamic root mount superblock */
	struct proc_dir_entry	*proc_afs;	/* /proc/net/afs directory */
	struct afs_sysnames	*sysnames;
	rwlock_t		sysnames_lock;
	struct afs_addr_preference_list __rcu *address_prefs;
	u16			address_pref_version;

	/* Statistics counters */
	atomic_t		n_lookup;	/* Number of lookups done */
	atomic_t		n_reval;	/* Number of dentries needing revalidation */
	atomic_t		n_inval;	/* Number of invalidations by the server */
	atomic_t		n_relpg;	/* Number of invalidations by release_folio */
	atomic_t		n_read_dir;	/* Number of directory pages read */
	atomic_t		n_dir_cr;	/* Number of directory entry creation edits */
	atomic_t		n_dir_rm;	/* Number of directory entry removal edits */
	atomic_t		n_stores;	/* Number of store ops */
	atomic_long_t		n_store_bytes;	/* Number of bytes stored */
	atomic_long_t		n_fetch_bytes;	/* Number of bytes fetched */
	atomic_t		n_fetches;	/* Number of data fetch ops */
};

extern const char afs_init_sysname[];

/*
 * State of a cell record (the type of afs_cell::state).
 */
enum afs_cell_state {
	AFS_CELL_SETTING_UP,
	AFS_CELL_UNLOOKED,
	AFS_CELL_ACTIVE,
	AFS_CELL_REMOVING,
	AFS_CELL_DEAD,
};

/*
 * AFS cell record.
 *
 * This is a tricky concept to get right as it is possible to create aliases
 * simply by pointing AFSDB/SRV records for two names at the same set of VL
 * servers; it is also possible to do things like setting up two sets of VL
 * servers, one of which provides a superset of the volumes provided by the
 * other (for internal/external division, for example).
*
 * Cells only exist in the sense that (a) a cell's name maps to a set of VL
 * servers and (b) a cell's name is used by the client to select the key to use
 * for authentication and encryption.  The cell name is not typically used in
 * the protocol.
 *
 * Two cells are determined to be aliases if they have an explicit alias (YFS
 * only), share any VL servers in common or have at least one volume in common.
 * "In common" means that the address list of the VL servers or the fileservers
 * share at least one endpoint.
 */
struct afs_cell {
	union {
		struct rcu_head	rcu;
		struct rb_node	net_node;	/* Node in net->cells */
	};
	struct afs_net		*net;
	struct afs_cell		*alias_of;	/* The cell this is an alias of */
	struct afs_volume	*root_volume;	/* The root.cell volume if there is one */
	struct key		*anonymous_key;	/* anonymous user key for this cell */
	struct work_struct	destroyer;	/* Destroyer for cell */
	struct work_struct	manager;	/* Manager for init/deinit/dns */
	struct timer_list	management_timer; /* General management timer */
	struct hlist_node	proc_link;	/* /proc cell list link */
	time64_t		dns_expiry;	/* Time AFSDB/SRV record expires */
	time64_t		last_inactive;	/* Time of last drop of usage count */
	refcount_t		ref;		/* Struct refcount */
	atomic_t		active;		/* Active usage counter */
	unsigned long		flags;		/* Bit numbers are the AFS_CELL_FL_* values below */
#define AFS_CELL_FL_NO_GC	0		/* The cell was added manually, don't auto-gc */
#define AFS_CELL_FL_DO_LOOKUP	1		/* DNS lookup requested */
#define AFS_CELL_FL_CHECK_ALIAS	2		/* Need to check for aliases */
	enum afs_cell_state	state;
	short			error;
	enum dns_record_source	dns_source:8;	/* Latest source of data from lookup */
	enum dns_lookup_status	dns_status:8;	/* Latest status of data from lookup */
	unsigned int		dns_lookup_count; /* Counter of DNS lookups */
	unsigned int		debug_id;
	unsigned int		dynroot_ino;	/* Inode numbers for dynroot (a pair) */

	/* The volumes belonging to this cell */
	/*
	 * NOTE(review): the next two member comments say "server" although
	 * the members live in struct afs_cell - confirm the wording.
	 */
	struct rw_semaphore	vs_lock;	/* Lock for server->volumes */
	struct rb_root		volumes;	/* Tree of volumes on this server */
	struct hlist_head	proc_volumes;	/* procfs volume list */
	seqlock_t		volume_lock;	/* For volumes */

	/* Active fileserver interaction state. */
	struct rb_root		fs_servers;	/* afs_server (by server UUID) */
	struct rw_semaphore	fs_lock;	/* For fs_servers  */

	/* VL server list. */
	rwlock_t		vl_servers_lock; /* Lock on vl_servers */
	struct afs_vlserver_list __rcu *vl_servers;

	u8			name_len;	/* Length of name */
	char			*name;		/* Cell name, case-flattened and NUL-padded */
	char			*key_desc;	/* Authentication key description */
};

/*
 * Volume Location server record.
 */
struct afs_vlserver {
	struct rcu_head		rcu;
	struct afs_addr_list	__rcu *addresses; /* List of addresses for this VL server */
	unsigned long		flags;		/* Bit numbers are the AFS_VLSERVER_FL_* values below */
#define AFS_VLSERVER_FL_PROBED	0		/* The VL server has been probed */
#define AFS_VLSERVER_FL_PROBING	1		/* VL server is being probed */
#define AFS_VLSERVER_FL_IS_YFS	2		/* Server is YFS not AFS */
#define AFS_VLSERVER_FL_RESPONDING 3		/* VL server is responding */
	rwlock_t		lock;		/* Lock on addresses */
	refcount_t		ref;
	unsigned int		rtt;		/* Server's current RTT in uS */
	unsigned int		debug_id;

	/* Probe state */
	wait_queue_head_t	probe_wq;
	atomic_t		probe_outstanding;
	spinlock_t		probe_lock;
	struct {
		unsigned int	rtt;		/* Best RTT in uS (or UINT_MAX) */
		u32		abort_code;
		short		error;
		unsigned short	flags;		/* Mask of the AFS_VLSERVER_PROBE_* values below */
#define AFS_VLSERVER_PROBE_RESPONDED		0x01 /* At least one response (may be abort) */
#define AFS_VLSERVER_PROBE_IS_YFS		0x02 /* The peer appears to be YFS */
#define AFS_VLSERVER_PROBE_NOT_YFS		0x04 /* The peer appears not to be YFS */
#define AFS_VLSERVER_PROBE_LOCAL_FAILURE	0x08 /* A local failure prevented a probe */
	} probe;

	u16			service_id;	/* Service ID we're using */
	u16			port;
	u16			name_len;	/* Length of name */
	char			name[];		/* Server name, case-flattened */
};

/*
 * Weighted list of Volume Location servers.
*/
/* One VL server together with its SRV-style priority and weight. */
struct afs_vlserver_entry {
	u16 priority;	/* Preference (as SRV) */
	u16 weight;	/* Weight (as SRV) */
	enum dns_record_source source:8;
	enum dns_lookup_status status:8;
	struct afs_vlserver *server;
};

/* List header followed by a flexible array of afs_vlserver_entry records. */
struct afs_vlserver_list {
	struct rcu_head rcu;
	refcount_t ref;
	u8 nr_servers;
	u8 index;	/* Server currently in use */
	u8 preferred;	/* Preferred server */
	enum dns_record_source source:8;
	enum dns_lookup_status status:8;
	rwlock_t lock;
	struct afs_vlserver_entry servers[];
};

/*
 * Cached VLDB entry.
 *
 * This is pointed to by cell->vldb_entries, indexed by name.
 *
 * NOTE(review): struct afs_cell as declared in this header has no
 * vldb_entries member, so the sentence above looks stale - confirm.
 */
struct afs_vldb_entry {
	afs_volid_t vid[3];	/* Volume IDs for R/W, R/O and Bak volumes */

	unsigned long flags;
#define AFS_VLDB_HAS_RW 0	/* - R/W volume exists */
#define AFS_VLDB_HAS_RO 1	/* - R/O volume exists */
#define AFS_VLDB_HAS_BAK 2	/* - Backup volume exists */
#define AFS_VLDB_QUERY_VALID 3	/* - Record is valid */
#define AFS_VLDB_QUERY_ERROR 4	/* - VL server returned error */

	/* The four arrays below all have AFS_NMAXNSERVERS slots. */
	uuid_t fs_server[AFS_NMAXNSERVERS];
	u32 addr_version[AFS_NMAXNSERVERS];	/* Registration change counters */
	u8 fs_mask[AFS_NMAXNSERVERS];
#define AFS_VOL_VTM_RW 0x01	/* R/W version of the volume is available (on this server) */
#define AFS_VOL_VTM_RO 0x02	/* R/O version of the volume is available (on this server) */
#define AFS_VOL_VTM_BAK 0x04	/* backup version of the volume is available (on this server) */
	u8 vlsf_flags[AFS_NMAXNSERVERS];
	short error;
	u8 nr_servers;	/* Number of server records */
	u8 name_len;
	u8 name[AFS_MAXVOLNAME + 1];	/* NUL-padded volume name */
};

/*
 * Fileserver endpoint state. This records the addresses of a fileserver's
 * endpoints and the state and result of a round of probing on them. This
 * allows the rotation algorithm to access those results without them being
 * erased by a subsequent round of probing.
 */
struct afs_endpoint_state {
	struct rcu_head rcu;
	struct afs_addr_list *addresses;	/* The addresses being probed */
	unsigned long responsive_set;	/* Bitset of responsive endpoints */
	unsigned long failed_set;	/* Bitset of endpoints we failed to probe */
	refcount_t ref;
	unsigned int server_id;	/* Debug ID of server */
	unsigned int probe_seq;	/* Probe sequence (from server::probe_counter) */
	atomic_t nr_probing;	/* Number of outstanding probes */
	unsigned int rtt;	/* Best RTT in uS (or UINT_MAX) */
	s32 abort_code;
	short error;
	unsigned long flags;
#define AFS_ESTATE_RESPONDED 0	/* Set if the server responded */
#define AFS_ESTATE_SUPERSEDED 1	/* Set if this record has been superseded */
#define AFS_ESTATE_IS_YFS 2	/* Set if probe upgraded to YFS */
#define AFS_ESTATE_NOT_YFS 3	/* Set if probe didn't upgrade to YFS */
#define AFS_ESTATE_LOCAL_FAILURE 4	/* Set if there was a local failure (eg. ENOMEM) */
};

/*
 * Record of fileserver with which we're actively communicating.
 */
struct afs_server {
	struct rcu_head rcu;
	union {
		uuid_t uuid;	/* Server ID */
		struct afs_uuid _uuid;	/* Same storage as ->uuid, viewed as struct afs_uuid */
	};

	struct afs_cell *cell;	/* Cell to which belongs (pins ref) */
	struct rb_node uuid_rb;	/* Link in cell->fs_servers */
	struct list_head probe_link;	/* Link in net->fs_probe_* */
	struct hlist_node proc_link;	/* Link in net->fs_proc */
	struct list_head volumes;	/* RCU list of afs_server_entry objects */
	struct work_struct destroyer;	/* Work item to try and destroy a server */
	struct timer_list timer;	/* Management timer */
	struct mutex cm_token_lock;	/* Lock governing creation of appdata */
	struct krb5_buffer cm_rxgk_appdata;	/* Appdata to be included in RESPONSE packet */
	time64_t unuse_time;	/* Time at which last unused */
	unsigned long flags;
#define AFS_SERVER_FL_RESPONDING 0	/* The server is responding */
#define AFS_SERVER_FL_UPDATING 1	/* NOTE(review): undocumented; presumably an update is in progress - confirm */
#define AFS_SERVER_FL_NEEDS_UPDATE 2	/* Fileserver address list is out of date */
#define AFS_SERVER_FL_UNCREATED 3	/* The record needs creating */
#define AFS_SERVER_FL_CREATING 4	/* The record is being created */
#define AFS_SERVER_FL_EXPIRED 5	/* The record has expired */
#define AFS_SERVER_FL_NOT_FOUND 6	/* VL server says no such server */
#define AFS_SERVER_FL_VL_FAIL 7	/* Failed to access VL server */
#define AFS_SERVER_FL_MAY_HAVE_CB 8	/* May have callbacks on this fileserver */
	/* Values 9-15 are not defined here; the capability-style flags start at 16. */
#define AFS_SERVER_FL_IS_YFS 16	/* Server is YFS not AFS */
#define AFS_SERVER_FL_NO_IBULK 17	/* Fileserver doesn't support FS.InlineBulkStatus */
#define AFS_SERVER_FL_NO_RM2 18	/* Fileserver doesn't support YFS.RemoveFile2 */
#define AFS_SERVER_FL_HAS_FS64 19	/* Fileserver supports FS.{Fetch,Store}Data64 */
#define AFS_SERVER_FL_NO_RENAME2 20	/* YFS Fileserver doesn't support enhanced rename */
	refcount_t ref;	/* Object refcount */
	atomic_t active;	/* Active user count */
	u32 addr_version;	/* Address list version */
	u16 service_id;	/* Service ID we're using. */
	short create_error;	/* Creation error */
	unsigned int rtt;	/* Server's current RTT in uS */
	unsigned int debug_id;	/* Debugging ID for traces */

	/* file service access */
	rwlock_t fs_lock;	/* access lock */

	/* Probe state */
	struct afs_endpoint_state __rcu *endpoint_state;	/* Latest endpoint/probe state */
	unsigned long probed_at;	/* Time last probe was dispatched (jiffies) */
	wait_queue_head_t probe_wq;
	unsigned int probe_counter;	/* Number of probes issued */
	spinlock_t probe_lock;
};

/* Stored in afs_server_list::ro_replicating; __mode(byte) requests byte-sized storage. */
enum afs_ro_replicating {
	AFS_RO_NOT_REPLICATING,	/* Not doing replication */
	AFS_RO_REPLICATING_USE_OLD,	/* Replicating; use old version */
	AFS_RO_REPLICATING_USE_NEW,	/* Replicating; switch to new version */
} __mode(byte);

/*
 * Replaceable volume server list.
*/
/* One (server, volume) pairing; element type of afs_server_list::servers[]. */
struct afs_server_entry {
	struct afs_server *server;
	struct afs_volume *volume;
	struct list_head slink;	/* Link in server->volumes */
	time64_t cb_expires_at;	/* Time at which volume-level callback expires */
	unsigned long flags;
#define AFS_SE_EXCLUDED 0	/* Set if server is to be excluded in rotation */
#define AFS_SE_VOLUME_OFFLINE 1	/* Set if volume offline notice given */
#define AFS_SE_VOLUME_BUSY 2	/* Set if volume busy notice given */
};

/* The server list itself: a header followed by a flexible array of entries. */
struct afs_server_list {
	struct rcu_head rcu;
	refcount_t usage;
	bool attached;	/* T if attached to servers */
	enum afs_ro_replicating ro_replicating;	/* RW->RO update (probably) in progress */
	unsigned char nr_servers;
	unsigned short vnovol_mask;	/* Servers to be skipped due to VNOVOL */
	unsigned int seq;	/* Set to ->servers_seq when installed */
	rwlock_t lock;
	struct afs_server_entry servers[];
};

/*
 * Live AFS volume management.
 */
struct afs_volume {
	struct rcu_head rcu;
	afs_volid_t vid;	/* The volume ID of this volume */
	afs_volid_t vids[AFS_MAXTYPES];	/* All associated volume IDs */
	refcount_t ref;
	unsigned int debug_id;	/* Debugging ID for traces */
	time64_t update_at;	/* Time at which to next update */
	struct afs_cell *cell;	/* Cell to which belongs (pins ref) */
	struct rb_node cell_node;	/* Link in cell->volumes */
	struct hlist_node proc_link;	/* Link in cell->proc_volumes */
	struct super_block __rcu *sb;	/* Superblock on which inodes reside */
	struct work_struct destructor;	/* Deferred destructor */
	unsigned long flags;
#define AFS_VOLUME_NEEDS_UPDATE 0	/* - T if an update needs performing */
#define AFS_VOLUME_UPDATING 1	/* - T if an update is in progress */
#define AFS_VOLUME_WAIT 2	/* - T if users must wait for update */
#define AFS_VOLUME_DELETED 3	/* - T if volume appears deleted */
#define AFS_VOLUME_MAYBE_NO_IBULK 4	/* - T if some servers don't have InlineBulkStatus */
#define AFS_VOLUME_RM_TREE 5	/* - Set if volume removed from cell->volumes */
#ifdef CONFIG_AFS_FSCACHE
	struct fscache_volume *cache;	/* Caching cookie */
#endif
	struct afs_server_list __rcu *servers;	/* List of servers on which volume resides */
	rwlock_t servers_lock;	/* Lock for ->servers */
	unsigned int servers_seq;	/* Incremented each time ->servers changes */

	/* RO release tracking */
	struct mutex volsync_lock;	/* Time/state evaluation lock */
	time64_t creation_time;	/* Volume creation time (or TIME64_MIN) */
	time64_t update_time;	/* Volume update time (or TIME64_MIN) */

	/* Callback management */
	struct mutex cb_check_lock;	/* Lock to control race to check after v_break */
	time64_t cb_expires_at;	/* Earliest volume callback expiry time */
	atomic_t cb_ro_snapshot;	/* RO volume update-from-snapshot counter */
	atomic_t cb_v_break;	/* Volume-break event counter. */
	atomic_t cb_v_check;	/* Volume-break has-been-checked counter. */
	atomic_t cb_scrub;	/* Scrub-all-data event counter. */
	rwlock_t cb_v_break_lock;
	struct rw_semaphore open_mmaps_lock;
	struct list_head open_mmaps;	/* List of vnodes that are mmapped */

	afs_voltype_t type;	/* type of volume */
	char type_force;	/* force volume type (suppress R/O -> R/W) */
	u8 name_len;
	u8 name[AFS_MAXVOLNAME + 1];	/* NUL-padded volume name */
};

/* States of a vnode's server-side file lock; stored in afs_vnode::lock_state. */
enum afs_lock_state {
	AFS_VNODE_LOCK_NONE,	/* The vnode has no lock on the server */
	AFS_VNODE_LOCK_WAITING_FOR_CB,	/* We're waiting for the server to break the callback */
	AFS_VNODE_LOCK_SETTING,	/* We're asking the server for a lock */
	AFS_VNODE_LOCK_GRANTED,	/* We have a lock on the server */
	AFS_VNODE_LOCK_EXTENDING,	/* We're extending a lock on the server */
	AFS_VNODE_LOCK_NEED_UNLOCK,	/* We need to unlock on the server */
	AFS_VNODE_LOCK_UNLOCKING,	/* We're telling the server to unlock */
	AFS_VNODE_LOCK_DELETED,	/* The vnode has been deleted whilst we have a lock */
};

/*
 * AFS inode private data.
 *
 * Note that afs_alloc_inode() *must* reset anything that could incorrectly
 * leak from one inode to another.
*/
struct afs_vnode {
	struct netfs_inode netfs;	/* Netfslib context and vfs inode */
	struct afs_volume *volume;	/* volume on which vnode resides */
	struct afs_fid fid;	/* the file identifier for this inode */
	struct afs_file_status status;	/* AFS status info for this file */
	afs_dataversion_t invalid_before;	/* Child dentries are invalid before this */
	struct afs_permits __rcu *permit_cache;	/* cache of permits so far obtained */
	struct list_head io_lock_waiters;	/* Threads waiting for the I/O lock */
	struct rw_semaphore validate_lock;	/* lock for validating this vnode */
	struct rw_semaphore rmdir_lock;	/* Lock for rmdir vs sillyrename */
	struct key *silly_key;	/* Silly rename key */
	spinlock_t wb_lock;	/* lock for wb_keys */
	spinlock_t lock;	/* waitqueue/flags lock */
	unsigned long flags;
#define AFS_VNODE_IO_LOCK 0	/* Set if the I/O serialisation lock is held */
#define AFS_VNODE_UNSET 1	/* set if vnode attributes not yet set */
#define AFS_VNODE_DIR_VALID 2	/* Set if dir contents are valid */
#define AFS_VNODE_ZAP_DATA 3	/* set if vnode's data should be invalidated */
#define AFS_VNODE_DELETED 4	/* set if vnode deleted on server */
#define AFS_VNODE_MOUNTPOINT 5	/* set if vnode is a mountpoint symlink */
	/* Value 6 is not defined in this list. */
#define AFS_VNODE_PSEUDODIR 7	/* set if Vnode is a pseudo directory */
#define AFS_VNODE_NEW_CONTENT 8	/* Set if file has new content (create/trunc-0) */
#define AFS_VNODE_SILLY_DELETED 9	/* Set if file has been silly-deleted */
#define AFS_VNODE_MODIFYING 10	/* Set if we're performing a modification op */
#define AFS_VNODE_DIR_READ 11	/* Set if we've read a dir's contents */

	struct folio_queue *directory;	/* Directory contents */
	struct list_head wb_keys;	/* List of keys available for writeback */
	struct list_head pending_locks;	/* locks waiting to be granted */
	struct list_head granted_locks;	/* locks granted on this file */
	struct delayed_work lock_work;	/* work to be done in locking */
	struct key *lock_key;	/* Key to be used in lock ops */
	ktime_t locked_at;	/* Time at which lock obtained */
	enum afs_lock_state lock_state : 8;
	afs_lock_type_t lock_type : 8;
	unsigned int directory_size;	/* Amount of space in ->directory */

	/* outstanding callback notification on this file */
	struct work_struct cb_work;	/* Work for mmap'd files */
	/*
	 * Link in cell->fs_open_mmaps.
	 * NOTE(review): struct afs_cell declares no fs_open_mmaps member in
	 * this header; presumably this links into volume->open_mmaps - confirm.
	 */
	struct list_head cb_mmap_link;
	void *cb_server;	/* Server with callback/filelock */
	atomic_t cb_nr_mmap;	/* Number of mmaps */
	unsigned int cb_ro_snapshot;	/* RO volume release counter on ->volume */
	unsigned int cb_scrub;	/* Scrub counter on ->volume */
	unsigned int cb_break;	/* Break counter on vnode */
	unsigned int cb_v_check;	/* Break check counter on ->volume */
	seqlock_t cb_lock;	/* Lock for ->cb_server, ->status, ->cb_*break */

	atomic64_t cb_expires_at;	/* time at which callback expires */
#define AFS_NO_CB_PROMISE TIME64_MIN
};

/*
 * Get the fscache cookie for a vnode.
 *
 * With CONFIG_AFS_FSCACHE this returns netfs_i_cookie() of the vnode's netfs
 * context; without it, NULL is always returned.
 */
static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
{
#ifdef CONFIG_AFS_FSCACHE
	return netfs_i_cookie(&vnode->netfs);
#else
	return NULL;
#endif
}

/*
 * Attach a cache cookie to a vnode.
 *
 * With CONFIG_AFS_FSCACHE the cookie is stored in vnode->netfs.cache and, if
 * it is non-NULL, mapping_set_release_always() is applied to the inode's
 * mapping. Without CONFIG_AFS_FSCACHE this does nothing.
 */
static inline void afs_vnode_set_cache(struct afs_vnode *vnode,
				       struct fscache_cookie *cookie)
{
#ifdef CONFIG_AFS_FSCACHE
	vnode->netfs.cache = cookie;
	if (cookie)
		mapping_set_release_always(vnode->netfs.inode.i_mapping);
#endif
}

/*
 * cached security record for one user's attempt to access a vnode
 */
struct afs_permit {
	struct key *key;	/* RxRPC ticket holding a security context */
	afs_access_t access;	/* CallerAccess value for this key */
};

/*
 * Immutable cache of CallerAccess records from attempts to access vnodes.
 * These may be shared between multiple vnodes.
*/
struct afs_permits {
	struct rcu_head rcu;
	struct hlist_node hash_node;	/* Link in hash */
	unsigned long h;	/* Hash value for this permit list */
	refcount_t usage;
	unsigned short nr_permits;	/* Number of records */
	bool invalidated;	/* Invalidated due to key change */
	/* Flexible array; its element count is declared to be ->nr_permits. */
	struct afs_permit permits[] __counted_by(nr_permits);	/* List of permits sorted by key pointer */
};

/*
 * Error prioritisation and accumulation.
 */
struct afs_error {
	s32 abort_code;	/* Cumulative abort code */
	short error;	/* Cumulative error */
	bool responded;	/* T if server responded */
	bool aborted;	/* T if ->error is from an abort */
};

/*
 * Cursor for iterating over a set of volume location servers.
 */
struct afs_vl_cursor {
	struct afs_cell *cell;	/* The cell we're querying */
	struct afs_vlserver_list *server_list;	/* Current server list (pins ref) */
	struct afs_vlserver *server;	/* Server on which this resides */
	struct afs_addr_list *alist;	/* Current address list (pins ref) */
	struct key *key;	/* Key for the server */
	unsigned long untried_servers;	/* Bitmask of untried servers */
	unsigned long addr_tried;	/* Tried addresses */
	struct afs_error cumul_error;	/* Cumulative error */
	unsigned int debug_id;
	s32 call_abort_code;
	short call_error;	/* Error from single call */
	short server_index;	/* Current server */
	signed char addr_index;	/* Current address */
	unsigned short flags;	/* Mask of AFS_VL_CURSOR_* values below */
#define AFS_VL_CURSOR_STOP 0x0001	/* Set to cease iteration */
#define AFS_VL_CURSOR_RETRY 0x0002	/* Set to do a retry */
#define AFS_VL_CURSOR_RETRIED 0x0004	/* Set if started a retry */
	short nr_iterations;	/* Number of server iterations */
	bool call_responded;	/* T if the current address responded */
};

/*
 * Fileserver state tracking for an operation. An array of these is kept,
 * indexed by server index.
 */
struct afs_server_state {
	/* Tracking of fileserver probe state. Other operations may interfere
	 * by probing a fileserver when accessing other volumes.
	 */
	unsigned int probe_seq;
	unsigned long untried_addrs;	/* Addresses we haven't tried yet */
	struct wait_queue_entry probe_waiter;
	struct afs_endpoint_state *endpoint_state;	/* Endpoint state being monitored */
};

/*
 * Fileserver operation methods.
 *
 * Each member takes the operation being processed. When each hook is invoked
 * is decided by code outside this header.
 */
struct afs_operation_ops {
	void (*issue_afs_rpc)(struct afs_operation *op);
	void (*issue_yfs_rpc)(struct afs_operation *op);
	void (*success)(struct afs_operation *op);
	void (*aborted)(struct afs_operation *op);
	void (*failed)(struct afs_operation *op);
	void (*edit_dir)(struct afs_operation *op);
	void (*put)(struct afs_operation *op);
};

/* Per-vnode parameters and results of an operation (see afs_operation::file[]). */
struct afs_vnode_param {
	struct afs_vnode *vnode;
	struct afs_fid fid;	/* Fid to access */
	struct afs_status_cb scb;	/* Returned status and callback promise */
	afs_dataversion_t dv_before;	/* Data version before the call */
	unsigned int cb_break_before;	/* cb_break before the call */
	u8 dv_delta;	/* Expected change in data version */
	bool put_vnode:1;	/* T if we have a ref on the vnode */
	bool need_io_lock:1;	/* T if we need the I/O lock on this */
	bool update_ctime:1;	/* Need to update the ctime */
	bool set_size:1;	/* Must update i_size */
	bool op_unlinked:1;	/* True if file was unlinked by op */
	bool speculative:1;	/* T if speculative status fetch (no vnode lock) */
	bool modification:1;	/* Set if the content gets modified */
};

/*
 * Fileserver operation wrapper, handling server and address rotation
 * asynchronously. May make simultaneous calls to multiple servers.
*/
struct afs_operation {
	struct afs_net *net;	/* Network namespace */
	struct key *key;	/* Key for the cell */
	const struct afs_call_type *type;	/* Type of call done */
	const struct afs_operation_ops *ops;

	/* Parameters/results for the operation */
	struct afs_volume *volume;	/* Volume being accessed */
	struct afs_vnode_param file[2];
	struct afs_vnode_param *more_files;
	struct afs_volsync pre_volsync;	/* Volsync before op */
	struct afs_volsync volsync;	/* Volsync returned by op */
	struct dentry *dentry;	/* Dentry to be altered */
	struct dentry *dentry_2;	/* Second dentry to be altered */
	struct timespec64 mtime;	/* Modification time to record */
	struct timespec64 ctime;	/* Change time to set */
	struct afs_error cumul_error;	/* Cumulative error */
	short nr_files;	/* Number of entries in file[], more_files */
	unsigned int debug_id;

	unsigned int cb_v_break;	/* Volume break counter before op */

	/* Per-operation-type arguments; the members share storage. */
	union {
		struct {
			int which;	/* Which ->file[] to fetch for */
		} fetch_status;
		struct {
			int reason;	/* enum afs_edit_dir_reason */
			mode_t mode;
			const char *symlink;
		} create;
		struct {
			bool need_rehash;
		} unlink;
		struct {
			struct dentry *rehash;
			struct dentry *tmp;
			unsigned int rename_flags;
			bool new_negative;
		} rename;
		struct {
			struct netfs_io_subrequest *subreq;
		} fetch;
		struct {
			afs_lock_type_t type;
		} lock;
		struct {
			struct iov_iter *write_iter;
			loff_t pos;
			loff_t size;
			loff_t i_size;
		} store;
		struct {
			struct iattr *attr;
			loff_t old_i_size;
		} setattr;
		struct afs_acl *acl;
		struct yfs_acl *yacl;
		struct {
			struct afs_volume_status vs;
			struct kstatfs *buf;
		} volstatus;
	};

	/* Fileserver iteration state */
	struct afs_server_list *server_list;	/* Current server list (pins ref) */
	struct afs_server *server;	/* Server we're using (ref pinned by server_list) */
	struct afs_endpoint_state *estate;	/* Current endpoint state (doesn't pin ref) */
	struct afs_server_state *server_states;	/* States of the servers involved */
	struct afs_call *call;
	unsigned long untried_servers;	/* Bitmask of untried servers */
	unsigned long addr_tried;	/* Tried addresses */
	s32 call_abort_code;	/* Abort code from single call */
	short call_error;	/* Error from single call */
	short server_index;	/* Current server */
	short nr_iterations;	/* Number of server iterations */
	signed char addr_index;	/* Current address */
	bool call_responded;	/* T if the current address responded */

	unsigned int flags;	/* Mask of AFS_OPERATION_* values below */
#define AFS_OPERATION_STOP 0x0001	/* Set to cease iteration */
#define AFS_OPERATION_VBUSY 0x0002	/* Set if seen VBUSY */
#define AFS_OPERATION_VMOVED 0x0004	/* Set if seen VMOVED */
#define AFS_OPERATION_VNOVOL 0x0008	/* Set if seen VNOVOL */
#define AFS_OPERATION_CUR_ONLY 0x0010	/* Set if current server only (file lock held) */
#define AFS_OPERATION_NO_VSLEEP 0x0020	/* Set to prevent sleep on VBUSY, VOFFLINE, ... */
#define AFS_OPERATION_UNINTR 0x0040	/* Set if op is uninterruptible */
#define AFS_OPERATION_DOWNGRADE 0x0080	/* Set to retry with downgraded opcode */
#define AFS_OPERATION_LOCK_0 0x0100	/* Set if have io_lock on file[0] */
#define AFS_OPERATION_LOCK_1 0x0200	/* Set if have io_lock on file[1] */
#define AFS_OPERATION_TRIED_ALL 0x0400	/* Set if we've tried all the fileservers */
#define AFS_OPERATION_RETRY_SERVER 0x0800	/* Set if we should retry the current server */
#define AFS_OPERATION_DIR_CONFLICT 0x1000	/* Set if we detected a 3rd-party dir change */
#define AFS_OPERATION_ASYNC 0x2000	/* Set if should run asynchronously */
};

/*
 * Cache auxiliary data.
 *
 * Holds the data version as a big-endian 64-bit value; the struct is packed.
 */
struct afs_vnode_cache_aux {
	__be64 data_version;
} __packed;

/*
 * Fill in the cache auxiliary data from the vnode's current status: the data
 * version is converted to big-endian with cpu_to_be64().
 */
static inline void afs_set_cache_aux(struct afs_vnode *vnode,
				     struct afs_vnode_cache_aux *aux)
{
	aux->data_version = cpu_to_be64(vnode->status.data_version);
}

/*
 * Invalidate the vnode's cache object.
 *
 * Builds fresh auxiliary data from the vnode and passes it, together with the
 * inode's current i_size and the caller's flags, to fscache_invalidate().
 */
static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int flags)
{
	struct afs_vnode_cache_aux aux;

	afs_set_cache_aux(vnode, &aux);
	fscache_invalidate(afs_vnode_cache(vnode), &aux,
			   i_size_read(&vnode->netfs.inode), flags);
}

/*
 * Directory iteration management.
*/
struct afs_dir_iter {
	struct afs_vnode *dvnode;
	union afs_xdr_dir_block *block;
	struct folio_queue *fq;
	unsigned int fpos;
	int fq_slot;
	unsigned int loop_check;
	u8 nr_slots;
	u8 bucket;
	unsigned int prev_entry;
};

#include <trace/events/afs.h>

/*****************************************************************************/
/*
 * addr_list.c
 */
struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason);
extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr);
extern void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason);
extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *,
						      const char *, size_t, char,
						      unsigned short, unsigned short);
bool afs_addr_list_same(const struct afs_addr_list *a,
			const struct afs_addr_list *b);
extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);

extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr,
			      __be32 xdr, u16 port);
extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr,
			      __be32 *xdr, u16 port);
void afs_set_peer_appdata(struct afs_server *server,
			  struct afs_addr_list *old_alist,
			  struct afs_addr_list *new_alist);

/*
 * addr_prefs.c
 */
int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size);
void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist);
void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist);

/*
 * callback.c
 */
extern void afs_invalidate_mmap_work(struct work_struct *);
extern void afs_init_callback_state(struct afs_server *);
extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break *);

/*
 * Combine the counters cached in the vnode (cb_break, cb_ro_snapshot and
 * cb_scrub) into a single value by unsigned addition.
 */
static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
{
	return vnode->cb_break + vnode->cb_ro_snapshot + vnode->cb_scrub;
}

/*
 * Report whether a previously sampled break value is out of date.
 *
 * Returns true if @cb_break differs from the vnode's cb_break plus the
 * current values of the volume's cb_ro_snapshot and cb_scrub counters. Note
 * that these are read from vnode->volume, not from the copies cached in the
 * vnode that afs_calc_vnode_cb_break() uses.
 */
static inline bool afs_cb_is_broken(unsigned int cb_break,
				    const struct afs_vnode *vnode)
{
	return cb_break != (vnode->cb_break +
			    atomic_read(&vnode->volume->cb_ro_snapshot) +
			    atomic_read(&vnode->volume->cb_scrub));
}

/*
 * cell.c
 */
extern int afs_cell_init(struct afs_net *, const char *);
extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned,
				      enum afs_cell_trace);

/* Passed as the "reason" argument of afs_lookup_cell(). */
enum afs_lookup_cell_for {
	AFS_LOOKUP_CELL_DYNROOT,
	AFS_LOOKUP_CELL_MOUNTPOINT,
	AFS_LOOKUP_CELL_DIRECT_MOUNT,
	AFS_LOOKUP_CELL_PRELOAD,
	AFS_LOOKUP_CELL_ROOTCELL,
	AFS_LOOKUP_CELL_ALIAS_CHECK,
};
struct afs_cell *afs_lookup_cell(struct afs_net *net,
				 const char *name, unsigned int namesz,
				 const char *vllist,
				 enum afs_lookup_cell_for reason,
				 enum afs_cell_trace trace);
extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace);
void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason);
extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace);
void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs);
extern void __net_exit afs_cell_purge(struct afs_net *);

/*
 * cmservice.c
 */
extern bool afs_cm_incoming_call(struct afs_call *);

/*
 * cm_security.c
 */
void afs_process_oob_queue(struct work_struct *work);
#ifdef CONFIG_RXGK
int afs_create_token_key(struct afs_net *net, struct socket *socket);
#else
/* Stub used when CONFIG_RXGK is not set: does nothing and reports success (0). */
static inline int afs_create_token_key(struct afs_net *net, struct socket *socket)
{
	return 0;
}
#endif

/*
 * dir.c
 */
extern const struct file_operations afs_dir_file_operations;
extern const struct inode_operations afs_dir_inode_operations;
extern const struct address_space_operations afs_dir_aops;
extern const struct dentry_operations afs_fs_dentry_operations;

ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file);
/* Annotated as acquiring dvnode->validate_lock. */
ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file)
	__acquires(&dvnode->validate_lock);
extern void afs_d_release(struct dentry *);
extern void afs_check_for_remote_deletion(struct afs_operation *);
int afs_single_writepages(struct address_space *mapping,
			  struct writeback_control *wbc);

/*
 * dir_edit.c
 */
extern void afs_edit_dir_add(struct afs_vnode *, const struct qstr *, struct afs_fid *,
			     enum afs_edit_dir_reason);
extern void afs_edit_dir_remove(struct afs_vnode *, const struct qstr *, enum afs_edit_dir_reason);
void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name,
			 struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why);
void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode);

/*
 * dir_search.c
 */
unsigned int afs_dir_hash_name(const struct qstr *name);
bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name);
union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block);
int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name,
			  struct afs_fid *_fid);
int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name,
		   struct afs_fid *_fid, afs_dataversion_t *_dir_version);

/*
 * dir_silly.c
 */
extern int afs_sillyrename(struct afs_vnode *, struct afs_vnode *,
			   struct dentry *, struct key *);
extern int afs_silly_iput(struct dentry *, struct inode *);

/*
 * dynroot.c
 */
extern const struct inode_operations afs_dynroot_inode_operations;
extern const struct dentry_operations afs_dynroot_dentry_operations;

struct inode *afs_dynroot_iget_root(struct super_block *sb);

/*
 * file.c
 */
extern const struct address_space_operations afs_file_aops;
extern const struct inode_operations afs_file_inode_operations;
extern const struct file_operations afs_file_operations;
extern const struct afs_operation_ops afs_fetch_data_operation;
extern const struct netfs_request_ops afs_req_ops;

extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *);
extern void afs_put_wb_key(struct afs_wb_key *);
extern int afs_open(struct inode *, struct file *);
extern int afs_release(struct inode *, struct file *);
void afs_fetch_data_async_rx(struct work_struct *work);
void afs_fetch_data_immediate_cancel(struct afs_call *call);

/*
 * flock.c
 */
extern struct workqueue_struct *afs_lock_manager;

extern void afs_lock_op_done(struct afs_call *);
extern void afs_lock_work(struct work_struct *);
extern void afs_lock_may_be_available(struct afs_vnode *);
extern int afs_lock(struct file *, int, struct file_lock *);
extern int afs_flock(struct file *, int, struct file_lock *);

/*
 * fsclient.c
 */
extern void afs_fs_fetch_status(struct afs_operation *);
extern void afs_fs_fetch_data(struct afs_operation *);
extern void afs_fs_create_file(struct afs_operation *);
extern void afs_fs_make_dir(struct afs_operation *);
extern void afs_fs_remove_file(struct afs_operation *);
extern void afs_fs_remove_dir(struct afs_operation *);
extern void afs_fs_link(struct afs_operation *);
extern void afs_fs_symlink(struct afs_operation *);
extern void afs_fs_rename(struct afs_operation *);
extern void afs_fs_store_data(struct afs_operation *);
extern void afs_fs_setattr(struct afs_operation *);
extern void afs_fs_get_volume_status(struct afs_operation *);
extern void afs_fs_set_lock(struct afs_operation *);
extern void afs_fs_extend_lock(struct afs_operation *);
extern void afs_fs_release_lock(struct afs_operation *);
int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server,
				 struct afs_address *addr, struct key *key);
bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
			     struct afs_endpoint_state *estate, unsigned int addr_index,
			     struct key *key);
extern void afs_fs_inline_bulk_status(struct afs_operation *);

/* ACL blob: @size followed by a flexible array declared as holding @size bytes. */
struct afs_acl {
	u32 size;
	u8 data[] __counted_by(size);
};

extern void afs_fs_fetch_acl(struct afs_operation *);
extern void afs_fs_store_acl(struct afs_operation *);

/*
 *
fs_operation.c
 */
extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *);
extern int afs_put_operation(struct afs_operation *);
extern bool afs_begin_vnode_operation(struct afs_operation *);
extern void afs_end_vnode_operation(struct afs_operation *op);
extern void afs_wait_for_operation(struct afs_operation *);
extern int afs_do_sync_operation(struct afs_operation *);

/*
 * Set the vnode for slot @n of op->file[] and mark that slot as needing the
 * I/O lock. @n is not range-checked here; op->file[] has two slots.
 */
static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n,
				    struct afs_vnode *vnode)
{
	op->file[n].vnode = vnode;
	op->file[n].need_io_lock = true;
}

/*
 * Copy @fid by value into slot @n of op->file[]. @n is not range-checked
 * here.
 */
static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n,
				  const struct afs_fid *fid)
{
	op->file[n].fid = *fid;
}

/*
 * fs_probe.c
 */
struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate,
						  enum afs_estate_trace where);
void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where);
extern void afs_fileserver_probe_result(struct afs_call *);
int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
			    struct afs_addr_list *new_alist, struct key *key);
int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr);
extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
extern void afs_fs_probe_dispatcher(struct work_struct *);
int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate,
			      unsigned long exclude, bool is_intr);
extern void afs_fs_probe_cleanup(struct afs_net *);

/*
 * inode.c
 */
extern const struct afs_operation_ops afs_fetch_status_operation;

void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op);
const char *afs_get_link(struct dentry *dentry, struct inode *inode,
			 struct delayed_call *callback);
int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen);
extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *);
extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
extern int afs_ilookup5_test_by_fid(struct inode *, void *);
extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
extern struct inode *afs_root_iget(struct super_block *, struct key *);
extern int afs_getattr(struct mnt_idmap *idmap, const struct path *,
		       struct kstat *, u32, unsigned int);
extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *);
extern void afs_evict_inode(struct inode *);
extern int afs_drop_inode(struct inode *);

/*
 * main.c
 */
extern struct workqueue_struct *afs_wq;
extern int afs_net_id;

/* Look up the AFS per-namespace record for @net via net_generic() and afs_net_id. */
static inline struct afs_net *afs_net(struct net *net)
{
	return net_generic(net, afs_net_id);
}

/* Superblock -> afs_net, through the net_ns pointer of AFS_FS_S(sb). */
static inline struct afs_net *afs_sb2net(struct super_block *sb)
{
	return afs_net(AFS_FS_S(sb)->net_ns);
}

/* Dentry -> afs_net, through the dentry's superblock. */
static inline struct afs_net *afs_d2net(struct dentry *dentry)
{
	return afs_sb2net(dentry->d_sb);
}

/* Inode -> afs_net, through the inode's superblock. */
static inline struct afs_net *afs_i2net(struct inode *inode)
{
	return afs_sb2net(inode->i_sb);
}

/* Vnode -> afs_net, through the VFS inode embedded in vnode->netfs. */
static inline struct afs_net *afs_v2net(struct afs_vnode *vnode)
{
	return afs_i2net(&vnode->netfs.inode);
}

/* Socket -> afs_net, through sock_net() and afs_net_id. */
static inline struct afs_net *afs_sock2net(struct sock *sk)
{
	return net_generic(sock_net(sk), afs_net_id);
}

/* Increment a statistics counter. */
static inline void __afs_stat(atomic_t *s)
{
	atomic_inc(s);
}

/* Increment counter member @n of the afs_net that @vnode belongs to. */
#define afs_stat_v(vnode, n) __afs_stat(&afs_v2net(vnode)->n)

/*
 * misc.c
 */
extern int afs_abort_to_error(u32);
extern void afs_prioritise_error(struct afs_error *, int, u32);

/*
 * Record -ENOMEM as the operation's cumulative error. This is a direct store
 * that does not go through afs_prioritise_error().
 */
static inline void afs_op_nomem(struct afs_operation *op)
{
	op->cumul_error.error = -ENOMEM;
}

/* Return the operation's cumulative error. */
static inline int afs_op_error(const struct afs_operation *op)
{
	return op->cumul_error.error;
}

/* Return the operation's cumulative abort code. */
static inline s32 afs_op_abort_code(const struct afs_operation *op)
{
	return op->cumul_error.abort_code;
}

/*
 * Overwrite the operation's cumulative error with @error and return the value
 * as stored. The field is a short, so the returned value is @error after
 * conversion to short.
 */
static inline int afs_op_set_error(struct afs_operation *op, int error)
{
	return op->cumul_error.error = error;
}

/*
 * Feed an error/abort code pair into the operation's cumulative error by way
 * of afs_prioritise_error().
 */
static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code)
{
	afs_prioritise_error(&op->cumul_error, error, abort_code);
}

/*
 * mntpt.c
 */
extern const struct inode_operations afs_mntpt_inode_operations;
extern const struct inode_operations afs_autocell_inode_operations;
extern const struct file_operations afs_mntpt_file_operations;

extern struct vfsmount *afs_d_automount(struct path *);
extern void afs_mntpt_kill_timer(void);

/*
 * proc.c
 */
#ifdef CONFIG_PROC_FS
extern int __net_init afs_proc_init(struct afs_net *);
extern void __net_exit afs_proc_cleanup(struct afs_net *);
extern int afs_proc_cell_setup(struct afs_cell *);
extern void afs_proc_cell_remove(struct afs_cell *);
extern void afs_put_sysnames(struct afs_sysnames *);
#else
/* Stubs used when CONFIG_PROC_FS is not set: they do nothing; the int ones return 0. */
static inline int afs_proc_init(struct afs_net *net) { return 0; }
static inline void afs_proc_cleanup(struct afs_net *net) {}
static inline int afs_proc_cell_setup(struct afs_cell *cell) { return 0; }
static inline void afs_proc_cell_remove(struct afs_cell *cell) {}
static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {}
#endif

/*
 * rotate.c
 */
void afs_clear_server_states(struct afs_operation *op);
extern bool afs_select_fileserver(struct afs_operation *);
extern void afs_dump_edestaddrreq(const struct afs_operation *);

/*
 * rxrpc.c
 */
extern struct workqueue_struct *afs_async_calls;

extern int __net_init afs_open_socket(struct afs_net *);
extern void __net_exit afs_close_socket(struct afs_net *);
extern void afs_charge_preallocation(struct work_struct *);
extern void afs_put_call(struct afs_call *);
void afs_deferred_put_call(struct afs_call *call);
void afs_make_call(struct afs_call *call, gfp_t gfp);
void afs_deliver_to_call(struct afs_call *call);
void afs_wait_for_call_to_complete(struct afs_call *call);
extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
					    const struct afs_call_type *,
					    size_t, size_t);
extern void afs_flat_call_destructor(struct afs_call *);
extern void afs_send_empty_reply(struct afs_call *);
extern void afs_send_simple_reply(struct afs_call *,
const void *, size_t); extern int afs_extract_data(struct afs_call *, bool); extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); static inline struct afs_call *afs_get_call(struct afs_call *call, enum afs_call_trace why) { int r; __refcount_inc(&call->ref, &r); trace_afs_call(call->debug_id, why, r + 1, atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); return call; } static inline void afs_see_call(struct afs_call *call, enum afs_call_trace why) { int r = refcount_read(&call->ref); trace_afs_call(call->debug_id, why, r, atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); } static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call, gfp_t gfp) { struct afs_addr_list *alist = op->estate->addresses; op->call = call; op->type = call->type; call->op = op; call->key = op->key; call->intr = !(op->flags & AFS_OPERATION_UNINTR); call->peer = rxrpc_kernel_get_peer(alist->addrs[op->addr_index].peer); call->service_id = op->server->service_id; afs_make_call(call, gfp); } static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) { call->iov_len = size; call->kvec[0].iov_base = buf; call->kvec[0].iov_len = size; iov_iter_kvec(&call->def_iter, ITER_DEST, call->kvec, 1, size); } static inline void afs_extract_to_tmp(struct afs_call *call) { call->iov_len = sizeof(call->tmp); afs_extract_begin(call, &call->tmp, sizeof(call->tmp)); } static inline void afs_extract_to_tmp64(struct afs_call *call) { call->iov_len = sizeof(call->tmp64); afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64)); } static inline void afs_extract_discard(struct afs_call *call, size_t size) { call->iov_len = size; iov_iter_discard(&call->def_iter, ITER_DEST, size); } static inline void afs_extract_to_buf(struct afs_call *call, size_t size) { call->iov_len = size; afs_extract_begin(call, call->buffer, size); } static inline int afs_transfer_reply(struct afs_call *call) { return 
afs_extract_data(call, false); } static inline bool afs_check_call_state(struct afs_call *call, enum afs_call_state state) { return READ_ONCE(call->state) == state; } static inline bool afs_set_call_state(struct afs_call *call, enum afs_call_state from, enum afs_call_state to) { bool ok = false; spin_lock_bh(&call->state_lock); if (call->state == from) { call->state = to; trace_afs_call_state(call, from, to, 0, 0); ok = true; } spin_unlock_bh(&call->state_lock); return ok; } static inline void afs_set_call_complete(struct afs_call *call, int error, u32 remote_abort) { enum afs_call_state state; bool ok = false; spin_lock_bh(&call->state_lock); state = call->state; if (state != AFS_CALL_COMPLETE) { call->abort_code = remote_abort; call->error = error; call->state = AFS_CALL_COMPLETE; trace_afs_call_state(call, state, AFS_CALL_COMPLETE, error, remote_abort); ok = true; } spin_unlock_bh(&call->state_lock); if (ok) { trace_afs_call_done(call); /* Asynchronous calls have two refs to release - one from the alloc and * one queued with the work item - and we can't just deallocate the * call because the work item may be queued again. 
*/ if (call->drop_ref) afs_put_call(call); } } /* * security.c */ extern void afs_put_permits(struct afs_permits *); extern void afs_clear_permits(struct afs_vnode *); extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int, struct afs_status_cb *); extern struct key *afs_request_key(struct afs_cell *); extern struct key *afs_request_key_rcu(struct afs_cell *); extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); extern int afs_permission(struct mnt_idmap *, struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); /* * server.c */ extern spinlock_t afs_server_peer_lock; struct afs_server *afs_find_server(const struct rxrpc_peer *peer); extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); struct afs_server *afs_use_server(struct afs_server *server, bool activate, enum afs_server_trace reason); void afs_unuse_server(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason); void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason); extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace); void afs_purge_servers(struct afs_cell *cell); extern void afs_fs_probe_timer(struct timer_list *); void __net_exit afs_wait_for_servers(struct afs_net *net); bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key); static inline void afs_see_server(struct afs_server *server, enum afs_server_trace trace) { int r = refcount_read(&server->ref); int a = atomic_read(&server->active); trace_afs_server(server->debug_id, r, a, trace); } static inline void afs_inc_servers_outstanding(struct afs_net *net) { atomic_inc(&net->servers_outstanding); } static inline void afs_dec_servers_outstanding(struct afs_net *net) { if 
(atomic_dec_and_test(&net->servers_outstanding)) wake_up_var(&net->servers_outstanding); } static inline bool afs_is_probing_server(struct afs_server *server) { return list_empty(&server->probe_link); } /* * server_list.c */ static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list *slist) { refcount_inc(&slist->usage); return slist; } extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *); struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, struct key *key, struct afs_vldb_entry *vldb); extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *); void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist); void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist, struct afs_server_list *old); void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist); /* * super.c */ extern int __init afs_fs_init(void); extern void afs_fs_exit(void); /* * validation.c */ bool afs_check_validity(const struct afs_vnode *vnode); int afs_update_volume_state(struct afs_operation *op); int afs_validate(struct afs_vnode *vnode, struct key *key); /* * vlclient.c */ extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *, const char *, int); extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *); struct afs_call *afs_vl_get_capabilities(struct afs_net *net, struct afs_addr_list *alist, unsigned int addr_index, struct key *key, struct afs_vlserver *server, unsigned int server_index); extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *); extern char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *); /* * vl_alias.c */ extern int afs_cell_detect_alias(struct afs_cell *, struct key *); /* * vl_probe.c */ extern void afs_vlserver_probe_result(struct afs_call *); extern int afs_send_vl_probes(struct afs_net *, 
struct key *, struct afs_vlserver_list *); extern int afs_wait_for_vl_probes(struct afs_vlserver_list *, unsigned long); /* * vl_rotate.c */ extern bool afs_begin_vlserver_operation(struct afs_vl_cursor *, struct afs_cell *, struct key *); extern bool afs_select_vlserver(struct afs_vl_cursor *); extern bool afs_select_current_vlserver(struct afs_vl_cursor *); extern int afs_end_vlserver_operation(struct afs_vl_cursor *); /* * vlserver_list.c */ static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver) { refcount_inc(&vlserver->ref); return vlserver; } static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist) { if (vllist) refcount_inc(&vllist->ref); return vllist; } extern struct afs_vlserver *afs_alloc_vlserver(const char *, size_t, unsigned short); extern void afs_put_vlserver(struct afs_net *, struct afs_vlserver *); extern struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int); extern void afs_put_vlserverlist(struct afs_net *, struct afs_vlserver_list *); extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, const void *, size_t); /* * volume.c */ extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern int afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ void afs_prepare_write(struct netfs_io_subrequest *subreq); void afs_issue_write(struct netfs_io_subrequest *subreq); void afs_begin_writeback(struct netfs_io_request *wreq); void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream); extern int afs_writepages(struct address_space *, 
struct writeback_control *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); extern void afs_prune_wb_keys(struct afs_vnode *); /* * xattr.c */ extern const struct xattr_handler * const afs_xattr_handlers[]; /* * yfsclient.c */ extern void yfs_fs_fetch_data(struct afs_operation *); extern void yfs_fs_create_file(struct afs_operation *); extern void yfs_fs_make_dir(struct afs_operation *); extern void yfs_fs_remove_file2(struct afs_operation *); extern void yfs_fs_remove_file(struct afs_operation *); extern void yfs_fs_remove_dir(struct afs_operation *); extern void yfs_fs_link(struct afs_operation *); extern void yfs_fs_symlink(struct afs_operation *); extern void yfs_fs_rename(struct afs_operation *); void yfs_fs_rename_replace(struct afs_operation *op); void yfs_fs_rename_noreplace(struct afs_operation *op); void yfs_fs_rename_exchange(struct afs_operation *op); extern void yfs_fs_store_data(struct afs_operation *); extern void yfs_fs_setattr(struct afs_operation *); extern void yfs_fs_get_volume_status(struct afs_operation *); extern void yfs_fs_set_lock(struct afs_operation *); extern void yfs_fs_extend_lock(struct afs_operation *); extern void yfs_fs_release_lock(struct afs_operation *); extern void yfs_fs_fetch_status(struct afs_operation *); extern void yfs_fs_inline_bulk_status(struct afs_operation *); struct yfs_acl { struct afs_acl *acl; /* Dir/file/symlink ACL */ struct afs_acl *vol_acl; /* Whole volume ACL */ u32 inherit_flag; /* True if ACL is inherited from parent dir */ u32 num_cleaned; /* Number of ACEs removed due to subject removal */ unsigned int flags; #define YFS_ACL_WANT_ACL 0x01 /* Set if caller wants ->acl */ #define YFS_ACL_WANT_VOL_ACL 0x02 /* Set if caller wants ->vol_acl */ }; extern void yfs_free_opaque_acl(struct yfs_acl *); extern void yfs_fs_fetch_opaque_acl(struct afs_operation *); extern void yfs_fs_store_opaque_acl2(struct afs_operation *); /* * Miscellaneous 
inline functions. */ static inline struct afs_vnode *AFS_FS_I(struct inode *inode) { return container_of(inode, struct afs_vnode, netfs.inode); } static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) { return &vnode->netfs.inode; } /* * Note that a dentry got changed. We need to set d_fsdata to the data version * number derived from the result of the operation. It doesn't matter if * d_fsdata goes backwards as we'll just revalidate. */ static inline void afs_update_dentry_version(struct afs_operation *op, struct afs_vnode_param *dir_vp, struct dentry *dentry) { if (!op->cumul_error.error) dentry->d_fsdata = (void *)(unsigned long)dir_vp->scb.status.data_version; } /* * Set the file size and block count. Estimate the number of 512 bytes blocks * used, rounded up to nearest 1K for consistency with other AFS clients. */ static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) { i_size_write(&vnode->netfs.inode, size); vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1; } /* * Check for a conflicting operation on a directory that we just unlinked from. * If someone managed to sneak a link or an unlink in on the file we just * unlinked, we won't be able to trust nlink on an AFS file (but not YFS). */ static inline void afs_check_dir_conflict(struct afs_operation *op, struct afs_vnode_param *dvp) { if (dvp->dv_before + dvp->dv_delta != dvp->scb.status.data_version) op->flags |= AFS_OPERATION_DIR_CONFLICT; } static inline int afs_io_error(struct afs_call *call, enum afs_io_error where) { trace_afs_io_error(call->debug_id, -EIO, where); return -EIO; } static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where) { trace_afs_file_error(vnode, -EIO, where); return -EIO; } /* * Set the callback promise on a vnode. 
*/ static inline void afs_set_cb_promise(struct afs_vnode *vnode, time64_t expires_at, enum afs_cb_promise_trace trace) { atomic64_set(&vnode->cb_expires_at, expires_at); trace_afs_cb_promise(vnode, trace); } /* * Clear the callback promise on a vnode, returning true if it was promised. */ static inline bool afs_clear_cb_promise(struct afs_vnode *vnode, enum afs_cb_promise_trace trace) { trace_afs_cb_promise(vnode, trace); return atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE; } /* * Mark a directory as being invalid. */ static inline void afs_invalidate_dir(struct afs_vnode *dvnode, enum afs_dir_invalid_trace trace) { if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { trace_afs_dir_invalid(dvnode, trace); afs_stat_v(dvnode, n_inval); } } /*****************************************************************************/ /* * debug tracing */ extern unsigned afs_debug; #define dbgprintk(FMT,...) \ printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) #define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) #if defined(__KDEBUG) #define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) #define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) #define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) #elif defined(CONFIG_AFS_DEBUG) #define AFS_DEBUG_KENTER 0x01 #define AFS_DEBUG_KLEAVE 0x02 #define AFS_DEBUG_KDEBUG 0x04 #define _enter(FMT,...) \ do { \ if (unlikely(afs_debug & AFS_DEBUG_KENTER)) \ kenter(FMT,##__VA_ARGS__); \ } while (0) #define _leave(FMT,...) \ do { \ if (unlikely(afs_debug & AFS_DEBUG_KLEAVE)) \ kleave(FMT,##__VA_ARGS__); \ } while (0) #define _debug(FMT,...) \ do { \ if (unlikely(afs_debug & AFS_DEBUG_KDEBUG)) \ kdebug(FMT,##__VA_ARGS__); \ } while (0) #else #define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define _leave(FMT,...) 
no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) #endif /* * debug assertion checking */ #if 1 // defined(__KDEBUGALL) #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ BUG(); \ } \ } while(0) #define ASSERTCMP(X, OP, Y) \ do { \ if (unlikely(!((X) OP (Y)))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ printk(KERN_ERR "%lu " #OP " %lu is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ } while(0) #define ASSERTRANGE(L, OP1, N, OP2, H) \ do { \ if (unlikely(!((L) OP1 (N)) || !((N) OP2 (H)))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ printk(KERN_ERR "%lu "#OP1" %lu "#OP2" %lu is false\n", \ (unsigned long)(L), (unsigned long)(N), \ (unsigned long)(H)); \ printk(KERN_ERR "0x%lx "#OP1" 0x%lx "#OP2" 0x%lx is false\n", \ (unsigned long)(L), (unsigned long)(N), \ (unsigned long)(H)); \ BUG(); \ } \ } while(0) #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ BUG(); \ } \ } while(0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ if (unlikely((C) && !((X) OP (Y)))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ printk(KERN_ERR "%lu " #OP " %lu is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ } while(0) #else #define ASSERT(X) \ do { \ } while(0) #define ASSERTCMP(X, OP, Y) \ do { \ } while(0) #define ASSERTRANGE(L, OP1, N, OP2, H) \ do { \ } while(0) #define ASSERTIF(C, X) \ do { \ } while(0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ } while(0) #endif /* __KDEBUGALL */
438 428 78 349 422 5 422 5 438 439 440 425 199 66 133 133 198 198 8 198 194 186 162 26 185 185 426 426 427 3 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 1 426 425 45 424 426 7 28 196 158 108 424 426 425 29 424 13 424 58 452 77 490 491 440 426 186 29 249 31 249 46 45 79 79 43 79 1 79 27 76 26 79 40 43 39 39 14 25 39 39 1 1 1 1 1 1 390 390 32 442 459 370 433 458 357 4 358 358 357 1 3 2 4 4 4 4 4 27 27 27 27 2 27 428 25 1 1 115 2 25 89 26 89 115 197 197 198 197 197 198 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 
413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 
1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) 
Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This file is part of the SCTP kernel implementation
 *
 * This module provides the abstraction for an SCTP association.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Jon Grimm <jgrimm@us.ibm.com>
 *    Xingang Guo <xingang.guo@intel.com>
 *    Hui Huang <hui.huang@nokia.com>
 *    Sridhar Samudrala <sri@us.ibm.com>
 *    Daisy Chang <daisyc@us.ibm.com>
 *    Ryan Layer <rmlayer@us.ibm.com>
 *    Kevin Gao <kevin.gao@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>

#include <linux/slab.h>
#include <linux/in.h>
#include <net/ipv6.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* Forward declarations for internal functions. */
static void sctp_select_active_and_retran_path(struct sctp_association *asoc);
static void sctp_assoc_bh_rcv(struct work_struct *work);
static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc);
static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc);

/* 1st Level Abstractions. */

/* Initialize a new association from provided memory.
 *
 * Takes a reference on both @ep and @sk, then seeds @asoc with the
 * defaults stored in the SCTP per-socket area of @sk and with the AUTH
 * configuration of @ep.
 *
 * @asoc:  memory to initialize
 * @ep:    owning endpoint (const is cast away to store and hold it)
 * @sk:    owning socket (const is cast away to store and hold it)
 * @scope: not referenced anywhere in this function
 * @gfp:   allocation flags passed to the stream and shared-key helpers
 *
 * Returns @asoc on success.  Returns NULL if sctp_stream_init() or
 * sctp_auth_asoc_copy_shkeys() fails; in that case the stream is freed
 * and the endpoint and socket references taken here are dropped again.
 */
static struct sctp_association *sctp_association_init(
					struct sctp_association *asoc,
					const struct sctp_endpoint *ep,
					const struct sock *sk,
					enum sctp_scope scope, gfp_t gfp)
{
	struct sctp_sock *sp;
	struct sctp_paramhdr *p;
	int i;

	/* Retrieve the SCTP per socket area. */
	sp = sctp_sk((struct sock *)sk);

	/* Discarding const is appropriate here. */
	asoc->ep = (struct sctp_endpoint *)ep;
	asoc->base.sk = (struct sock *)sk;
	asoc->base.net = sock_net(sk);

	/* These two references are released on the stream_free error path
	 * at the bottom of this function.
	 */
	sctp_endpoint_hold(asoc->ep);
	sock_hold(asoc->base.sk);

	/* Initialize the common base substructure. */
	asoc->base.type = SCTP_EP_TYPE_ASSOCIATION;

	/* Initialize the object handling fields. */
	refcount_set(&asoc->base.refcnt, 1);

	/* Initialize the bind addr area. */
	sctp_bind_addr_init(&asoc->base.bind_addr, ep->base.bind_addr.port);

	asoc->state = SCTP_STATE_CLOSED;
	asoc->cookie_life = ms_to_ktime(sp->assocparams.sasoc_cookie_life);
	asoc->user_frag = sp->user_frag;

	/* Set the association max_retrans and RTO values from the
	 * socket values.
	 */
	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
	asoc->pf_retrans = sp->pf_retrans;
	asoc->ps_retrans = sp->ps_retrans;
	asoc->pf_expose = sp->pf_expose;

	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);

	/* Initialize the association's heartbeat interval based on the
	 * sock configured value.
	 */
	asoc->hbinterval = msecs_to_jiffies(sp->hbinterval);
	asoc->probe_interval = msecs_to_jiffies(sp->probe_interval);

	asoc->encap_port = sp->encap_port;

	/* Initialize path max retrans value. */
	asoc->pathmaxrxt = sp->pathmaxrxt;

	asoc->flowlabel = sp->flowlabel;
	asoc->dscp = sp->dscp;

	/* Set association default SACK delay */
	asoc->sackdelay = msecs_to_jiffies(sp->sackdelay);
	asoc->sackfreq = sp->sackfreq;

	/* Set the association default flags controlling
	 * Heartbeat, SACK delay, and Path MTU Discovery.
	 */
	asoc->param_flags = sp->param_flags;

	/* Initialize the maximum number of new data packets that can be sent
	 * in a burst.
	 */
	asoc->max_burst = sp->max_burst;

	asoc->subscribe = sp->subscribe;

	/* Initialize association timers.  These use rto_initial, rto_max
	 * and sackdelay, which were set above.
	 */
	asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial;
	asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial;
	asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial;

	/* sctpimpguide Section 2.12.2
	 * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the
	 * recommended value of 5 times 'RTO.Max'.
	 */
	asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
		= 5 * asoc->rto_max;

	asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
	/* sp->autoclose is scaled by HZ here. */
	asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
		(unsigned long)sp->autoclose * HZ;

	/* Set up one timer per timeout type. */
	for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i)
		timer_setup(&asoc->timers[i], sctp_timer_events[i], 0);

	/* Pull default initialization values from the sock options.
	 * Note: This assumes that the values have already been
	 * validated in the sock.
	 */
	asoc->c.sinit_max_instreams = sp->initmsg.sinit_max_instreams;
	asoc->c.sinit_num_ostreams = sp->initmsg.sinit_num_ostreams;
	asoc->max_init_attempts = sp->initmsg.sinit_max_attempts;

	asoc->max_init_timeo =
		 msecs_to_jiffies(sp->initmsg.sinit_max_init_timeo);

	/* Set the local window size for receive.
	 * This is also the rcvbuf space per association.
	 * RFC 2960 Section 6 - An SCTP receiver MUST be able to receive a
	 * minimum of 1500 bytes in one SCTP packet.
	 */
	if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW)
		asoc->rwnd = SCTP_DEFAULT_MINWINDOW;
	else
		asoc->rwnd = sk->sk_rcvbuf/2;

	asoc->a_rwnd = asoc->rwnd;

	/* Use my own max window until I learn something better. */
	asoc->peer.rwnd = SCTP_DEFAULT_MAXWINDOW;

	/* Initialize the receive memory counter */
	atomic_set(&asoc->rmem_alloc, 0);

	init_waitqueue_head(&asoc->wait);

	asoc->c.my_vtag = sctp_generate_tag(ep);
	asoc->c.my_port = ep->base.bind_addr.port;

	asoc->c.initial_tsn = sctp_generate_tsn(ep);

	asoc->next_tsn = asoc->c.initial_tsn;

	/* All ack/sack bookkeeping starts one TSN before the first one we
	 * will send.
	 */
	asoc->ctsn_ack_point = asoc->next_tsn - 1;
	asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
	asoc->highest_sacked = asoc->ctsn_ack_point;
	asoc->last_cwr_tsn = asoc->ctsn_ack_point;

	/* ADDIP Section 4.1 Asconf Chunk Procedures
	 *
	 * When an endpoint has an ASCONF signaled change to be sent to the
	 * remote endpoint it should do the following:
	 * ...
	 * A2) a serial number should be assigned to the chunk. The serial
	 * number SHOULD be a monotonically increasing number. The serial
	 * numbers SHOULD be initialized at the start of the
	 * association to the same value as the initial TSN.
	 */
	asoc->addip_serial = asoc->c.initial_tsn;
	asoc->strreset_outseq = asoc->c.initial_tsn;

	INIT_LIST_HEAD(&asoc->addip_chunk_list);
	INIT_LIST_HEAD(&asoc->asconf_ack_list);

	/* Make an empty list of remote transport addresses. */
	INIT_LIST_HEAD(&asoc->peer.transport_addr_list);

	/* RFC 2960 5.1 Normal Establishment of an Association
	 *
	 * After the reception of the first data chunk in an
	 * association the endpoint must immediately respond with a
	 * sack to acknowledge the data chunk.  Subsequent
	 * acknowledgements should be done as described in Section
	 * 6.2.
	 *
	 * [We implement this by telling a new association that it
	 * already received one packet.]
	 */
	asoc->peer.sack_needed = 1;
	asoc->peer.sack_generation = 1;

	/* Create an input queue. */
	sctp_inq_init(&asoc->base.inqueue);
	sctp_inq_set_th_handler(&asoc->base.inqueue, sctp_assoc_bh_rcv);

	/* Create an output queue. */
	sctp_outq_init(asoc, &asoc->outqueue);

	sctp_ulpq_init(&asoc->ulpq, asoc);

	if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, 0, gfp))
		goto stream_free;

	/* Initialize default path MTU. */
	asoc->pathmtu = sp->pathmtu;
	sctp_assoc_update_frag_point(asoc);

	/* Assume that peer would support both address types unless we are
	 * told otherwise.
	 */
	asoc->peer.ipv4_address = 1;
	if (asoc->base.sk->sk_family == PF_INET6)
		asoc->peer.ipv6_address = 1;
	INIT_LIST_HEAD(&asoc->asocs);

	asoc->default_stream = sp->default_stream;
	asoc->default_ppid = sp->default_ppid;
	asoc->default_flags = sp->default_flags;
	asoc->default_context = sp->default_context;
	asoc->default_timetolive = sp->default_timetolive;
	asoc->default_rcv_context = sp->default_rcv_context;

	/* AUTH related initializations */
	INIT_LIST_HEAD(&asoc->endpoint_shared_keys);
	if (sctp_auth_asoc_copy_shkeys(ep, asoc, gfp))
		goto stream_free;

	asoc->active_key_id = ep->active_key_id;
	asoc->strreset_enable = ep->strreset_enable;

	/* Save the hmacs and chunks list into this association.  The copy
	 * length is taken from each parameter's own header.
	 */
	if (ep->auth_hmacs_list)
		memcpy(asoc->c.auth_hmacs, ep->auth_hmacs_list,
			ntohs(ep->auth_hmacs_list->param_hdr.length));
	if (ep->auth_chunk_list)
		memcpy(asoc->c.auth_chunks, ep->auth_chunk_list,
			ntohs(ep->auth_chunk_list->param_hdr.length));

	/* Get the AUTH random number for this association: a parameter
	 * header followed by SCTP_AUTH_RANDOM_LENGTH random bytes.
	 */
	p = (struct sctp_paramhdr *)asoc->c.auth_random;
	p->type = SCTP_PARAM_RANDOM;
	p->length = htons(sizeof(*p) + SCTP_AUTH_RANDOM_LENGTH);
	get_random_bytes(p+1, SCTP_AUTH_RANDOM_LENGTH);

	return asoc;

stream_free:
	/* Undo the stream allocation and the two references taken above. */
	sctp_stream_free(&asoc->stream);
	sock_put(asoc->base.sk);
	sctp_endpoint_put(asoc->ep);
	return NULL;
}

/* Allocate and initialize a new association.
 *
 * Returns the new association, or NULL if either the allocation or
 * sctp_association_init() fails (the allocation is freed in the latter
 * case).
 */
struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
					      const struct sock *sk,
					      enum sctp_scope scope, gfp_t gfp)
{
	struct sctp_association *asoc;

	asoc = kzalloc_obj(*asoc, gfp);
	if (!asoc)
		goto fail;

	if (!sctp_association_init(asoc, ep, sk, scope, gfp))
		goto fail_init;

	SCTP_DBG_OBJCNT_INC(assoc);

	pr_debug("Created asoc %p\n", asoc);

	return asoc;

fail_init:
	kfree(asoc);
fail:
	return NULL;
}

/* Free this association if possible.
There may still be users, so
 * the actual deallocation may be delayed.
 *
 * Marks the association dead, releases its queues, stream state, bind
 * addresses, cached peer parameters, transports and AUTH keys, and then
 * drops one reference.  One additional reference is dropped for every
 * timer that timer_delete() reports as having been deleted.
 */
void sctp_association_free(struct sctp_association *asoc)
{
	struct sock *sk = asoc->base.sk;
	struct sctp_transport *transport;
	struct list_head *pos, *temp;
	int i;

	/* Only real associations count against the endpoint, so
	 * don't bother if this is a temporary association.
	 */
	if (!list_empty(&asoc->asocs)) {
		list_del(&asoc->asocs);

		/* Decrement the backlog value for a TCP-style listening
		 * socket.
		 */
		if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
			sk_acceptq_removed(sk);
	}

	/* Mark as dead, so other users can know this structure is
	 * going away.
	 */
	asoc->base.dead = true;

	/* Dispose of any data lying around in the outqueue. */
	sctp_outq_free(&asoc->outqueue);

	/* Dispose of any pending messages for the upper layer. */
	sctp_ulpq_free(&asoc->ulpq);

	/* Dispose of any pending chunks on the inqueue. */
	sctp_inq_free(&asoc->base.inqueue);

	sctp_tsnmap_free(&asoc->peer.tsn_map);

	/* Free stream information. */
	sctp_stream_free(&asoc->stream);

	if (asoc->strreset_chunk)
		sctp_chunk_free(asoc->strreset_chunk);

	/* Clean up the bound address list. */
	sctp_bind_addr_free(&asoc->base.bind_addr);

	/* Do we need to go through all of our timers and
	 * delete them?  To be safe we will try to delete all, but we
	 * should be able to go through and make a guess based
	 * on our state.
	 */
	for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) {
		if (timer_delete(&asoc->timers[i]))
			sctp_association_put(asoc);
	}

	/* Free peer's cached cookie and AUTH parameters. */
	kfree(asoc->peer.cookie);
	kfree(asoc->peer.peer_random);
	kfree(asoc->peer.peer_chunks);
	kfree(asoc->peer.peer_hmacs);

	/* Release the transport structures. */
	list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
		transport = list_entry(pos, struct sctp_transport, transports);
		list_del_rcu(pos);
		sctp_unhash_transport(transport);
		sctp_transport_free(transport);
	}

	asoc->peer.transport_count = 0;

	sctp_asconf_queue_teardown(asoc);

	/* Free pending address space being deleted */
	kfree(asoc->asconf_addr_del_pending);

	/* AUTH - Free the endpoint shared keys */
	sctp_auth_destroy_keys(&asoc->endpoint_shared_keys);

	/* AUTH - Free the association shared key */
	sctp_auth_key_put(asoc->asoc_shared_key);

	sctp_association_put(asoc);
}

/* Cleanup and free up an association.
 *
 * Called from sctp_association_put() when the reference count reaches
 * zero.  Refuses (with a WARN) to destroy an association that has not
 * been marked dead.  Otherwise drops the endpoint and socket references,
 * removes a non-zero assoc_id from the id table and frees the structure
 * with kfree_rcu().
 */
static void sctp_association_destroy(struct sctp_association *asoc)
{
	if (unlikely(!asoc->base.dead)) {
		WARN(1, "Attempt to destroy undead association %p!\n", asoc);
		return;
	}

	sctp_endpoint_put(asoc->ep);
	sock_put(asoc->base.sk);

	if (asoc->assoc_id != 0) {
		spin_lock_bh(&sctp_assocs_id_lock);
		idr_remove(&sctp_assocs_id, asoc->assoc_id);
		spin_unlock_bh(&sctp_assocs_id_lock);
	}

	/* Warn if receive memory is still accounted to this association. */
	WARN_ON(atomic_read(&asoc->rmem_alloc));

	kfree_rcu(asoc, rcu);
	SCTP_DBG_OBJCNT_DEC(assoc);
}

/* Change the primary destination address for the peer.
 *
 * Records @transport as the primary path, emits an SCTP_ADDR_MADE_PRIM
 * notification, copies its address as the default msg_name, makes it the
 * active path when it is ACTIVE or UNKNOWN, and updates the CACC
 * changeover state when data is queued or outstanding.
 */
void sctp_assoc_set_primary(struct sctp_association *asoc,
			    struct sctp_transport *transport)
{
	int changeover = 0;

	/* it's a changeover only if we already have a primary path
	 * that we are changing
	 */
	if (asoc->peer.primary_path != NULL &&
	    asoc->peer.primary_path != transport)
		changeover = 1 ;

	asoc->peer.primary_path = transport;
	sctp_ulpevent_notify_peer_addr_change(transport,
					      SCTP_ADDR_MADE_PRIM, 0);

	/* Set a default msg_name for events. */
	memcpy(&asoc->peer.primary_addr, &transport->ipaddr,
	       sizeof(union sctp_addr));

	/* If the primary path is changing, assume that the
	 * user wants to use this new path.
	 */
	if ((transport->state == SCTP_ACTIVE) ||
	    (transport->state == SCTP_UNKNOWN))
		asoc->peer.active_path = transport;

	/*
	 * SFR-CACC algorithm:
	 * Upon the receipt of a request to change the primary
	 * destination address, on the data structure for the new
	 * primary destination, the sender MUST do the following:
	 *
	 * 1) If CHANGEOVER_ACTIVE is set, then there was a switch
	 * to this destination address earlier. The sender MUST set
	 * CYCLING_CHANGEOVER to indicate that this switch is a
	 * double switch to the same destination address.
	 *
	 * Really, only bother if we have data queued or outstanding on
	 * the association.
	 */
	if (!asoc->outqueue.outstanding_bytes && !asoc->outqueue.out_qlen)
		return;

	if (transport->cacc.changeover_active)
		transport->cacc.cycling_changeover = changeover;

	/* 2) The sender MUST set CHANGEOVER_ACTIVE to indicate that
	 * a changeover has occurred.
	 */
	transport->cacc.changeover_active = changeover;

	/* 3) The sender MUST store the next TSN to be sent in
	 * next_tsn_at_change.
	 */
	transport->cacc.next_tsn_at_change = asoc->next_tsn;
}

/* Remove a transport from an association.
 *
 * Unlinks and unhashes @peer, repoints every association field that
 * referenced it, migrates its transmitted chunks to the active path,
 * emits an SCTP_ADDR_REMOVED notification and frees the transport.
 */
void sctp_assoc_rm_peer(struct sctp_association *asoc,
			struct sctp_transport *peer)
{
	struct sctp_transport *transport;
	struct list_head *pos;
	struct sctp_chunk *ch;

	pr_debug("%s: association:%p addr:%pISpc\n",
		 __func__, asoc, &peer->ipaddr.sa);

	/* If we are to remove the current retran_path, update it
	 * to the next peer before removing this peer from the list.
	 */
	if (asoc->peer.retran_path == peer)
		sctp_assoc_update_retran_path(asoc);

	/* Remove this peer from the list. */
	list_del_rcu(&peer->transports);
	/* Remove this peer from the transport hashtable */
	sctp_unhash_transport(peer);

	/* Get the first transport of asoc.
	 * NOTE(review): there is no emptiness check here, so if the list is
	 * now empty this resolves to the list head itself - confirm callers
	 * never remove the last transport this way.
	 */
	pos = asoc->peer.transport_addr_list.next;
	transport = list_entry(pos, struct sctp_transport, transports);

	/* Update any entries that match the peer to be deleted. */
	if (asoc->peer.primary_path == peer)
		sctp_assoc_set_primary(asoc, transport);
	if (asoc->peer.active_path == peer)
		asoc->peer.active_path = transport;
	if (asoc->peer.retran_path == peer)
		asoc->peer.retran_path = transport;
	if (asoc->peer.last_data_from == peer)
		asoc->peer.last_data_from = transport;

	if (asoc->strreset_chunk &&
	    asoc->strreset_chunk->transport == peer) {
		asoc->strreset_chunk->transport = transport;
		sctp_transport_reset_reconf_timer(transport);
	}

	/* If we remove the transport an INIT was last sent to, set it to
	 * NULL. Combined with the update of the retran path above, this
	 * will cause the next INIT to be sent to the next available
	 * transport, maintaining the cycle.
	 */
	if (asoc->init_last_sent_to == peer)
		asoc->init_last_sent_to = NULL;

	/* If we remove the transport a SHUTDOWN was last sent to, set it
	 * to NULL. Combined with the update of the retran path above, this
	 * will cause the next SHUTDOWN to be sent to the next available
	 * transport, maintaining the cycle.
	 */
	if (asoc->shutdown_last_sent_to == peer)
		asoc->shutdown_last_sent_to = NULL;

	/* If we remove the transport an ASCONF was last sent to, set it to
	 * NULL.
	 */
	if (asoc->addip_last_asconf &&
	    asoc->addip_last_asconf->transport == peer)
		asoc->addip_last_asconf->transport = NULL;

	/* If we have something on the transmitted list, we have to
	 * save it off.  The best place is the active path.
	 */
	if (!list_empty(&peer->transmitted)) {
		struct sctp_transport *active = asoc->peer.active_path;

		/* Reset the transport of each chunk on this list */
		list_for_each_entry(ch, &peer->transmitted,
					transmitted_list) {
			ch->transport = NULL;
			ch->rtt_in_progress = 0;
		}

		list_splice_tail_init(&peer->transmitted,
					&active->transmitted);

		/* Start a T3 timer here in case it wasn't running so
		 * that these migrated packets have a chance to get
		 * retransmitted.  A transport reference is taken when
		 * mod_timer() reports the timer was not already pending.
		 */
		if (!timer_pending(&active->T3_rtx_timer))
			if (!mod_timer(&active->T3_rtx_timer,
					jiffies + active->rto))
				sctp_transport_hold(active);
	}

	/* Queued-but-unsent chunks must not keep pointing at the removed
	 * transport either.
	 */
	list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list)
		if (ch->transport == peer)
			ch->transport = NULL;

	asoc->peer.transport_count--;

	sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0);
	sctp_transport_free(peer);
}

/* Add a transport address to an association.
 *
 * If @addr is already a transport of @asoc, that transport is returned
 * (promoted from UNKNOWN to ACTIVE if needed).  Otherwise a new
 * transport is allocated with @gfp, configured from the association
 * defaults, hashed, linked into the transport list and returned.
 *
 * Returns NULL if the transport cannot be allocated or hashed.
 */
struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
					   const union sctp_addr *addr,
					   const gfp_t gfp,
					   const int peer_state)
{
	struct sctp_transport *peer;
	struct sctp_sock *sp;
	unsigned short port;

	sp = sctp_sk(asoc->base.sk);

	/* AF_INET and AF_INET6 share common port field. */
	port = ntohs(addr->v4.sin_port);

	pr_debug("%s: association:%p addr:%pISpc state:%d\n", __func__,
		 asoc, &addr->sa, peer_state);

	/* Set the port if it has not been set yet. */
	if (0 == asoc->peer.port)
		asoc->peer.port = port;

	/* Check to see if this is a duplicate. */
	peer = sctp_assoc_lookup_paddr(asoc, addr);
	if (peer) {
		/* An UNKNOWN state is only set on transports added by
		 * user in sctp_connectx() call.  Such transports should be
		 * considered CONFIRMED per RFC 4960, Section 5.4.
		 */
		if (peer->state == SCTP_UNKNOWN) {
			peer->state = SCTP_ACTIVE;
		}
		return peer;
	}

	peer = sctp_transport_new(asoc->base.net, addr, gfp);
	if (!peer)
		return NULL;

	sctp_transport_set_owner(peer, asoc);

	/* Initialize the peer's heartbeat interval based on the
	 * association configured value.
	 */
	peer->hbinterval = asoc->hbinterval;
	peer->probe_interval = asoc->probe_interval;

	peer->encap_port = asoc->encap_port;

	/* Set the path max_retrans. */
	peer->pathmaxrxt = asoc->pathmaxrxt;

	/* And the partial failure retrans threshold */
	peer->pf_retrans = asoc->pf_retrans;
	/* And the primary path switchover retrans threshold */
	peer->ps_retrans = asoc->ps_retrans;

	/* Initialize the peer's SACK delay timeout based on the
	 * association configured value.
	 */
	peer->sackdelay = asoc->sackdelay;
	peer->sackfreq = asoc->sackfreq;

	if (addr->sa.sa_family == AF_INET6) {
		__be32 info = addr->v6.sin6_flowinfo;

		/* A flow label supplied in the address overrides the
		 * association default.
		 */
		if (info) {
			peer->flowlabel = ntohl(info & IPV6_FLOWLABEL_MASK);
			peer->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
		} else {
			peer->flowlabel = asoc->flowlabel;
		}
	}
	peer->dscp = asoc->dscp;

	/* Enable/disable heartbeat, SACK delay, and path MTU discovery
	 * based on association setting.
	 */
	peer->param_flags = asoc->param_flags;

	/* Initialize the pmtu of the transport. */
	sctp_transport_route(peer, NULL, sp);

	/* If this is the first transport addr on this association,
	 * initialize the association PMTU to the peer's PMTU.
	 * If not and the current association PMTU is higher than the new
	 * peer's PMTU, reset the association PMTU to the new peer's PMTU.
	 */
	sctp_assoc_set_pmtu(asoc, asoc->pathmtu ?
				  min_t(int, peer->pathmtu, asoc->pathmtu) :
				  peer->pathmtu);

	peer->pmtu_pending = 0;

	/* The asoc->peer.port might not be meaningful yet, but
	 * initialize the packet structure anyway.
	 */
	sctp_packet_init(&peer->packet, peer, asoc->base.bind_addr.port,
			 asoc->peer.port);

	/* 7.2.1 Slow-Start
	 *
	 * o The initial cwnd before DATA transmission or after a sufficiently
	 *   long idle period MUST be set to
	 *      min(4*MTU, max(2*MTU, 4380 bytes))
	 *
	 * o The initial value of ssthresh MAY be arbitrarily high
	 *   (for example, implementations MAY use the size of the
	 *   receiver advertised window).
	 */
	peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));

	/* At this point, we may not have the receiver's advertised window,
	 * so initialize ssthresh to the default value and it will be set
	 * later when we process the INIT.
	 */
	peer->ssthresh = SCTP_DEFAULT_MAXWINDOW;

	peer->partial_bytes_acked = 0;
	peer->flight_size = 0;
	peer->burst_limited = 0;

	/* Set the transport's RTO.initial value */
	peer->rto = asoc->rto_initial;
	sctp_max_rto(asoc, peer);

	/* Set the peer's active state. */
	peer->state = peer_state;

	/* Add this peer into the transport hashtable */
	if (sctp_hash_transport(peer)) {
		sctp_transport_free(peer);
		return NULL;
	}

	sctp_transport_pl_reset(peer);

	/* Attach the remote transport to our asoc. */
	list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
	asoc->peer.transport_count++;

	sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_ADDED, 0);

	/* If we do not yet have a primary path, set one. */
	if (!asoc->peer.primary_path) {
		sctp_assoc_set_primary(asoc, peer);
		asoc->peer.retran_path = peer;
	}

	/* When the retran path still coincides with the active path, use
	 * the new transport as retran path, unless it is UNCONFIRMED.
	 */
	if (asoc->peer.active_path == asoc->peer.retran_path &&
	    peer->state != SCTP_UNCONFIRMED) {
		asoc->peer.retran_path = peer;
	}

	return peer;
}

/* Lookup a transport by address.
 *
 * Returns the first transport of @asoc whose address matches @address
 * according to sctp_cmp_addr_exact(), or NULL if there is none.
 */
struct sctp_transport *sctp_assoc_lookup_paddr(
					const struct sctp_association *asoc,
					const union sctp_addr *address)
{
	struct sctp_transport *t;

	/* Cycle through all transports searching for a peer address. */

	list_for_each_entry(t, &asoc->peer.transport_addr_list,
			transports) {
		if (sctp_cmp_addr_exact(address, &t->ipaddr))
			return t;
	}

	return NULL;
}

/* Remove all transports except a given one */
void sctp_assoc_del_nonprimary_peers(struct sctp_association *asoc,
				     struct sctp_transport *primary)
{
	struct sctp_transport *temp;
	struct sctp_transport *t;

	/* The _safe iterator is needed because sctp_assoc_rm_peer()
	 * unlinks the entry being visited.
	 */
	list_for_each_entry_safe(t, temp, &asoc->peer.transport_addr_list,
				 transports) {
		/* if the current transport is not the primary one, delete it */
		if (t != primary)
			sctp_assoc_rm_peer(asoc, t);
	}
}

/* Engage in transport control operations.
 * Mark the transport up or down and send a notification to the user.
 * Select and update the new active and retran paths.
 *
 * Unrecognized commands return without changing anything.
 */
void sctp_assoc_control_transport(struct sctp_association *asoc,
				  struct sctp_transport *transport,
				  enum sctp_transport_cmd command,
				  sctp_sn_error_t error)
{
	int spc_state = SCTP_ADDR_AVAILABLE;
	bool ulp_notify = true;

	/* Record the transition on the transport. */
	switch (command) {
	case SCTP_TRANSPORT_UP:
		/* A transport leaving the PF state is only reported when
		 * PF exposure is enabled.  If we are moving from
		 * UNCONFIRMED state due to heartbeat success, report the
		 * SCTP_ADDR_CONFIRMED state to the user, otherwise report
		 * SCTP_ADDR_AVAILABLE.
		 */
		if (transport->state == SCTP_PF &&
		    asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE)
			ulp_notify = false;
		else if (transport->state == SCTP_UNCONFIRMED &&
			 error == SCTP_HEARTBEAT_SUCCESS)
			spc_state = SCTP_ADDR_CONFIRMED;

		transport->state = SCTP_ACTIVE;
		sctp_transport_pl_reset(transport);
		break;

	case SCTP_TRANSPORT_DOWN:
		/* If the transport was never confirmed, do not transition it
		 * to inactive state.  Also, release the cached route since
		 * there may be a better route next time.
		 */
		if (transport->state != SCTP_UNCONFIRMED) {
			transport->state = SCTP_INACTIVE;
			sctp_transport_pl_reset(transport);
			spc_state = SCTP_ADDR_UNREACHABLE;
		} else {
			sctp_transport_dst_release(transport);
			ulp_notify = false;
		}
		break;

	case SCTP_TRANSPORT_PF:
		transport->state = SCTP_PF;
		if (asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE)
			ulp_notify = false;
		else
			spc_state = SCTP_ADDR_POTENTIALLY_FAILED;
		break;

	default:
		return;
	}

	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification
	 * to the user.
	 */
	if (ulp_notify)
		sctp_ulpevent_notify_peer_addr_change(transport,
						      spc_state, error);

	/* Select new active and retran paths. */
	sctp_select_active_and_retran_path(asoc);
}

/* Hold a reference to an association. */
void sctp_association_hold(struct sctp_association *asoc)
{
	refcount_inc(&asoc->base.refcnt);
}

/* Release a reference to an association and cleanup
 * if there are no more references.
 */
void sctp_association_put(struct sctp_association *asoc)
{
	if (refcount_dec_and_test(&asoc->base.refcnt))
		sctp_association_destroy(asoc);
}

/* Allocate the next TSN, Transmission Sequence Number, for the given
 * association.
 *
 * Returns the current next_tsn, then advances next_tsn and increments
 * unack_data.
 */
__u32 sctp_association_get_next_tsn(struct sctp_association *asoc)
{
	/* From Section 1.6 Serial Number Arithmetic:
	 * Transmission Sequence Numbers wrap around when they reach
	 * 2**32 - 1.  That is, the next TSN a DATA chunk MUST use
	 * after transmitting TSN = 2**32 - 1 is TSN = 0.
	 */
	__u32 retval = asoc->next_tsn;
	asoc->next_tsn++;
	asoc->unack_data++;

	return retval;
}

/* Compare two addresses to see if they match.  Wildcard addresses
 * only match themselves.
 *
 * The comparison is delegated to the address-family handler selected by
 * @ss1; returns 0 when no handler exists for that family.
 */
int sctp_cmp_addr_exact(const union sctp_addr *ss1,
			const union sctp_addr *ss2)
{
	struct sctp_af *af;

	af = sctp_get_af_specific(ss1->sa.sa_family);
	if (unlikely(!af))
		return 0;

	return af->cmp_addr(ss1, ss2);
}

/* Return an ecne chunk to get prepended to a packet.
 * Note:  We are sly and return a shared, prealloced chunk.  FIXME:
 * No we don't, but we could/should.
 *
 * Returns NULL when no ECNE is needed.
 */
struct sctp_chunk *sctp_get_ecne_prepend(struct sctp_association *asoc)
{
	if (!asoc->need_ecne)
		return NULL;

	/* Send ECNE if needed.
	 * Not being able to allocate a chunk here is not deadly.
	 */
	return sctp_make_ecne(asoc, asoc->last_ecne_tsn);
}

/*
 * Find which transport this TSN was sent on.
 *
 * Returns the transport whose transmitted list holds a chunk with the
 * given TSN, or NULL if no transport does.
 */
struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *asoc,
					     __u32 tsn)
{
	struct sctp_transport *active;
	struct sctp_transport *match;
	struct sctp_transport *transport;
	struct sctp_chunk *chunk;
	/* Converted once so it can be compared directly with the tsn
	 * field of each chunk's data header.
	 */
	__be32 key = htonl(tsn);

	match = NULL;

	/*
	 * FIXME: In general, find a more efficient data structure for
	 * searching.
	 */

	/*
	 * The general strategy is to search each transport's transmitted
	 * list.   Return which transport this TSN lives on.
	 *
	 * Let's be hopeful and check the active_path first.
	 * Another optimization would be to know if there is only one
	 * outbound path and not have to look for the TSN at all.
	 *
	 */

	active = asoc->peer.active_path;

	list_for_each_entry(chunk, &active->transmitted,
			transmitted_list) {

		if (key == chunk->subh.data_hdr->tsn) {
			match = active;
			goto out;
		}
	}

	/* If not found, go search all the other transports. */
	list_for_each_entry(transport, &asoc->peer.transport_addr_list,
			transports) {

		if (transport == active)
			continue;
		list_for_each_entry(chunk, &transport->transmitted,
				transmitted_list) {
			if (key == chunk->subh.data_hdr->tsn) {
				match = transport;
				goto out;
			}
		}
	}
out:
	return match;
}

/* Do delayed input processing.  This is scheduled by sctp_rcv().
 *
 * Pops chunks off the association's inqueue and feeds each one to the
 * state machine, holding a reference on the association for the
 * duration.  Stops early if the association is marked dead.
 */
static void sctp_assoc_bh_rcv(struct work_struct *work)
{
	struct sctp_association *asoc =
		container_of(work, struct sctp_association,
			     base.inqueue.immediate);
	struct net *net = asoc->base.net;
	union sctp_subtype subtype;
	struct sctp_endpoint *ep;
	struct sctp_chunk *chunk;
	struct sctp_inq *inqueue;
	int first_time = 1;	/* is this the first time through the loop */
	int error = 0;
	int state;

	/* The association should be held so we should be safe. */
	ep = asoc->ep;

	inqueue = &asoc->base.inqueue;
	sctp_association_hold(asoc);
	while (NULL != (chunk = sctp_inq_pop(inqueue))) {
		state = asoc->state;
		subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type);

		/* If the first chunk in the packet is AUTH, do special
		 * processing specified in Section 6.3 of SCTP-AUTH spec
		 */
		if (first_time && subtype.chunk == SCTP_CID_AUTH) {
			struct sctp_chunkhdr *next_hdr;

			next_hdr = sctp_inq_peek(inqueue);
			if (!next_hdr)
				goto normal;

			/* If the next chunk is COOKIE-ECHO, skip the AUTH
			 * chunk while saving a pointer to it so we can do
			 * Authentication later (during cookie-echo
			 * processing).
			 * NOTE(review): the skb_clone() result is not
			 * checked here - confirm consumers of auth_chunk
			 * handle NULL as intended.
			 */
			if (next_hdr->type == SCTP_CID_COOKIE_ECHO) {
				chunk->auth_chunk = skb_clone(chunk->skb,
							      GFP_ATOMIC);
				chunk->auth = 1;
				continue;
			}
		}

normal:
		/* SCTP-AUTH, Section 6.3:
		 *    The receiver has a list of chunk types which it expects
		 *    to be received only after an AUTH-chunk.  This list has
		 *    been sent to the peer during the association setup.  It
		 *    MUST silently discard these chunks if they are not placed
		 *    after an AUTH chunk in the packet.
		 */
		if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth)
			continue;

		/* Remember where the last DATA chunk came from so we
		 * know where to send the SACK.
		 */
		if (sctp_chunk_is_data(chunk))
			asoc->peer.last_data_from = chunk->transport;
		else {
			SCTP_INC_STATS(net, SCTP_MIB_INCTRLCHUNKS);
			asoc->stats.ictrlchunks++;
			if (chunk->chunk_hdr->type == SCTP_CID_SACK)
				asoc->stats.isacks++;
		}

		if (chunk->transport)
			chunk->transport->last_time_heard = ktime_get();

		/* Run through the state machine. */
		error = sctp_do_sm(net, SCTP_EVENT_T_CHUNK, subtype,
				   state, ep, asoc, chunk, GFP_ATOMIC);

		/* Check to see if the association is freed in response to
		 * the incoming chunk.  If so, get out of the while loop.
		 */
		if (asoc->base.dead)
			break;

		/* If there is an error on chunk, discard this packet. */
		if (error && chunk)
			chunk->pdiscard = 1;

		if (first_time)
			first_time = 0;
	}
	sctp_association_put(asoc);
}

/* This routine moves an association from its old sk to a new sk.
 *
 * The references held on the old endpoint and socket are swapped for
 * references on the endpoint and socket of @newsk.
 */
void sctp_assoc_migrate(struct sctp_association *assoc, struct sock *newsk)
{
	struct sctp_sock *newsp = sctp_sk(newsk);
	struct sock *oldsk = assoc->base.sk;

	/* Delete the association from the old endpoint's list of
	 * associations.
	 */
	list_del_init(&assoc->asocs);

	/* Decrement the backlog value for a TCP-style socket. */
	if (sctp_style(oldsk, TCP))
		sk_acceptq_removed(oldsk);

	/* Release references to the old endpoint and the sock. */
	sctp_endpoint_put(assoc->ep);
	sock_put(assoc->base.sk);

	/* Get a reference to the new endpoint. */
	assoc->ep = newsp->ep;
	sctp_endpoint_hold(assoc->ep);

	/* Get a reference to the new sock. */
	assoc->base.sk = newsk;
	sock_hold(assoc->base.sk);

	/* Add the association to the new endpoint's list of associations. */
	sctp_endpoint_add_asoc(newsp->ep, assoc);
}

/* Update an association (possibly from unexpected COOKIE-ECHO processing).
*/
int sctp_assoc_update(struct sctp_association *asoc,
		      struct sctp_association *new)
{
	struct sctp_transport *t, *next;

	/* Take over the peer-side parameters carried by @new. */
	asoc->c = new->c;
	asoc->peer.rwnd = new->peer.rwnd;
	asoc->peer.sack_needed = new->peer.sack_needed;
	asoc->peer.auth_capable = new->peer.auth_capable;
	asoc->peer.i = new->peer.i;

	if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
			      asoc->peer.i.initial_tsn, GFP_ATOMIC))
		return -ENOMEM;

	/* Keep only the transports @new also knows about.  Survivors are
	 * reset when the association is already established or beyond.
	 */
	list_for_each_entry_safe(t, next, &asoc->peer.transport_addr_list,
				 transports) {
		if (sctp_assoc_lookup_paddr(new, &t->ipaddr)) {
			if (asoc->state >= SCTP_STATE_ESTABLISHED)
				sctp_transport_reset(t);
		} else {
			sctp_assoc_rm_peer(asoc, t);
		}
	}

	/* Case A (association restart) takes next_tsn from @new.
	 * Case B keeps the current next_tsn, because data already sent
	 * to the peer may have been discarded and need retransmission.
	 */
	if (asoc->state < SCTP_STATE_ESTABLISHED) {
		/* Case B: pull in every peer address @new carries. */
		list_for_each_entry(t, &new->peer.transport_addr_list,
				    transports) {
			if (!sctp_assoc_add_peer(asoc, &t->ipaddr,
						 GFP_ATOMIC, t->state))
				return -ENOMEM;
		}

		asoc->ctsn_ack_point = asoc->next_tsn - 1;
		asoc->adv_peer_ack_point = asoc->ctsn_ack_point;

		if (sctp_state(asoc, COOKIE_WAIT))
			sctp_stream_update(&asoc->stream, &new->stream);

		/* Make sure the association has an id. */
		if (sctp_assoc_set_id(asoc, GFP_ATOMIC))
			return -ENOMEM;
	} else {
		/* Case A: restart. */
		asoc->next_tsn = new->next_tsn;
		asoc->ctsn_ack_point = new->ctsn_ack_point;
		asoc->adv_peer_ack_point = new->adv_peer_ack_point;

		/* Start the SSNs over for our streams and the peer's. */
		sctp_stream_clear(&asoc->stream);

		/* Whatever sits in the ULP reassembly and ordering queues
		 * is stale now and would only cause trouble.
		 */
		sctp_ulpq_flush(&asoc->ulpq);

		/* Clear the overall error count so the restarted
		 * association is not torn down by the next
		 * retransmission timer.
		 */
		asoc->overall_error_count = 0;
	}

	/* SCTP-AUTH: take ownership of the peer parameters held by @new,
	 * releasing whatever this association held before.
	 */
	kfree(asoc->peer.peer_random);
	asoc->peer.peer_random = new->peer.peer_random;
	new->peer.peer_random = NULL;

	kfree(asoc->peer.peer_chunks);
	asoc->peer.peer_chunks = new->peer.peer_chunks;
	new->peer.peer_chunks = NULL;

	kfree(asoc->peer.peer_hmacs);
	asoc->peer.peer_hmacs = new->peer.peer_hmacs;
	new->peer.peer_hmacs = NULL;

	return sctp_auth_asoc_init_active_key(asoc, GFP_ATOMIC);
}

/* Update the retran path for sending a retransmitted packet.
 * See also RFC4960, 6.4. Multi-Homed SCTP Endpoints:
 *
 *   When there is outbound data to send and the primary path
 *   becomes inactive (e.g., due to failures), or where the
 *   SCTP user explicitly requests to send data to an
 *   inactive destination transport address, before reporting
 *   an error to its ULP, the SCTP endpoint should try to send
 *   the data to an alternate active destination transport
 *   address if one exists.
 *
 *   When retransmitting data that timed out, if the endpoint
 *   is multihomed, it should consider each source-destination
 *   address pair in its retransmission selection policy.
 *   When retransmitting timed-out data, the endpoint should
 *   attempt to pick the most divergent source-destination
 *   pair from the original source-destination pair to which
 *   the packet was transmitted.
 *
 *   Note: Rules for picking the most divergent source-destination
 *   pair are an implementation decision and are not specified
 *   within this document.
* * Our basic strategy is to round-robin transports in priorities * according to sctp_trans_score() e.g., if no such * transport with state SCTP_ACTIVE exists, round-robin through * SCTP_UNKNOWN, etc. You get the picture. */ static u8 sctp_trans_score(const struct sctp_transport *trans) { switch (trans->state) { case SCTP_ACTIVE: return 3; /* best case */ case SCTP_UNKNOWN: return 2; case SCTP_PF: return 1; default: /* case SCTP_INACTIVE */ return 0; /* worst case */ } } static struct sctp_transport *sctp_trans_elect_tie(struct sctp_transport *trans1, struct sctp_transport *trans2) { if (trans1->error_count > trans2->error_count) { return trans2; } else if (trans1->error_count == trans2->error_count && ktime_after(trans2->last_time_heard, trans1->last_time_heard)) { return trans2; } else { return trans1; } } static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr, struct sctp_transport *best) { u8 score_curr, score_best; if (best == NULL || curr == best) return curr; score_curr = sctp_trans_score(curr); score_best = sctp_trans_score(best); /* First, try a score-based selection if both transport states * differ. If we're in a tie, lets try to make a more clever * decision here based on error counts and last time heard. */ if (score_curr > score_best) return curr; else if (score_curr == score_best) return sctp_trans_elect_tie(best, curr); else return best; } void sctp_assoc_update_retran_path(struct sctp_association *asoc) { struct sctp_transport *trans = asoc->peer.retran_path; struct sctp_transport *trans_next = NULL; /* We're done as we only have the one and only path. */ if (asoc->peer.transport_count == 1) return; /* If active_path and retran_path are the same and active, * then this is the only active path. Use it. */ if (asoc->peer.active_path == asoc->peer.retran_path && asoc->peer.active_path->state == SCTP_ACTIVE) return; /* Iterate from retran_path's successor back to retran_path. 
*/ for (trans = list_next_entry(trans, transports); 1; trans = list_next_entry(trans, transports)) { /* Manually skip the head element. */ if (&trans->transports == &asoc->peer.transport_addr_list) continue; if (trans->state == SCTP_UNCONFIRMED) continue; trans_next = sctp_trans_elect_best(trans, trans_next); /* Active is good enough for immediate return. */ if (trans_next->state == SCTP_ACTIVE) break; /* We've reached the end, time to update path. */ if (trans == asoc->peer.retran_path) break; } asoc->peer.retran_path = trans_next; pr_debug("%s: association:%p updated new path to addr:%pISpc\n", __func__, asoc, &asoc->peer.retran_path->ipaddr.sa); } static void sctp_select_active_and_retran_path(struct sctp_association *asoc) { struct sctp_transport *trans, *trans_pri = NULL, *trans_sec = NULL; struct sctp_transport *trans_pf = NULL; /* Look for the two most recently used active transports. */ list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { /* Skip uninteresting transports. */ if (trans->state == SCTP_INACTIVE || trans->state == SCTP_UNCONFIRMED) continue; /* Keep track of the best PF transport from our * list in case we don't find an active one. */ if (trans->state == SCTP_PF) { trans_pf = sctp_trans_elect_best(trans, trans_pf); continue; } /* For active transports, pick the most recent ones. */ if (trans_pri == NULL || ktime_after(trans->last_time_heard, trans_pri->last_time_heard)) { trans_sec = trans_pri; trans_pri = trans; } else if (trans_sec == NULL || ktime_after(trans->last_time_heard, trans_sec->last_time_heard)) { trans_sec = trans; } } /* RFC 2960 6.4 Multi-Homed SCTP Endpoints * * By default, an endpoint should always transmit to the primary * path, unless the SCTP user explicitly specifies the * destination transport address (and possibly source transport * address) to use. [If the primary is active but not most recent, * bump the most recently used transport.] 
*/ if ((asoc->peer.primary_path->state == SCTP_ACTIVE || asoc->peer.primary_path->state == SCTP_UNKNOWN) && asoc->peer.primary_path != trans_pri) { trans_sec = trans_pri; trans_pri = asoc->peer.primary_path; } /* We did not find anything useful for a possible retransmission * path; either primary path that we found is the same as * the current one, or we didn't generally find an active one. */ if (trans_sec == NULL) trans_sec = trans_pri; /* If we failed to find a usable transport, just camp on the * active or pick a PF iff it's the better choice. */ if (trans_pri == NULL) { trans_pri = sctp_trans_elect_best(asoc->peer.active_path, trans_pf); trans_sec = trans_pri; } /* Set the active and retran transports. */ asoc->peer.active_path = trans_pri; asoc->peer.retran_path = trans_sec; } struct sctp_transport * sctp_assoc_choose_alter_transport(struct sctp_association *asoc, struct sctp_transport *last_sent_to) { /* If this is the first time packet is sent, use the active path, * else use the retran path. If the last packet was sent over the * retran path, update the retran path and use it. */ if (last_sent_to == NULL) { return asoc->peer.active_path; } else { if (last_sent_to == asoc->peer.retran_path) sctp_assoc_update_retran_path(asoc); return asoc->peer.retran_path; } } void sctp_assoc_update_frag_point(struct sctp_association *asoc) { int frag = sctp_mtu_payload(sctp_sk(asoc->base.sk), asoc->pathmtu, sctp_datachk_len(&asoc->stream)); if (asoc->user_frag) frag = min_t(int, frag, asoc->user_frag); frag = min_t(int, frag, SCTP_MAX_CHUNK_LEN - sctp_datachk_len(&asoc->stream)); asoc->frag_point = SCTP_TRUNC4(frag); } void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu) { if (asoc->pathmtu != pmtu) { asoc->pathmtu = pmtu; sctp_assoc_update_frag_point(asoc); } pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc, asoc->pathmtu, asoc->frag_point); } /* Update the association's pmtu and frag_point by going through all the * transports. 
This routine is called when a transport's PMTU has changed. */ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) { struct sctp_transport *t; __u32 pmtu = 0; if (!asoc) return; /* Get the lowest pmtu of all the transports. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (t->pmtu_pending && t->dst) { sctp_transport_update_pmtu(t, atomic_read(&t->mtu_info)); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) pmtu = t->pathmtu; } sctp_assoc_set_pmtu(asoc, pmtu); } /* Should we send a SACK to update our peer? */ static inline bool sctp_peer_needs_update(struct sctp_association *asoc) { struct net *net = asoc->base.net; switch (asoc->state) { case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: case SCTP_STATE_SHUTDOWN_RECEIVED: case SCTP_STATE_SHUTDOWN_SENT: if ((asoc->rwnd > asoc->a_rwnd) && ((asoc->rwnd - asoc->a_rwnd) >= max_t(__u32, (asoc->base.sk->sk_rcvbuf >> net->sctp.rwnd_upd_shift), asoc->pathmtu))) return true; break; default: break; } return false; } /* Increase asoc's rwnd by len and send any window update SACK if needed. */ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) { struct sctp_chunk *sack; struct timer_list *timer; if (asoc->rwnd_over) { if (asoc->rwnd_over >= len) { asoc->rwnd_over -= len; } else { asoc->rwnd += (len - asoc->rwnd_over); asoc->rwnd_over = 0; } } else { asoc->rwnd += len; } /* If we had window pressure, start recovering it * once our rwnd had reached the accumulated pressure * threshold. The idea is to recover slowly, but up * to the initial advertised window. */ if (asoc->rwnd_press) { int change = min(asoc->pathmtu, asoc->rwnd_press); asoc->rwnd += change; asoc->rwnd_press -= change; } pr_debug("%s: asoc:%p rwnd increased by %d to (%u, %u) - %u\n", __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, asoc->a_rwnd); /* Send a window update SACK if the rwnd has increased by at least the * minimum of the association's PMTU and half of the receive buffer. 
* The algorithm used is similar to the one described in * Section 4.2.3.3 of RFC 1122. */ if (sctp_peer_needs_update(asoc)) { asoc->a_rwnd = asoc->rwnd; pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u " "a_rwnd:%u\n", __func__, asoc, asoc->rwnd, asoc->a_rwnd); sack = sctp_make_sack(asoc); if (!sack) return; asoc->peer.sack_needed = 0; sctp_outq_tail(&asoc->outqueue, sack, GFP_ATOMIC); /* Stop the SACK timer. */ timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; if (timer_delete(timer)) sctp_association_put(asoc); } } /* Decrease asoc's rwnd by len. */ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len) { int rx_count; int over = 0; if (unlikely(!asoc->rwnd || asoc->rwnd_over)) pr_debug("%s: association:%p has asoc->rwnd:%u, " "asoc->rwnd_over:%u!\n", __func__, asoc, asoc->rwnd, asoc->rwnd_over); if (asoc->ep->rcvbuf_policy) rx_count = atomic_read(&asoc->rmem_alloc); else rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); /* If we've reached or overflowed our receive buffer, announce * a 0 rwnd if rwnd would still be positive. Store the * potential pressure overflow so that the window can be restored * back to original value. */ if (rx_count >= asoc->base.sk->sk_rcvbuf) over = 1; if (asoc->rwnd >= len) { asoc->rwnd -= len; if (over) { asoc->rwnd_press += asoc->rwnd; asoc->rwnd = 0; } } else { asoc->rwnd_over += len - asoc->rwnd; asoc->rwnd = 0; } pr_debug("%s: asoc:%p rwnd decreased by %d to (%u, %u, %u)\n", __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, asoc->rwnd_press); } /* Build the bind address list for the association based on info from the * local endpoint and the remote peer. */ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, enum sctp_scope scope, gfp_t gfp) { struct sock *sk = asoc->base.sk; int flags; /* Use scoping rules to determine the subset of addresses from * the endpoint. */ flags = (PF_INET6 == sk->sk_family) ? 
SCTP_ADDR6_ALLOWED : 0; if (!inet_v6_ipv6only(sk)) flags |= SCTP_ADDR4_ALLOWED; if (asoc->peer.ipv4_address) flags |= SCTP_ADDR4_PEERSUPP; if (asoc->peer.ipv6_address) flags |= SCTP_ADDR6_PEERSUPP; return sctp_bind_addr_copy(asoc->base.net, &asoc->base.bind_addr, &asoc->ep->base.bind_addr, scope, gfp, flags); } /* Build the association's bind address list from the cookie. */ int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc, struct sctp_cookie *cookie, gfp_t gfp) { struct sctp_init_chunk *peer_init = (struct sctp_init_chunk *)(cookie + 1); int var_size2 = ntohs(peer_init->chunk_hdr.length); int var_size3 = cookie->raw_addr_list_len; __u8 *raw = (__u8 *)peer_init + var_size2; return sctp_raw_to_bind_addrs(&asoc->base.bind_addr, raw, var_size3, asoc->ep->base.bind_addr.port, gfp); } /* Lookup laddr in the bind address list of an association. */ int sctp_assoc_lookup_laddr(struct sctp_association *asoc, const union sctp_addr *laddr) { int found = 0; if ((asoc->base.bind_addr.port == ntohs(laddr->v4.sin_port)) && sctp_bind_addr_match(&asoc->base.bind_addr, laddr, sctp_sk(asoc->base.sk))) found = 1; return found; } /* Set an association id for a given association */ int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp) { bool preload = gfpflags_allow_blocking(gfp); int ret; /* If the id is already assigned, keep it. */ if (asoc->assoc_id) return 0; if (preload) idr_preload(gfp); spin_lock_bh(&sctp_assocs_id_lock); /* 0, 1, 2 are used as SCTP_FUTURE_ASSOC, SCTP_CURRENT_ASSOC and * SCTP_ALL_ASSOC, so an available id must be > SCTP_ALL_ASSOC. 
*/ ret = idr_alloc_cyclic(&sctp_assocs_id, asoc, SCTP_ALL_ASSOC + 1, 0, GFP_NOWAIT); spin_unlock_bh(&sctp_assocs_id_lock); if (preload) idr_preload_end(); if (ret < 0) return ret; asoc->assoc_id = (sctp_assoc_t)ret; return 0; } /* Free the ASCONF queue */ static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc) { struct sctp_chunk *asconf; struct sctp_chunk *tmp; list_for_each_entry_safe(asconf, tmp, &asoc->addip_chunk_list, list) { list_del_init(&asconf->list); sctp_chunk_free(asconf); } } /* Free asconf_ack cache */ static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc) { struct sctp_chunk *ack; struct sctp_chunk *tmp; list_for_each_entry_safe(ack, tmp, &asoc->asconf_ack_list, transmitted_list) { list_del_init(&ack->transmitted_list); sctp_chunk_free(ack); } } /* Clean up the ASCONF_ACK queue */ void sctp_assoc_clean_asconf_ack_cache(const struct sctp_association *asoc) { struct sctp_chunk *ack; struct sctp_chunk *tmp; /* We can remove all the entries from the queue up to * the "Peer-Sequence-Number". */ list_for_each_entry_safe(ack, tmp, &asoc->asconf_ack_list, transmitted_list) { if (ack->subh.addip_hdr->serial == htonl(asoc->peer.addip_serial)) break; list_del_init(&ack->transmitted_list); sctp_chunk_free(ack); } } /* Find the ASCONF_ACK whose serial number matches ASCONF */ struct sctp_chunk *sctp_assoc_lookup_asconf_ack( const struct sctp_association *asoc, __be32 serial) { struct sctp_chunk *ack; /* Walk through the list of cached ASCONF-ACKs and find the * ack chunk whose serial number matches that of the request. */ list_for_each_entry(ack, &asoc->asconf_ack_list, transmitted_list) { if (sctp_chunk_pending(ack)) continue; if (ack->subh.addip_hdr->serial == serial) { sctp_chunk_hold(ack); return ack; } } return NULL; } void sctp_asconf_queue_teardown(struct sctp_association *asoc) { /* Free any cached ASCONF_ACK chunk. */ sctp_assoc_free_asconf_acks(asoc); /* Free the ASCONF queue. 
*/ sctp_assoc_free_asconf_queue(asoc); /* Free any cached ASCONF chunk. */ if (asoc->addip_last_asconf) sctp_chunk_free(asoc->addip_last_asconf); }
16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/mount.c - kernfs mount implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun 
Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/seq_file.h> #include <linux/exportfs.h> #include <linux/uuid.h> #include <linux/statfs.h> #include "kernfs-internal.h" struct kmem_cache *kernfs_node_cache __ro_after_init; struct kmem_cache *kernfs_iattrs_cache __ro_after_init; struct kernfs_global_locks *kernfs_locks __ro_after_init; static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry)); struct kernfs_syscall_ops *scops = root->syscall_ops; if (scops && scops->show_options) return scops->show_options(sf, root); return 0; } static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) { struct kernfs_node *node = kernfs_dentry_node(dentry); struct kernfs_root *root = kernfs_root(node); struct kernfs_syscall_ops *scops = root->syscall_ops; if (scops && scops->show_path) return scops->show_path(sf, node, root); seq_dentry(sf, dentry, " \t\n\\"); return 0; } static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) { simple_statfs(dentry, buf); buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); return 0; } const struct super_operations kernfs_sops = { .statfs = kernfs_statfs, .drop_inode = inode_just_drop, .evict_inode = kernfs_evict_inode, .show_options = kernfs_sop_show_options, .show_path = kernfs_sop_show_path, /* * sysfs is built on top of kernfs and sysfs provides the power * management infrastructure to support suspend/hibernate by * writing to various files in /sys/power/. As filesystems may * be automatically frozen during suspend/hibernate implementing * freeze/thaw support for kernfs generically will cause * deadlocks as the suspending/hibernation initiating task will * hold a VFS lock that it will then wait upon to be released. 
* If freeze/thaw for kernfs is needed talk to the VFS. */ .freeze_fs = NULL, .unfreeze_fs = NULL, .freeze_super = NULL, .thaw_super = NULL, }; static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { struct kernfs_node *kn = inode->i_private; if (*max_len < 2) { *max_len = 2; return FILEID_INVALID; } *max_len = 2; *(u64 *)fh = kn->id; return FILEID_KERNFS; } static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, bool get_parent) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_node *kn; struct inode *inode; u64 id; if (fh_len < 2) return NULL; switch (fh_type) { case FILEID_KERNFS: id = *(u64 *)fid; break; case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: /* * blk_log_action() exposes "LOW32,HIGH32" pair without * type and userland can call us with generic fid * constructed from them. Combine it back to ID. See * blk_log_action(). */ id = ((u64)fid->i32.gen << 32) | fid->i32.ino; break; default: return NULL; } kn = kernfs_find_and_get_node_by_id(info->root, id); if (!kn) return ERR_PTR(-ESTALE); if (get_parent) { struct kernfs_node *parent; parent = kernfs_get_parent(kn); kernfs_put(kn); kn = parent; if (!kn) return ERR_PTR(-ESTALE); } inode = kernfs_get_inode(sb, kn); kernfs_put(kn); return d_obtain_alias(inode); } static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false); } static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true); } static struct dentry *kernfs_get_parent_dentry(struct dentry *child) { struct kernfs_node *kn = kernfs_dentry_node(child); struct kernfs_root *root = kernfs_root(kn); guard(rwsem_read)(&root->kernfs_rwsem); return d_obtain_alias(kernfs_get_inode(child->d_sb, kernfs_parent(kn))); } static const 
struct export_operations kernfs_export_ops = { .encode_fh = kernfs_encode_fh, .fh_to_dentry = kernfs_fh_to_dentry, .fh_to_parent = kernfs_fh_to_parent, .get_parent = kernfs_get_parent_dentry, }; /** * kernfs_root_from_sb - determine kernfs_root associated with a super_block * @sb: the super_block in question * * Return: the kernfs_root associated with @sb. If @sb is not a kernfs one, * %NULL is returned. */ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { if (sb->s_op == &kernfs_sops) return kernfs_info(sb)->root; return NULL; } /* * find the next ancestor in the path down to @child, where @parent was the * ancestor whose descendant we want to find. * * Say the path is /a/b/c/d. @child is d, @parent is %NULL. We return the root * node. If @parent is b, then we return the node for c. * Passing in d as @parent is not ok. */ static struct kernfs_node *find_next_ancestor(struct kernfs_node *child, struct kernfs_node *parent) { if (child == parent) { pr_crit_once("BUG in find_next_ancestor: called with parent == child"); return NULL; } while (kernfs_parent(child) != parent) { child = kernfs_parent(child); if (!child) return NULL; } return child; } /** * kernfs_node_dentry - get a dentry for the given kernfs_node * @kn: kernfs_node for which a dentry is needed * @sb: the kernfs super_block * * Return: the dentry pointer */ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb) { struct dentry *dentry; struct kernfs_node *knparent; struct kernfs_root *root; BUG_ON(sb->s_op != &kernfs_sops); dentry = dget(sb->s_root); /* Check if this is the root kernfs_node */ if (!rcu_access_pointer(kn->__parent)) return dentry; root = kernfs_root(kn); /* * As long as kn is valid, its parent can not vanish. This is cgroup's * kn so it can't have its parent replaced. Therefore it is safe to use * the ancestor node outside of the RCU or locked section. 
*/ if (WARN_ON_ONCE(!(root->flags & KERNFS_ROOT_INVARIANT_PARENT))) return ERR_PTR(-EINVAL); scoped_guard(rcu) { knparent = find_next_ancestor(kn, NULL); } if (WARN_ON(!knparent)) { dput(dentry); return ERR_PTR(-EINVAL); } do { struct dentry *dtmp; struct kernfs_node *kntmp; const char *name; if (kn == knparent) return dentry; scoped_guard(rwsem_read, &root->kernfs_rwsem) { kntmp = find_next_ancestor(kn, knparent); if (WARN_ON(!kntmp)) { dput(dentry); return ERR_PTR(-EINVAL); } name = kstrdup(kernfs_rcu_name(kntmp), GFP_KERNEL); } if (!name) { dput(dentry); return ERR_PTR(-ENOMEM); } dtmp = lookup_noperm_positive_unlocked(&QSTR(name), dentry); dput(dentry); kfree(name); if (IS_ERR(dtmp)) return dtmp; knparent = kntmp; dentry = dtmp; } while (true); } static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *kf_root = kfc->root; struct inode *inode; struct dentry *root; info->sb = sb; /* Userspace would break if executables or devices appear on sysfs */ sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = kfc->magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; sb->s_maxbytes = MAX_LFS_FILESIZE; /* sysfs dentries and inodes don't require IO to create */ sb->s_shrink->seeks = 0; /* get root inode, initialize and unlock it */ down_read(&kf_root->kernfs_rwsem); inode = kernfs_get_inode(sb, info->root->kn); up_read(&kf_root->kernfs_rwsem); if (!inode) { pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; } /* instantiate and link root dentry */ root = d_make_root(inode); if (!root) { pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } sb->s_root = root; set_default_d_op(sb, &kernfs_dops); return 0; } static int kernfs_test_super(struct super_block 
*sb, struct fs_context *fc) { struct kernfs_super_info *sb_info = kernfs_info(sb); struct kernfs_super_info *info = fc->s_fs_info; return sb_info->root == info->root && sb_info->ns == info->ns; } static int kernfs_set_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; kfc->ns_tag = NULL; return set_anon_super_fc(sb, fc); } /** * kernfs_super_ns - determine the namespace tag of a kernfs super_block * @sb: super_block of interest * * Return: the namespace tag associated with kernfs super_block @sb. */ const void *kernfs_super_ns(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); return info->ns; } /** * kernfs_get_tree - kernfs filesystem access/retrieval helper * @fc: The filesystem context. * * This is to be called from each kernfs user's fs_context->ops->get_tree() * implementation, which should set the specified ->@fs_type and ->@flags, and * specify the hierarchy and namespace tag to mount via ->@root and ->@ns, * respectively. * * Return: %0 on success, -errno on failure. 
*/ int kernfs_get_tree(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; struct super_block *sb; struct kernfs_super_info *info; int error; info = kzalloc_obj(*info); if (!info) return -ENOMEM; info->root = kfc->root; info->ns = kfc->ns_tag; INIT_LIST_HEAD(&info->node); fc->s_fs_info = info; sb = sget_fc(fc, kernfs_test_super, kernfs_set_super); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = kfc->root; kfc->new_sb_created = true; error = kernfs_fill_super(sb, kfc); if (error) { deactivate_locked_super(sb); return error; } sb->s_flags |= SB_ACTIVE; uuid_t uuid; uuid_gen(&uuid); super_set_uuid(sb, uuid.b, sizeof(uuid)); down_write(&root->kernfs_supers_rwsem); list_add(&info->node, &info->root->supers); up_write(&root->kernfs_supers_rwsem); } fc->root = dget(sb->s_root); return 0; } void kernfs_free_fs_context(struct fs_context *fc) { /* Note that we don't deal with kfc->ns_tag here. */ kfree(fc->s_fs_info); fc->s_fs_info = NULL; } /** * kernfs_kill_sb - kill_sb for kernfs * @sb: super_block being killed * * This can be used directly for file_system_type->kill_sb(). If a kernfs * user needs extra cleanup, it can implement its own kill_sb() and call * this function at the end. */ void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = info->root; down_write(&root->kernfs_supers_rwsem); list_del(&info->node); up_write(&root->kernfs_supers_rwsem); /* * Remove the superblock from fs_supers/s_instances * so we can't find it, before freeing kernfs_super_info. 
*/ kill_anon_super(sb); kfree(info); } static void __init kernfs_mutex_init(void) { int count; for (count = 0; count < NR_KERNFS_LOCKS; count++) mutex_init(&kernfs_locks->open_file_mutex[count]); } static void __init kernfs_lock_init(void) { kernfs_locks = kmalloc_obj(struct kernfs_global_locks); WARN_ON(!kernfs_locks); kernfs_mutex_init(); } void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), 0, SLAB_PANIC, NULL); /* Creates slab cache for kernfs inode attributes */ kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache", sizeof(struct kernfs_iattrs), 0, SLAB_PANIC, NULL); kernfs_lock_init(); }
163 7 7 7 7 571 421 164 164 163 164 4 3 1 5 5 12 168 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 // SPDX-License-Identifier: GPL-2.0 #include "cgroup-internal.h" #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> #include <linux/nstree.h> /* cgroup namespaces */ static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); } static void dec_cgroup_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); } static struct cgroup_namespace *alloc_cgroup_ns(void) { struct cgroup_namespace *new_ns __free(kfree) = NULL; int ret; new_ns = kzalloc_obj(struct cgroup_namespace, GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_common_init(new_ns); if (ret) return ERR_PTR(ret); return no_free_ptr(new_ns); } void free_cgroup_ns(struct cgroup_namespace *ns) { ns_tree_remove(ns); put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } EXPORT_SYMBOL(free_cgroup_ns); struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { struct cgroup_namespace *new_ns; struct ucounts *ucounts; struct css_set *cset; BUG_ON(!old_ns); if (!(flags & CLONE_NEWCGROUP)) { get_cgroup_ns(old_ns); return old_ns; } /* Allow only sysadmin to create cgroup namespace. 
*/ if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); ucounts = inc_cgroup_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); spin_unlock_irq(&css_set_lock); new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { put_css_set(cset); dec_cgroup_namespaces(ucounts); return new_ns; } new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; new_ns->root_cset = cset; ns_tree_add(new_ns); return new_ns; } static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Don't need to do anything if we are attaching to our own cgroupns. */ if (cgroup_ns == nsproxy->cgroup_ns) return 0; get_cgroup_ns(cgroup_ns); put_cgroup_ns(nsproxy->cgroup_ns); nsproxy->cgroup_ns = cgroup_ns; return 0; } static struct ns_common *cgroupns_get(struct task_struct *task) { struct cgroup_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->cgroup_ns; get_cgroup_ns(ns); } task_unlock(task); return ns ? &ns->ns : NULL; } static void cgroupns_put(struct ns_common *ns) { put_cgroup_ns(to_cg_ns(ns)); } static struct user_namespace *cgroupns_owner(struct ns_common *ns) { return to_cg_ns(ns)->user_ns; } const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, .owner = cgroupns_owner, };
6 6 6 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 // SPDX-License-Identifier: GPL-2.0 /* * Functions related to generic timeout handling of requests. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/blkdev.h> #include <linux/fault-inject.h> #include "blk.h" #include "blk-mq.h" #ifdef CONFIG_FAIL_IO_TIMEOUT static DECLARE_FAULT_ATTR(fail_io_timeout); static int __init setup_fail_io_timeout(char *str) { return setup_fault_attr(&fail_io_timeout, str); } __setup("fail_io_timeout=", setup_fail_io_timeout); bool __blk_should_fake_timeout(struct request_queue *q) { return should_fail(&fail_io_timeout, 1); } EXPORT_SYMBOL_GPL(__blk_should_fake_timeout); static int __init fail_io_timeout_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", NULL, &fail_io_timeout); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_io_timeout_debugfs); ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags); return sprintf(buf, "%d\n", set != 0); } ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct gendisk *disk = dev_to_disk(dev); int val; if (count) { struct request_queue *q = disk->queue; char *p = (char *) buf; val = simple_strtoul(p, &p, 10); if (val) blk_queue_flag_set(QUEUE_FLAG_FAIL_IO, q); else 
blk_queue_flag_clear(QUEUE_FLAG_FAIL_IO, q); } return count; } #endif /* CONFIG_FAIL_IO_TIMEOUT */ /** * blk_abort_request - Request recovery for the specified command * @req: pointer to the request of interest * * This function requests that the block layer start recovery for the * request by deleting the timer and calling the q's timeout function. * LLDDs who implement their own error recovery MAY ignore the timeout * event if they generated blk_abort_request. */ void blk_abort_request(struct request *req) { /* * All we need to ensure is that timeout scan takes place * immediately and that scan sees the new timeout value. * No need for fancy synchronizations. */ WRITE_ONCE(req->deadline, jiffies); kblockd_schedule_work(&req->q->timeout_work); } EXPORT_SYMBOL_GPL(blk_abort_request); static unsigned long blk_timeout_mask __read_mostly; static int __init blk_timeout_init(void) { blk_timeout_mask = roundup_pow_of_two(HZ) - 1; return 0; } late_initcall(blk_timeout_init); /* * Just a rough estimate, we don't care about specific values for timeouts. */ static inline unsigned long blk_round_jiffies(unsigned long j) { return (j + blk_timeout_mask) + 1; } unsigned long blk_rq_timeout(unsigned long timeout) { unsigned long maxt; maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT); if (time_after(timeout, maxt)) timeout = maxt; return timeout; } /** * blk_add_timer - Start timeout timer for a single request * @req: request that is about to start running. * * Notes: * Each request has its own timer, and as it is added to the queue, we * set up the timer. When the request completes, we cancel the timer. */ void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; /* * Some LLDs, like scsi, peek at the timeout to prevent a * command from being retried forever. 
*/ if (!req->timeout) req->timeout = q->rq_timeout; req->rq_flags &= ~RQF_TIMED_OUT; expiry = jiffies + req->timeout; WRITE_ONCE(req->deadline, expiry); /* * If the timer isn't already pending or this timeout is earlier * than an existing one, modify the timer. Round up to next nearest * second. */ expiry = blk_rq_timeout(blk_round_jiffies(expiry)); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { unsigned long diff = q->timeout.expires - expiry; /* * Due to added timer slack to group timers, the timer * will often be a little in front of what we asked for. * So apply some tolerance here too, otherwise we keep * modifying the timer because expires for value X * will be X + something. */ if (!timer_pending(&q->timeout) || (diff >= HZ / 2)) mod_timer(&q->timeout, expiry); } }
60 1 2 3 4 5 6 7 8 // SPDX-License-Identifier: GPL-2.0 #include <linux/static_call.h> long __static_call_return0(void) { return 0; } EXPORT_SYMBOL_GPL(__static_call_return0);
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * AppArmor security module
 *
 * This file contains AppArmor network mediation definitions.
 *
 * Copyright (C) 1998-2008 Novell/SUSE
 * Copyright 2009-2017 Canonical Ltd.
 */

#ifndef __AA_NET_H
#define __AA_NET_H

#include <net/sock.h>
#include <linux/path.h>

#include "apparmorfs.h"
#include "label.h"
#include "perms.h"
#include "policy.h"

/*
 * Network permission bits.  The first four are aliases of generic AppArmor
 * permission bits; the remaining ones are network specific bit values
 * defined here.
 */
#define AA_MAY_SEND		AA_MAY_WRITE
#define AA_MAY_RECEIVE		AA_MAY_READ

#define AA_MAY_SHUTDOWN		AA_MAY_DELETE

#define AA_MAY_CONNECT		AA_MAY_OPEN
#define AA_MAY_ACCEPT		0x00100000

#define AA_MAY_BIND		0x00200000
#define AA_MAY_LISTEN		0x00400000

#define AA_MAY_SETOPT		0x01000000
#define AA_MAY_GETOPT		0x02000000

/* Union of the permission bits grouped under the NET_PERMS_MASK name. */
#define NET_PERMS_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE |    \
			AA_MAY_SHUTDOWN | AA_MAY_BIND | AA_MAY_LISTEN |	  \
			AA_MAY_CONNECT | AA_MAY_ACCEPT | AA_MAY_SETATTR | \
			AA_MAY_GETATTR | AA_MAY_SETOPT | AA_MAY_GETOPT)

/*
 * Union of the permission bits grouped under the NET_FS_PERMS name.
 * NOTE(review): presumably the set used for sockets mediated through the
 * filesystem - confirm against the users of this mask.
 */
#define NET_FS_PERMS (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE |	\
		      AA_MAY_SHUTDOWN | AA_MAY_CONNECT | AA_MAY_RENAME |\
		      AA_MAY_SETATTR | AA_MAY_GETATTR | AA_MAY_CHMOD |	\
		      AA_MAY_CHOWN | AA_MAY_CHGRP | AA_MAY_LOCK |	\
		      AA_MAY_MPROT)

/* Union of the permission bits grouped under the NET_PEER_MASK name. */
#define NET_PEER_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CONNECT |	\
		       AA_MAY_ACCEPT)

/* Per-socket AppArmor security context, located via aa_sock(). */
struct aa_sk_ctx {
	struct aa_label __rcu *label;
	struct aa_label __rcu *peer;
	struct aa_label __rcu *peer_lastupdate;	/* ptr cmp only, no deref */
};

/*
 * aa_sock - get the AppArmor context of a socket
 * @sk: socket to look up
 *
 * Returns: @sk->sk_security advanced by apparmor_blob_sizes.lbs_sock.
 */
static inline struct aa_sk_ctx *aa_sock(const struct sock *sk)
{
	return sk->sk_security + apparmor_blob_sizes.lbs_sock;
}

/*
 * DEFINE_AUDIT_NET - declare and initialise network audit data
 *
 * Declares a struct lsm_network_audit named NAME_net holding SK and F, then
 * uses DEFINE_AUDIT_DATA() to declare NAME with class AA_CLASS_NET.  The
 * audit data type is LSM_AUDIT_DATA_NET when SK is non-NULL and the family
 * is not AF_UNIX, LSM_AUDIT_DATA_NONE otherwise.  Finally NAME is pointed at
 * NAME_net and its subj_cred, net.type and net.protocol fields are set.
 *
 * The expansion is a sequence of declarations and statements with no
 * trailing semicolon on the last one, so it must be used at block scope and
 * be followed by a ';'.  SK and F are evaluated more than once.
 */
#define DEFINE_AUDIT_NET(NAME, OP, CRED, SK, F, T, P)			  \
	struct lsm_network_audit NAME ## _net = { .sk = (SK),		  \
						  .family = (F)};	  \
	DEFINE_AUDIT_DATA(NAME,						  \
			  ((SK) && (F) != AF_UNIX) ? LSM_AUDIT_DATA_NET : \
						     LSM_AUDIT_DATA_NONE, \
			  AA_CLASS_NET,					  \
			  OP);						  \
	NAME.common.u.net = &(NAME ## _net);				  \
	NAME.subj_cred = (CRED);					  \
	NAME.net.type = (T);						  \
	NAME.net.protocol = (P)

/*
 * DEFINE_AUDIT_SK - DEFINE_AUDIT_NET() taking family, type and protocol
 * from the socket itself.  SK is dereferenced, so it must not be NULL.
 */
#define DEFINE_AUDIT_SK(NAME, OP, CRED, SK)				\
	DEFINE_AUDIT_NET(NAME, OP, CRED, SK, (SK)->sk_family, (SK)->sk_type, \
			 (SK)->sk_protocol)


struct aa_secmark {
	u8 audit;
	u8 deny;
	u32 secid;
	char *label;
};

extern struct aa_sfs_entry aa_sfs_entry_network[];
extern struct aa_sfs_entry aa_sfs_entry_networkv9[];

int aa_do_perms(struct aa_profile *profile, struct aa_policydb *policy,
		aa_state_t state, u32 request, struct aa_perms *p,
		struct apparmor_audit_data *ad);
/* passing in state returned by XXX_mediates_AF() */
aa_state_t aa_match_to_prot(struct aa_policydb *policy,
			    aa_state_t state, u32 request, u16 af, int type,
			    int protocol, struct aa_perms **p,
			    const char **info);
void audit_net_cb(struct audit_buffer *ab, void *va);
int aa_profile_af_perm(struct aa_profile *profile,
		       struct apparmor_audit_data *ad,
		       u32 request, u16 family, int type, int protocol);
int aa_af_perm(const struct cred *subj_cred, struct aa_label *label,
	       const char *op, u32 request, u16 family,
	       int type, int protocol);

/*
 * aa_profile_af_sk_perm - aa_profile_af_perm() with family, type and
 * protocol read from @sk.
 */
static inline int aa_profile_af_sk_perm(struct aa_profile *profile,
					struct apparmor_audit_data *ad,
					u32 request,
					struct sock *sk)
{
	return aa_profile_af_perm(profile, ad, request, sk->sk_family,
				  sk->sk_type, sk->sk_protocol);
}
int aa_sk_perm(const char *op, u32 request, struct sock *sk);

int aa_sock_file_perm(const struct cred *subj_cred, struct aa_label *label,
		      const char *op, u32 request,
		      struct file *file);

int apparmor_secmark_check(struct aa_label *label, char *op, u32 request,
			   u32 secid, const struct sock *sk);

#endif /* __AA_NET_H */
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Glue Code for assembler optimized version of 3DES
 *
 * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
 *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/algapi.h>
#include <crypto/des.h>
#include <crypto/internal/skcipher.h>
#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>

/* Transform context: separate expanded keys for encryption and decryption. */
struct des3_ede_x86_ctx {
	struct des3_ede_ctx enc;
	struct des3_ede_ctx dec;
};

/* regular block cipher functions */
asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst,
					  const u8 *src);

/* 3-way parallel cipher functions */
asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst,
					       const u8 *src);

/* Run one block through the cipher using the encryption key schedule. */
static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
				    const u8 *src)
{
	u32 *enc_ctx = ctx->enc.expkey;

	des3_ede_x86_64_crypt_blk(enc_ctx, dst, src);
}

/* Run one block through the cipher using the decryption key schedule. */
static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
				    const u8 *src)
{
	u32 *dec_ctx = ctx->dec.expkey;

	des3_ede_x86_64_crypt_blk(dec_ctx, dst, src);
}

/* Run three consecutive blocks through the 3-way routine, decryption keys. */
static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
					 const u8 *src)
{
	u32 *dec_ctx = ctx->dec.expkey;

	des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src);
}

/* Single-block encrypt hook for the plain "des3_ede" cipher. */
static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
	des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src);
}

/* Single-block decrypt hook for the plain "des3_ede" cipher. */
static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
	des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src);
}

/*
 * ECB over a request with the given expanded key.  The same routine serves
 * encryption and decryption; only the key schedule passed in differs.
 *
 * Returns the result of the last skcipher_walk_done() call, or of
 * skcipher_walk_virt() when there was nothing to walk.
 */
static int ecb_crypt(struct skcipher_request *req, const u32 *expkey)
{
	const unsigned int bsize = DES3_EDE_BLOCK_SIZE;
	struct skcipher_walk walk;
	unsigned int nbytes;
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	while ((nbytes = walk.nbytes)) {
		const u8 *wsrc = walk.src.virt.addr;
		u8 *wdst = walk.dst.virt.addr;

		/* Process three block batch */
		if (nbytes >= bsize * 3) {
			do {
				des3_ede_x86_64_crypt_blk_3way(expkey, wdst,
							       wsrc);

				wsrc += bsize * 3;
				wdst += bsize * 3;
				nbytes -= bsize * 3;
			} while (nbytes >= bsize * 3);

			/* Less than one whole block left in this step. */
			if (nbytes < bsize)
				goto done;
		}

		/* Handle leftovers */
		do {
			des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc);

			wsrc += bsize;
			wdst += bsize;
			nbytes -= bsize;
		} while (nbytes >= bsize);

done:
		/* Report the unprocessed remainder back to the walker. */
		err = skcipher_walk_done(&walk, nbytes);
	}

	return err;
}

static int ecb_encrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);

	return ecb_crypt(req, ctx->enc.expkey);
}

static int ecb_decrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);

	return ecb_crypt(req, ctx->dec.expkey);
}

/*
 * CBC-encrypt the whole blocks of one walk step, one block at a time.
 * Each block is XORed with the previous ciphertext block (the walk IV for
 * the first one) and encrypted in place in the destination buffer.  The last
 * ciphertext block is stored back into walk->iv.
 *
 * Returns the number of bytes of the step left unprocessed.
 */
static unsigned int __cbc_encrypt(struct des3_ede_x86_ctx *ctx,
				  struct skcipher_walk *walk)
{
	unsigned int bsize = DES3_EDE_BLOCK_SIZE;
	unsigned int nbytes = walk->nbytes;
	u64 *src = (u64 *)walk->src.virt.addr;
	u64 *dst = (u64 *)walk->dst.virt.addr;
	u64 *iv = (u64 *)walk->iv;

	do {
		*dst = *src ^ *iv;
		des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
		iv = dst;

		src += 1;
		dst += 1;
		nbytes -= bsize;
	} while (nbytes >= bsize);

	*(u64 *)walk->iv = *iv;
	return nbytes;
}

static int cbc_encrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct skcipher_walk walk;
	unsigned int nbytes;
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	while (walk.nbytes) {
		nbytes = __cbc_encrypt(ctx, &walk);
		err = skcipher_walk_done(&walk, nbytes);
	}

	return err;
}

/*
 * CBC-decrypt the whole blocks of one walk step.  Blocks are processed from
 * the last one towards the first, so that the ciphertext block needed as the
 * chaining value is still available at src - 1 when it is XORed in.  The
 * last ciphertext block is saved up front and becomes the new walk IV.
 *
 * Returns the number of bytes of the step left unprocessed.
 */
static unsigned int __cbc_decrypt(struct des3_ede_x86_ctx *ctx,
				  struct skcipher_walk *walk)
{
	unsigned int bsize = DES3_EDE_BLOCK_SIZE;
	unsigned int nbytes = walk->nbytes;
	u64 *src = (u64 *)walk->src.virt.addr;
	u64 *dst = (u64 *)walk->dst.virt.addr;
	u64 ivs[3 - 1];
	u64 last_iv;

	/* Start of the last block. */
	src += nbytes / bsize - 1;
	dst += nbytes / bsize - 1;

	last_iv = *src;

	/* Process three block batch */
	if (nbytes >= bsize * 3) {
		do {
			nbytes -= bsize * 3 - bsize;
			src -= 3 - 1;
			dst -= 3 - 1;

			/*
			 * Keep copies of the first two ciphertext blocks of
			 * the batch; they are the chaining values for the
			 * second and third block (presumably needed because
			 * src and dst may be the same buffer - confirm).
			 */
			ivs[0] = src[0];
			ivs[1] = src[1];

			des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);

			dst[1] ^= ivs[0];
			dst[2] ^= ivs[1];

			nbytes -= bsize;
			if (nbytes < bsize)
				goto done;

			/* Chain the first block of the batch, then step back. */
			*dst ^= *(src - 1);
			src -= 1;
			dst -= 1;
		} while (nbytes >= bsize * 3);
	}

	/* Handle leftovers */
	for (;;) {
		des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src);

		nbytes -= bsize;
		if (nbytes < bsize)
			break;

		*dst ^= *(src - 1);
		src -= 1;
		dst -= 1;
	}

done:
	/* The first block of the step is chained with the incoming IV. */
	*dst ^= *(u64 *)walk->iv;
	*(u64 *)walk->iv = last_iv;

	return nbytes;
}

static int cbc_decrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct skcipher_walk walk;
	unsigned int nbytes;
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	while (walk.nbytes) {
		nbytes = __cbc_decrypt(ctx, &walk);
		err = skcipher_walk_done(&walk, nbytes);
	}

	return err;
}

/*
 * Expand @key into the encryption context and derive the decryption context
 * from it.  -ENOKEY from des3_ede_expand_key() is turned into -EINVAL when
 * CRYPTO_TFM_REQ_FORBID_WEAK_KEYS is set on the transform and is otherwise
 * ignored.  On any remaining error the whole context is zeroed.
 */
static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key,
			       unsigned int keylen)
{
	struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm);
	u32 i, j, tmp;
	int err;

	err = des3_ede_expand_key(&ctx->enc, key, keylen);
	if (err == -ENOKEY) {
		if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)
			err = -EINVAL;
		else
			err = 0;
	}

	if (err) {
		memset(ctx, 0, sizeof(*ctx));
		return err;
	}

	/* Fix encryption context for this implementation and form decryption
	 * context. */
	j = DES3_EDE_EXPKEY_WORDS - 2;
	for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) {
		/* Odd words are rotated right by 4 bits in both schedules. */
		tmp = ror32(ctx->enc.expkey[i + 1], 4);
		ctx->enc.expkey[i + 1] = tmp;

		/* Decryption uses the same word pairs in reverse order. */
		ctx->dec.expkey[j + 0] = ctx->enc.expkey[i + 0];
		ctx->dec.expkey[j + 1] = tmp;
	}

	return 0;
}

static int des3_ede_x86_setkey_skcipher(struct crypto_skcipher *tfm,
					const u8 *key,
					unsigned int keylen)
{
	return des3_ede_x86_setkey(&tfm->base, key, keylen);
}

static struct crypto_alg des3_ede_cipher = {
	.cra_name		= "des3_ede",
	.cra_driver_name	= "des3_ede-asm",
	.cra_priority		= 200,
	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
	.cra_blocksize		= DES3_EDE_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct des3_ede_x86_ctx),
	.cra_module		= THIS_MODULE,
	.cra_u = {
		.cipher = {
			.cia_min_keysize	= DES3_EDE_KEY_SIZE,
			.cia_max_keysize	= DES3_EDE_KEY_SIZE,
			.cia_setkey		= des3_ede_x86_setkey,
			.cia_encrypt		= des3_ede_x86_encrypt,
			.cia_decrypt		= des3_ede_x86_decrypt,
		}
	}
};

static struct skcipher_alg des3_ede_skciphers[] = {
	{
		.base.cra_name		= "ecb(des3_ede)",
		.base.cra_driver_name	= "ecb-des3_ede-asm",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= DES3_EDE_BLOCK_SIZE,
		.base.cra_ctxsize	= sizeof(struct des3_ede_x86_ctx),
		.base.cra_module	= THIS_MODULE,
		.min_keysize		= DES3_EDE_KEY_SIZE,
		.max_keysize		= DES3_EDE_KEY_SIZE,
		.setkey			= des3_ede_x86_setkey_skcipher,
		.encrypt		= ecb_encrypt,
		.decrypt		= ecb_decrypt,
	}, {
		.base.cra_name		= "cbc(des3_ede)",
		.base.cra_driver_name	= "cbc-des3_ede-asm",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= DES3_EDE_BLOCK_SIZE,
		.base.cra_ctxsize	= sizeof(struct des3_ede_x86_ctx),
		.base.cra_module	= THIS_MODULE,
		.min_keysize		= DES3_EDE_KEY_SIZE,
		.max_keysize		= DES3_EDE_KEY_SIZE,
		.ivsize			= DES3_EDE_BLOCK_SIZE,
		.setkey			= des3_ede_x86_setkey_skcipher,
		.encrypt		= cbc_encrypt,
		.decrypt		= cbc_decrypt,
	}
};

/* True for Intel CPUs of family 0x0f, false for everything else. */
static bool is_blacklisted_cpu(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return false;

	if (boot_cpu_data.x86 == 0x0f) {
		/*
		 * On Pentium 4, des3_ede-x86_64 is slower than generic C
		 * implementation because use of 64bit rotates (which are really
		 * slow on P4). Therefore blacklist P4s.
		 */
		return true;
	}

	return false;
}

static int force;
module_param(force, int, 0);
MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");

/*
 * Register the single-block cipher and then the ECB/CBC skciphers.  If the
 * skcipher registration fails, the single-block cipher is unregistered
 * again.  Refuses to load on blacklisted CPUs unless "force" is set.
 */
static int __init des3_ede_x86_init(void)
{
	int err;

	if (!force && is_blacklisted_cpu()) {
		pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n");
		return -ENODEV;
	}

	err = crypto_register_alg(&des3_ede_cipher);
	if (err)
		return err;

	err = crypto_register_skciphers(des3_ede_skciphers,
					ARRAY_SIZE(des3_ede_skciphers));
	if (err)
		crypto_unregister_alg(&des3_ede_cipher);

	return err;
}

static void __exit des3_ede_x86_fini(void)
{
	crypto_unregister_alg(&des3_ede_cipher);
	crypto_unregister_skciphers(des3_ede_skciphers,
				    ARRAY_SIZE(des3_ede_skciphers));
}

module_init(des3_ede_x86_init);
module_exit(des3_ede_x86_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized");
MODULE_ALIAS_CRYPTO("des3_ede");
MODULE_ALIAS_CRYPTO("des3_ede-asm");
MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * bvec iterator
 *
 * Copyright (C) 2001 Ming Lei <ming.lei@canonical.com>
 */
#ifndef __LINUX_BVEC_H
#define __LINUX_BVEC_H

#include <linux/highmem.h>
#include <linux/bug.h>
#include <linux/errno.h>
#include <linux/limits.h>
#include <linux/minmax.h>
#include <linux/types.h>

struct page;

/**
 * struct bio_vec - a contiguous range of physical memory addresses
 * @bv_page:   First page associated with the address range.
 * @bv_len:    Number of bytes in the address range.
 * @bv_offset: Start of the address range relative to the start of @bv_page.
 *
 * All pages within a bio_vec starting from @bv_page are contiguous and
 * can simply be iterated (see bvec_advance()).
 */
struct bio_vec {
	struct page	*bv_page;
	unsigned int	bv_len;
	unsigned int	bv_offset;
};

/**
 * bvec_set_page - initialize a bvec based off a struct page
 * @bv:		bvec to initialize
 * @page:	page the bvec should point to
 * @len:	length of the bvec
 * @offset:	offset into the page
 */
static inline void bvec_set_page(struct bio_vec *bv, struct page *page,
		unsigned int len, unsigned int offset)
{
	bv->bv_page = page;
	bv->bv_len = len;
	bv->bv_offset = offset;
}

/**
 * bvec_set_folio - initialize a bvec based off a struct folio
 * @bv:		bvec to initialize
 * @folio:	folio the bvec should point to
 * @len:	length of the bvec
 * @offset:	offset into the folio
 *
 * The bvec points at the page of @folio that contains @offset, with the
 * in-page remainder of @offset as bv_offset.  Warns once if @len does not
 * fit in the unsigned int bv_len field.
 */
static inline void bvec_set_folio(struct bio_vec *bv, struct folio *folio,
		size_t len, size_t offset)
{
	unsigned long nr = offset / PAGE_SIZE;

	WARN_ON_ONCE(len > UINT_MAX);
	bvec_set_page(bv, folio_page(folio, nr), len, offset % PAGE_SIZE);
}

/**
 * bvec_set_virt - initialize a bvec based on a virtual address
 * @bv:		bvec to initialize
 * @vaddr:	virtual address to set the bvec to
 * @len:	length of the bvec
 */
static inline void bvec_set_virt(struct bio_vec *bv, void *vaddr,
		unsigned int len)
{
	bvec_set_page(bv, virt_to_page(vaddr), len, offset_in_page(vaddr));
}

struct bvec_iter {
	/*
	 * Current device address in 512 byte sectors. Only updated by the bio
	 * iter wrappers and not the bvec iterator helpers themselves.
	 */
	sector_t		bi_sector;

	/*
	 * Remaining size in bytes.
	 */
	unsigned int		bi_size;

	/*
	 * Current index into the bvec array. This indexes into `bi_io_vec` when
	 * iterating a bvec array that is part of a `bio`.
	 */
	unsigned int		bi_idx;

	/*
	 * Current offset in the bvec entry pointed to by `bi_idx`.
	 */
	unsigned int            bi_bvec_done;
} __packed __aligned(4);

/*
 * Iteration state used by bvec_init_iter_all() and bvec_advance():
 * @bv is the single-page segment currently exposed, @idx counts fully
 * consumed source bvecs and @done is the number of bytes already handed out
 * from the current source bvec.
 */
struct bvec_iter_all {
	struct bio_vec	bv;
	int		idx;
	unsigned	done;
};

/*
 * various member access, note that bio_data should of course not be used
 * on highmem page vectors
 */
#define __bvec_iter_bvec(bvec, iter)	(&(bvec)[(iter).bi_idx])

/* multi-page (mp_bvec) helpers */
#define mp_bvec_iter_page(bvec, iter)				\
	(__bvec_iter_bvec((bvec), (iter))->bv_page)

#define mp_bvec_iter_len(bvec, iter)				\
	min((iter).bi_size,					\
	    __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)

#define mp_bvec_iter_offset(bvec, iter)				\
	(__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)

#define mp_bvec_iter_page_idx(bvec, iter)			\
	(mp_bvec_iter_offset((bvec), (iter)) / PAGE_SIZE)

#define mp_bvec_iter_bvec(bvec, iter)				\
((struct bio_vec) {						\
	.bv_page	= mp_bvec_iter_page((bvec), (iter)),	\
	.bv_len		= mp_bvec_iter_len((bvec), (iter)),	\
	.bv_offset	= mp_bvec_iter_offset((bvec), (iter)),	\
})

/* For building single-page bvec in flight */
 #define bvec_iter_offset(bvec, iter)				\
	(mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE)

#define bvec_iter_len(bvec, iter)				\
	min_t(unsigned, mp_bvec_iter_len((bvec), (iter)),		\
	      PAGE_SIZE - bvec_iter_offset((bvec), (iter)))

#define bvec_iter_page(bvec, iter)				\
	(mp_bvec_iter_page((bvec), (iter)) +			\
	 mp_bvec_iter_page_idx((bvec), (iter)))

#define bvec_iter_bvec(bvec, iter)				\
((struct bio_vec) {						\
	.bv_page	= bvec_iter_page((bvec), (iter)),	\
	.bv_len		= bvec_iter_len((bvec), (iter)),	\
	.bv_offset	= bvec_iter_offset((bvec), (iter)),	\
})

/*
 * Advance @iter by @bytes over the bvec array @bv, crossing as many entries
 * as needed.
 *
 * Returns false, after a one-time warning and with bi_size forced to 0, if
 * @bytes exceeds the remaining size; returns true otherwise.
 */
static inline bool bvec_iter_advance(const struct bio_vec *bv,
		struct bvec_iter *iter, unsigned bytes)
{
	unsigned int idx = iter->bi_idx;

	if (WARN_ONCE(bytes > iter->bi_size,
		     "Attempted to advance past end of bvec iter\n")) {
		iter->bi_size = 0;
		return false;
	}

	iter->bi_size -= bytes;
	/* Count from the start of the current entry. */
	bytes += iter->bi_bvec_done;

	while (bytes && bytes >= bv[idx].bv_len) {
		bytes -= bv[idx].bv_len;
		idx++;
	}

	iter->bi_idx = idx;
	iter->bi_bvec_done = bytes;
	return true;
}

/*
 * A simpler version of bvec_iter_advance(), @bytes should not span
 * across multiple bvec entries, i.e. bytes <= bv[i->bi_idx].bv_len
 */
static inline void bvec_iter_advance_single(const struct bio_vec *bv,
				struct bvec_iter *iter, unsigned int bytes)
{
	unsigned int done = iter->bi_bvec_done + bytes;

	/* Current entry fully consumed: move on to the next one. */
	if (done == bv[iter->bi_idx].bv_len) {
		done = 0;
		iter->bi_idx++;
	}
	iter->bi_bvec_done = done;
	iter->bi_size -= bytes;
}

/* Iterate @bio_vec in single-page segments, each copied into @bvl. */
#define for_each_bvec(bvl, bio_vec, iter, start)			\
	for (iter = (start);						\
	     (iter).bi_size &&						\
		((bvl = bvec_iter_bvec((bio_vec), (iter))), 1);	\
	     bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len))

/* Iterate @bio_vec in multi-page segments, each copied into @bvl. */
#define for_each_mp_bvec(bvl, bio_vec, iter, start)			\
	for (iter = (start);						\
	     (iter).bi_size &&						\
		((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1);	\
	     bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len))

/* for iterating one bio from start to end */
#define BVEC_ITER_ALL_INIT (struct bvec_iter)				\
{									\
	.bi_sector	= 0,						\
	.bi_size	= UINT_MAX,					\
	.bi_idx		= 0,						\
	.bi_bvec_done	= 0,						\
}

/*
 * Reset @iter_all to the start of an iteration and return a pointer to its
 * embedded bio_vec, which bvec_advance() fills in.
 */
static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all)
{
	iter_all->done = 0;
	iter_all->idx = 0;

	return &iter_all->bv;
}

/*
 * Set iter_all->bv to the next single-page segment of @bvec.  The first call
 * for a given @bvec (done == 0) starts at the page containing bv_offset;
 * later calls step to the following page at offset 0.  When the whole of
 * @bvec has been handed out, idx is incremented and done is reset to 0.
 */
static inline void bvec_advance(const struct bio_vec *bvec,
				struct bvec_iter_all *iter_all)
{
	struct bio_vec *bv = &iter_all->bv;

	if (iter_all->done) {
		bv->bv_page++;
		bv->bv_offset = 0;
	} else {
		bv->bv_page = bvec->bv_page + (bvec->bv_offset >> PAGE_SHIFT);
		bv->bv_offset = bvec->bv_offset & ~PAGE_MASK;
	}
	/* Clamp to the end of the page and to what is left of @bvec. */
	bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset,
			   bvec->bv_len - iter_all->done);
	iter_all->done += bv->bv_len;

	if (iter_all->done == bvec->bv_len) {
		iter_all->idx++;
		iter_all->done = 0;
	}
}

/**
 * bvec_kmap_local - map a bvec into the kernel virtual address space
 * @bvec: bvec to map
 *
 * Must be called on single-page bvecs only.  Call kunmap_local on the returned
 * address to unmap.
 */
static inline void *bvec_kmap_local(struct bio_vec *bvec)
{
	return kmap_local_page(bvec->bv_page) + bvec->bv_offset;
}

/**
 * memcpy_from_bvec - copy data from a bvec
 * @to: destination buffer, must have room for @bvec->bv_len bytes
 * @bvec: bvec to copy from
 *
 * Must be called on single-page bvecs only.
 */
static inline void memcpy_from_bvec(char *to, struct bio_vec *bvec)
{
	memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, bvec->bv_len);
}

/**
 * memcpy_to_bvec - copy data to a bvec
 * @bvec: bvec to copy to
 * @from: source buffer, @bvec->bv_len bytes are read from it
 *
 * Must be called on single-page bvecs only.
 */
static inline void memcpy_to_bvec(struct bio_vec *bvec, const char *from)
{
	memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, bvec->bv_len);
}

/**
 * memzero_bvec - zero all data in a bvec
 * @bvec: bvec to zero
 *
 * Must be called on single-page bvecs only.
 */
static inline void memzero_bvec(struct bio_vec *bvec)
{
	memzero_page(bvec->bv_page, bvec->bv_offset, bvec->bv_len);
}

/**
 * bvec_virt - return the virtual address for a bvec
 * @bvec: bvec to return the virtual address for
 *
 * Note: the caller must ensure that @bvec->bv_page is not a highmem page.
 */
static inline void *bvec_virt(struct bio_vec *bvec)
{
	WARN_ON_ONCE(PageHighMem(bvec->bv_page));
	return page_address(bvec->bv_page) + bvec->bv_offset;
}

/**
 * bvec_phys - return the physical address for a bvec
 * @bvec: bvec to return the physical address for
 */
static inline phys_addr_t bvec_phys(const struct bio_vec *bvec)
{
	return page_to_phys(bvec->bv_page) + bvec->bv_offset;
}

#endif /* __LINUX_BVEC_H */
4 4 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 // SPDX-License-Identifier: GPL-2.0-only #include <linux/net_tstamp.h> #include <linux/ptp_clock_kernel.h> #include "netlink.h" #include "common.h" #include "bitset.h" #include "../core/dev.h" #include "ts.h" struct tsconfig_req_info { struct ethnl_req_info base; }; struct 
tsconfig_reply_data { struct ethnl_reply_data base; struct hwtstamp_provider_desc hwprov_desc; struct { u32 tx_type; u32 rx_filter; u32 flags; } hwtst_config; }; #define TSCONFIG_REPDATA(__reply_base) \ container_of(__reply_base, struct tsconfig_reply_data, base) const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1] = { [ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int tsconfig_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); struct hwtstamp_provider *hwprov = NULL; struct net_device *dev = reply_base->dev; struct kernel_hwtstamp_config cfg = {}; int ret; if (!dev->netdev_ops->ndo_hwtstamp_get) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = dev_get_hwtstamp_phylib(dev, &cfg); if (ret) goto out; data->hwtst_config.tx_type = BIT(cfg.tx_type); data->hwtst_config.rx_filter = BIT(cfg.rx_filter); data->hwtst_config.flags = cfg.flags; data->hwprov_desc.index = -1; hwprov = rtnl_dereference(dev->hwprov); if (hwprov) { data->hwprov_desc.index = hwprov->desc.index; data->hwprov_desc.qualifier = hwprov->desc.qualifier; } else { struct kernel_ethtool_ts_info ts_info = {}; ts_info.phc_index = -1; ret = __ethtool_get_ts_info(dev, &ts_info); if (ret) goto out; if (ts_info.phc_index == -1) return -ENODEV; data->hwprov_desc.index = ts_info.phc_index; data->hwprov_desc.qualifier = ts_info.phc_qualifier; } out: ethnl_ops_complete(dev); return ret; } static int tsconfig_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int len = 0; int ret; BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32); if 
(data->hwtst_config.flags) { ret = ethnl_bitset32_size(&data->hwtst_config.flags, NULL, __HWTSTAMP_FLAG_CNT, ts_flags_names, compact); if (ret < 0) return ret; len += ret; /* _TSCONFIG_HWTSTAMP_FLAGS */ } if (data->hwtst_config.tx_type) { ret = ethnl_bitset32_size(&data->hwtst_config.tx_type, NULL, __HWTSTAMP_TX_CNT, ts_tx_type_names, compact); if (ret < 0) return ret; len += ret; /* _TSCONFIG_TX_TYPES */ } if (data->hwtst_config.rx_filter) { ret = ethnl_bitset32_size(&data->hwtst_config.rx_filter, NULL, __HWTSTAMP_FILTER_CNT, ts_rx_filter_names, compact); if (ret < 0) return ret; len += ret; /* _TSCONFIG_RX_FILTERS */ } if (data->hwprov_desc.index >= 0) /* _TSCONFIG_HWTSTAMP_PROVIDER */ len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32)); return len; } static int tsconfig_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int ret; if (data->hwtst_config.flags) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, &data->hwtst_config.flags, NULL, __HWTSTAMP_FLAG_CNT, ts_flags_names, compact); if (ret < 0) return ret; } if (data->hwtst_config.tx_type) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_TX_TYPES, &data->hwtst_config.tx_type, NULL, __HWTSTAMP_TX_CNT, ts_tx_type_names, compact); if (ret < 0) return ret; } if (data->hwtst_config.rx_filter) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_RX_FILTERS, &data->hwtst_config.rx_filter, NULL, __HWTSTAMP_FILTER_CNT, ts_rx_filter_names, compact); if (ret < 0) return ret; } if (data->hwprov_desc.index >= 0) { struct nlattr *nest; nest = nla_nest_start(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, data->hwprov_desc.index) || nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, data->hwprov_desc.qualifier)) { 
nla_nest_cancel(skb, nest); return -EMSGSIZE; } nla_nest_end(skb, nest); } return 0; } /* TSCONFIG_SET */ const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1] = { [ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER] = NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_NESTED }, [ETHTOOL_A_TSCONFIG_RX_FILTERS] = { .type = NLA_NESTED }, [ETHTOOL_A_TSCONFIG_TX_TYPES] = { .type = NLA_NESTED }, }; static int tsconfig_send_reply(struct net_device *dev, struct genl_info *info) { struct tsconfig_reply_data *reply_data; struct tsconfig_req_info *req_info; struct sk_buff *rskb; void *reply_payload; int reply_len = 0; int ret; req_info = kzalloc_obj(*req_info); if (!req_info) return -ENOMEM; reply_data = kmalloc_obj(*reply_data); if (!reply_data) { kfree(req_info); return -ENOMEM; } ASSERT_RTNL(); reply_data->base.dev = dev; ret = tsconfig_prepare_data(&req_info->base, &reply_data->base, info); if (ret < 0) goto err_cleanup; ret = tsconfig_reply_size(&req_info->base, &reply_data->base); if (ret < 0) goto err_cleanup; reply_len = ret + ethnl_reply_header_size(); rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_TSCONFIG_SET_REPLY, ETHTOOL_A_TSCONFIG_HEADER, info, &reply_payload); if (!rskb) goto err_cleanup; ret = tsconfig_fill_reply(rskb, &req_info->base, &reply_data->base); if (ret < 0) goto err_cleanup; genlmsg_end(rskb, reply_payload); ret = genlmsg_reply(rskb, info); err_cleanup: kfree(reply_data); kfree(req_info); return ret; } static int ethnl_set_tsconfig_validate(struct ethnl_req_info *req_base, struct genl_info *info) { const struct net_device_ops *ops = req_base->dev->netdev_ops; if (!ops->ndo_hwtstamp_set || !ops->ndo_hwtstamp_get) return -EOPNOTSUPP; return 1; } static struct hwtstamp_provider * tsconfig_set_hwprov_from_desc(struct net_device *dev, struct genl_info *info, struct hwtstamp_provider_desc *hwprov_desc) { struct 
kernel_ethtool_ts_info ts_info;
	struct hwtstamp_provider *hwprov;
	struct nlattr **tb = info->attrs;
	struct phy_device *phy = NULL;
	enum hwtstamp_source source;
	int ret;

	ret = ethtool_net_get_ts_info_by_phc(dev, &ts_info, hwprov_desc);
	if (!ret) {
		/* Found */
		source = HWTSTAMP_SOURCE_NETDEV;
	} else {
		phy = ethtool_phy_get_ts_info_by_phc(dev, &ts_info, hwprov_desc);
		if (IS_ERR(phy)) {
			if (PTR_ERR(phy) == -ENODEV)
				NL_SET_ERR_MSG_ATTR(info->extack,
						    tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER],
						    "phc not in this net device topology");
			return ERR_CAST(phy);
		}

		source = HWTSTAMP_SOURCE_PHYLIB;
	}

	hwprov = kzalloc_obj(*hwprov);
	if (!hwprov)
		return ERR_PTR(-ENOMEM);

	hwprov->desc.index = hwprov_desc->index;
	hwprov->desc.qualifier = hwprov_desc->qualifier;
	hwprov->source = source;
	hwprov->phydev = phy;

	return hwprov;
}

/*
 * ethnl_set_tsconfig - apply a TSCONFIG_SET request to a net device
 * @req_base: parsed request header; only req_base->dev is used here
 * @info: generic netlink info carrying the attributes and extack
 *
 * Optionally switches the hardware timestamp provider and/or updates the
 * tx type, rx filter and flags of the hwtstamp configuration, then sends
 * the resulting configuration back through tsconfig_send_reply().
 *
 * Returns 0 on success (this command emits no notification) or a
 * negative errno.
 */
static int ethnl_set_tsconfig(struct ethnl_req_info *req_base,
			      struct genl_info *info)
{
	struct kernel_hwtstamp_config hwtst_config = {0};
	bool hwprov_mod = false, config_mod = false;
	struct hwtstamp_provider *hwprov = NULL;
	struct net_device *dev = req_base->dev;
	struct nlattr **tb = info->attrs;
	int ret;

	/* The tx type, rx filter and flags are each handled as a 32-bit
	 * bitset below, so every enum must fit in 32 bits.
	 */
	BUILD_BUG_ON(__HWTSTAMP_TX_CNT >= 32);
	BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT >= 32);
	BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32);

	if (!netif_device_present(dev))
		return -ENODEV;

	if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER]) {
		/* index -1 is the starting value when no provider is set */
		struct hwtstamp_provider_desc __hwprov_desc = {.index = -1};
		struct hwtstamp_provider *__hwprov;

		/* Seed the descriptor with the current provider so that the
		 * parser can report through hwprov_mod whether the request
		 * actually changes it.
		 */
		__hwprov = rtnl_dereference(dev->hwprov);
		if (__hwprov) {
			__hwprov_desc.index = __hwprov->desc.index;
			__hwprov_desc.qualifier = __hwprov->desc.qualifier;
		}

		ret = ts_parse_hwtst_provider(tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER],
					      &__hwprov_desc, info->extack,
					      &hwprov_mod);
		if (ret < 0)
			return ret;

		if (hwprov_mod) {
			/* Allocated here, installed in dev->hwprov further
			 * down; until then it is freed at err_free_hwprov.
			 */
			hwprov = tsconfig_set_hwprov_from_desc(dev, info,
							       &__hwprov_desc);
			if (IS_ERR(hwprov))
				return PTR_ERR(hwprov);
		}
	}

	/* Get current hwtstamp config if we are not changing the
	 * hwtstamp source. It will be zeroed in the other case.
	 */
	if (!hwprov_mod) {
		ret = dev_get_hwtstamp_phylib(dev, &hwtst_config);
		/* -EOPNOTSUPP is tolerated: continue with the zeroed config */
		if (ret < 0 && ret != -EOPNOTSUPP)
			goto err_free_hwprov;
	}

	/* Get the hwtstamp config from netlink */
	if (tb[ETHTOOL_A_TSCONFIG_TX_TYPES]) {
		u32 req_tx_type;

		/* Start from the current tx type expressed as a one-bit mask */
		req_tx_type = BIT(hwtst_config.tx_type);
		ret = ethnl_update_bitset32(&req_tx_type,
					    __HWTSTAMP_TX_CNT,
					    tb[ETHTOOL_A_TSCONFIG_TX_TYPES],
					    ts_tx_type_names, info->extack,
					    &config_mod);
		if (ret < 0)
			goto err_free_hwprov;

		/* Select only one tx type at a time */
		/* NOTE(review): if ffs()/fls() both return 0 for an empty
		 * mask, this test also passes and tx_type becomes -1;
		 * presumably rejected by net_hwtstamp_validate() below -
		 * confirm.
		 */
		if (ffs(req_tx_type) != fls(req_tx_type)) {
			ret = -EINVAL;
			goto err_free_hwprov;
		}

		hwtst_config.tx_type = ffs(req_tx_type) - 1;
	}

	if (tb[ETHTOOL_A_TSCONFIG_RX_FILTERS]) {
		u32 req_rx_filter;

		/* Same scheme as the tx type: one-bit mask in, one bit out */
		req_rx_filter = BIT(hwtst_config.rx_filter);
		ret = ethnl_update_bitset32(&req_rx_filter,
					    __HWTSTAMP_FILTER_CNT,
					    tb[ETHTOOL_A_TSCONFIG_RX_FILTERS],
					    ts_rx_filter_names, info->extack,
					    &config_mod);
		if (ret < 0)
			goto err_free_hwprov;

		/* Select only one rx filter at a time */
		if (ffs(req_rx_filter) != fls(req_rx_filter)) {
			ret = -EINVAL;
			goto err_free_hwprov;
		}

		hwtst_config.rx_filter = ffs(req_rx_filter) - 1;
	}

	if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS]) {
		/* Flags are a real bitmask, updated in place */
		ret = ethnl_update_bitset32(&hwtst_config.flags,
					    __HWTSTAMP_FLAG_CNT,
					    tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS],
					    ts_flags_names, info->extack,
					    &config_mod);
		if (ret < 0)
			goto err_free_hwprov;
	}

	ret = net_hwtstamp_validate(&hwtst_config);
	if (ret)
		goto err_free_hwprov;

	if (hwprov_mod) {
		struct kernel_hwtstamp_config zero_config = {0};
		struct hwtstamp_provider *__hwprov;

		/* Disable current time stamping if we try to enable
		 * another one
		 */
		ret = dev_set_hwtstamp_phylib(dev, &zero_config, info->extack);
		if (ret < 0)
			goto err_free_hwprov;

		/* Change the selected hwtstamp source */
		__hwprov = rcu_replace_pointer_rtnl(dev->hwprov, hwprov);
		if (__hwprov)
			kfree_rcu(__hwprov, rcu_head);
	}

	if (config_mod) {
		ret = dev_set_hwtstamp_phylib(dev, &hwtst_config,
					      info->extack);
		/* Plain return, not err_free_hwprov: hwprov is either NULL
		 * or already stored in dev->hwprov by the block above.
		 */
		if (ret < 0)
			return ret;
	}

	ret = tsconfig_send_reply(dev, info);
	if (ret && ret != -EOPNOTSUPP) {
		NL_SET_ERR_MSG(info->extack,
			       "error while reading the new configuration set");
		return ret;
	}

	/* tsconfig has no notification */
	return 0;

err_free_hwprov:
	/* Only reached before hwprov has been stored in dev->hwprov */
	kfree(hwprov);

	return ret;
}

/* Request operations table wiring the TSCONFIG GET and SET handlers */
const struct ethnl_request_ops ethnl_tsconfig_request_ops = {
	.request_cmd		= ETHTOOL_MSG_TSCONFIG_GET,
	.reply_cmd		= ETHTOOL_MSG_TSCONFIG_GET_REPLY,
	.hdr_attr		= ETHTOOL_A_TSCONFIG_HEADER,
	.req_info_size		= sizeof(struct tsconfig_req_info),
	.reply_data_size	= sizeof(struct tsconfig_reply_data),

	.prepare_data		= tsconfig_prepare_data,
	.reply_size		= tsconfig_reply_size,
	.fill_reply		= tsconfig_fill_reply,

	.set_validate		= ethnl_set_tsconfig_validate,
	.set			= ethnl_set_tsconfig,
};
2 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. 
*/ #include <linux/skbuff.h> #include <linux/delay.h> #include <linux/sched.h> #include <linux/vmalloc.h> #include <rdma/uverbs_ioctl.h> #include "rxe.h" #include "rxe_loc.h" #include "rxe_queue.h" #include "rxe_task.h" #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * lockdep can detect false positive circular dependencies * when there are user-space socket API users or in kernel * users switching between a tcp and rdma transport. * Maybe also switching between siw and rxe may cause * problems as per default sockets are only classified * by family and not by ip protocol. And there might * be different locks used between the application * and the low level sockets. * * Problems were seen with ksmbd.ko and cifs.ko, * switching transports, use git blame to find * more details. */ static struct lock_class_key rxe_send_sk_key[2]; static struct lock_class_key rxe_send_slock_key[2]; #endif /* CONFIG_DEBUG_LOCK_ALLOC */ static inline void rxe_reclassify_send_socket(struct socket *sock) { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct sock *sk = sock->sk; if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) return; switch (sk->sk_family) { case AF_INET: sock_lock_init_class_and_name(sk, "slock-AF_INET-RDMA-RXE-SEND", &rxe_send_slock_key[0], "sk_lock-AF_INET-RDMA-RXE-SEND", &rxe_send_sk_key[0]); break; case AF_INET6: sock_lock_init_class_and_name(sk, "slock-AF_INET6-RDMA-RXE-SEND", &rxe_send_slock_key[1], "sk_lock-AF_INET6-RDMA-RXE-SEND", &rxe_send_sk_key[1]); break; default: WARN_ON_ONCE(1); } #endif /* CONFIG_DEBUG_LOCK_ALLOC */ } static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, int has_srq) { if (cap->max_send_wr > rxe->attr.max_qp_wr) { rxe_dbg_dev(rxe, "invalid send wr = %u > %d\n", cap->max_send_wr, rxe->attr.max_qp_wr); goto err1; } if (cap->max_send_sge > rxe->attr.max_send_sge) { rxe_dbg_dev(rxe, "invalid send sge = %u > %d\n", cap->max_send_sge, rxe->attr.max_send_sge); goto err1; } if (!has_srq) { if (cap->max_recv_wr > rxe->attr.max_qp_wr) { rxe_dbg_dev(rxe, 
"invalid recv wr = %u > %d\n", cap->max_recv_wr, rxe->attr.max_qp_wr); goto err1; } if (cap->max_recv_sge > rxe->attr.max_recv_sge) { rxe_dbg_dev(rxe, "invalid recv sge = %u > %d\n", cap->max_recv_sge, rxe->attr.max_recv_sge); goto err1; } } if (cap->max_inline_data > rxe->max_inline_data) { rxe_dbg_dev(rxe, "invalid max inline data = %u > %d\n", cap->max_inline_data, rxe->max_inline_data); goto err1; } return 0; err1: return -EINVAL; } int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) { struct ib_qp_cap *cap = &init->cap; struct rxe_port *port; int port_num = init->port_num; switch (init->qp_type) { case IB_QPT_GSI: case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: break; default: return -EOPNOTSUPP; } if (!init->recv_cq || !init->send_cq) { rxe_dbg_dev(rxe, "missing cq\n"); goto err1; } if (rxe_qp_chk_cap(rxe, cap, !!init->srq)) goto err1; if (init->qp_type == IB_QPT_GSI) { if (!rdma_is_port_valid(&rxe->ib_dev, port_num)) { rxe_dbg_dev(rxe, "invalid port = %d\n", port_num); goto err1; } port = &rxe->port; if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) { rxe_dbg_dev(rxe, "GSI QP exists for port %d\n", port_num); goto err1; } } return 0; err1: return -EINVAL; } static int alloc_rd_atomic_resources(struct rxe_qp *qp, unsigned int n) { qp->resp.res_head = 0; qp->resp.res_tail = 0; qp->resp.resources = kzalloc_objs(struct resp_res, n); if (!qp->resp.resources) return -ENOMEM; return 0; } static void free_rd_atomic_resources(struct rxe_qp *qp) { if (qp->resp.resources) { int i; for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { struct resp_res *res = &qp->resp.resources[i]; free_rd_atomic_resource(res); } kfree(qp->resp.resources); qp->resp.resources = NULL; } } void free_rd_atomic_resource(struct resp_res *res) { res->type = 0; } static void cleanup_rd_atomic_resources(struct rxe_qp *qp) { int i; struct resp_res *res; if (qp->resp.resources) { for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { res = &qp->resp.resources[i]; 
free_rd_atomic_resource(res); } } } static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_init_attr *init) { struct rxe_port *port; u32 qpn; qp->sq_sig_type = init->sq_sig_type; qp->attr.path_mtu = 1; qp->mtu = ib_mtu_enum_to_int(qp->attr.path_mtu); qpn = qp->elem.index; port = &rxe->port; switch (init->qp_type) { case IB_QPT_GSI: qp->ibqp.qp_num = 1; port->qp_gsi_index = qpn; qp->attr.port_num = init->port_num; break; default: qp->ibqp.qp_num = qpn; break; } spin_lock_init(&qp->state_lock); spin_lock_init(&qp->sq.sq_lock); spin_lock_init(&qp->rq.producer_lock); spin_lock_init(&qp->rq.consumer_lock); skb_queue_head_init(&qp->req_pkts); skb_queue_head_init(&qp->resp_pkts); atomic_set(&qp->ssn, 0); atomic_set(&qp->skb_out, 0); } static int rxe_init_sq(struct rxe_qp *qp, struct ib_qp_init_attr *init, struct ib_udata *udata, struct rxe_create_qp_resp __user *uresp) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); int wqe_size; int err; qp->sq.max_wr = init->cap.max_send_wr; wqe_size = max_t(int, init->cap.max_send_sge * sizeof(struct ib_sge), init->cap.max_inline_data); qp->sq.max_sge = wqe_size / sizeof(struct ib_sge); qp->sq.max_inline = wqe_size; wqe_size += sizeof(struct rxe_send_wqe); qp->sq.queue = rxe_queue_init(rxe, &qp->sq.max_wr, wqe_size, QUEUE_TYPE_FROM_CLIENT); if (!qp->sq.queue) { rxe_err_qp(qp, "Unable to allocate send queue\n"); err = -ENOMEM; goto err_out; } /* prepare info for caller to mmap send queue if user space qp */ err = do_mmap_info(rxe, uresp ? 
&uresp->sq_mi : NULL, udata, qp->sq.queue->buf, qp->sq.queue->buf_size, &qp->sq.queue->ip); if (err) { rxe_err_qp(qp, "do_mmap_info failed, err = %d\n", err); goto err_free; } /* return actual capabilities to caller which may be larger * than requested */ init->cap.max_send_wr = qp->sq.max_wr; init->cap.max_send_sge = qp->sq.max_sge; init->cap.max_inline_data = qp->sq.max_inline; return 0; err_free: vfree(qp->sq.queue->buf); kfree(qp->sq.queue); qp->sq.queue = NULL; err_out: return err; } static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_init_attr *init, struct ib_udata *udata, struct rxe_create_qp_resp __user *uresp) { int err; /* if we don't finish qp create make sure queue is valid */ skb_queue_head_init(&qp->req_pkts); err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); if (err < 0) return err; rxe_reclassify_send_socket(qp->sk); qp->sk->sk->sk_user_data = qp; /* pick a source UDP port number for this QP based on * the source QPN. this spreads traffic for different QPs * across different NIC RX queues (while using a single * flow for a given QP to maintain packet order). * the port number must be in the Dynamic Ports range * (0xc000 - 0xffff). 
*/ qp->src_port = RXE_ROCE_V2_SPORT + (hash_32(qp_num(qp), 14) & 0x3fff); err = rxe_init_sq(qp, init, udata, uresp); if (err) return err; qp->req.wqe_index = queue_get_producer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT); qp->req.opcode = -1; qp->comp.opcode = -1; rxe_init_task(&qp->send_task, qp, rxe_sender); qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */ if (init->qp_type == IB_QPT_RC) { timer_setup(&qp->rnr_nak_timer, rnr_nak_timer, 0); timer_setup(&qp->retrans_timer, retransmit_timer, 0); } return 0; } static int rxe_init_rq(struct rxe_qp *qp, struct ib_qp_init_attr *init, struct ib_udata *udata, struct rxe_create_qp_resp __user *uresp) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); int wqe_size; int err; qp->rq.max_wr = init->cap.max_recv_wr; qp->rq.max_sge = init->cap.max_recv_sge; wqe_size = sizeof(struct rxe_recv_wqe) + qp->rq.max_sge*sizeof(struct ib_sge); qp->rq.queue = rxe_queue_init(rxe, &qp->rq.max_wr, wqe_size, QUEUE_TYPE_FROM_CLIENT); if (!qp->rq.queue) { rxe_err_qp(qp, "Unable to allocate recv queue\n"); err = -ENOMEM; goto err_out; } /* prepare info for caller to mmap recv queue if user space qp */ err = do_mmap_info(rxe, uresp ? 
&uresp->rq_mi : NULL, udata, qp->rq.queue->buf, qp->rq.queue->buf_size, &qp->rq.queue->ip); if (err) { rxe_err_qp(qp, "do_mmap_info failed, err = %d\n", err); goto err_free; } /* return actual capabilities to caller which may be larger * than requested */ init->cap.max_recv_wr = qp->rq.max_wr; return 0; err_free: vfree(qp->rq.queue->buf); kfree(qp->rq.queue); qp->rq.queue = NULL; err_out: return err; } static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_init_attr *init, struct ib_udata *udata, struct rxe_create_qp_resp __user *uresp) { int err; /* if we don't finish qp create make sure queue is valid */ skb_queue_head_init(&qp->resp_pkts); if (!qp->srq) { err = rxe_init_rq(qp, init, udata, uresp); if (err) return err; } rxe_init_task(&qp->recv_task, qp, rxe_receiver); qp->resp.opcode = OPCODE_NONE; qp->resp.msn = 0; return 0; } /* called by the create qp verb */ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, struct ib_qp_init_attr *init, struct rxe_create_qp_resp __user *uresp, struct ib_pd *ibpd, struct ib_udata *udata) { int err; struct rxe_cq *rcq = to_rcq(init->recv_cq); struct rxe_cq *scq = to_rcq(init->send_cq); struct rxe_srq *srq = init->srq ? 
to_rsrq(init->srq) : NULL; unsigned long flags; rxe_get(pd); rxe_get(rcq); rxe_get(scq); if (srq) rxe_get(srq); qp->pd = pd; qp->rcq = rcq; qp->scq = scq; qp->srq = srq; atomic_inc(&rcq->num_wq); atomic_inc(&scq->num_wq); rxe_qp_init_misc(rxe, qp, init); err = rxe_qp_init_req(rxe, qp, init, udata, uresp); if (err) goto err1; err = rxe_qp_init_resp(rxe, qp, init, udata, uresp); if (err) goto err2; spin_lock_irqsave(&qp->state_lock, flags); qp->attr.qp_state = IB_QPS_RESET; qp->valid = 1; spin_unlock_irqrestore(&qp->state_lock, flags); return 0; err2: rxe_queue_cleanup(qp->sq.queue); qp->sq.queue = NULL; err1: atomic_dec(&rcq->num_wq); atomic_dec(&scq->num_wq); qp->pd = NULL; qp->rcq = NULL; qp->scq = NULL; qp->srq = NULL; if (srq) rxe_put(srq); rxe_put(scq); rxe_put(rcq); rxe_put(pd); return err; } /* called by the query qp verb */ int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init) { init->event_handler = qp->ibqp.event_handler; init->qp_context = qp->ibqp.qp_context; init->send_cq = qp->ibqp.send_cq; init->recv_cq = qp->ibqp.recv_cq; init->srq = qp->ibqp.srq; init->cap.max_send_wr = qp->sq.max_wr; init->cap.max_send_sge = qp->sq.max_sge; init->cap.max_inline_data = qp->sq.max_inline; if (!qp->srq) { init->cap.max_recv_wr = qp->rq.max_wr; init->cap.max_recv_sge = qp->rq.max_sge; } init->sq_sig_type = qp->sq_sig_type; init->qp_type = qp->ibqp.qp_type; init->port_num = 1; return 0; } int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { if (mask & IB_QP_PORT) { if (!rdma_is_port_valid(&rxe->ib_dev, attr->port_num)) { rxe_dbg_qp(qp, "invalid port %d\n", attr->port_num); goto err1; } } if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq)) goto err1; if (mask & IB_QP_ACCESS_FLAGS) { if (!(qp_type(qp) == IB_QPT_RC || qp_type(qp) == IB_QPT_UC)) goto err1; if (attr->qp_access_flags & ~RXE_ACCESS_SUPPORTED_QP) goto err1; } if (mask & IB_QP_AV && rxe_av_chk_attr(qp, &attr->ah_attr)) goto err1; if (mask 
& IB_QP_ALT_PATH) { if (rxe_av_chk_attr(qp, &attr->alt_ah_attr)) goto err1; if (!rdma_is_port_valid(&rxe->ib_dev, attr->alt_port_num)) { rxe_dbg_qp(qp, "invalid alt port %d\n", attr->alt_port_num); goto err1; } if (attr->alt_timeout > 31) { rxe_dbg_qp(qp, "invalid alt timeout %d > 31\n", attr->alt_timeout); goto err1; } } if (mask & IB_QP_PATH_MTU) { struct rxe_port *port = &rxe->port; enum ib_mtu max_mtu = port->attr.max_mtu; enum ib_mtu mtu = attr->path_mtu; if (mtu > max_mtu) { rxe_dbg_qp(qp, "invalid mtu (%d) > (%d)\n", ib_mtu_enum_to_int(mtu), ib_mtu_enum_to_int(max_mtu)); goto err1; } } if (mask & IB_QP_MAX_QP_RD_ATOMIC) { if (attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) { rxe_dbg_qp(qp, "invalid max_rd_atomic %d > %d\n", attr->max_rd_atomic, rxe->attr.max_qp_rd_atom); goto err1; } } if (mask & IB_QP_TIMEOUT) { if (attr->timeout > 31) { rxe_dbg_qp(qp, "invalid timeout %d > 31\n", attr->timeout); goto err1; } } return 0; err1: return -EINVAL; } /* move the qp to the reset state */ static void rxe_qp_reset(struct rxe_qp *qp) { /* stop tasks from running */ rxe_disable_task(&qp->recv_task); rxe_disable_task(&qp->send_task); /* drain work and packet queuesc */ rxe_sender(qp); rxe_receiver(qp); if (qp->rq.queue) rxe_queue_reset(qp->rq.queue); if (qp->sq.queue) rxe_queue_reset(qp->sq.queue); /* cleanup attributes */ atomic_set(&qp->ssn, 0); qp->req.opcode = -1; qp->req.need_retry = 0; qp->req.wait_for_rnr_timer = 0; qp->req.noack_pkts = 0; qp->resp.msn = 0; qp->resp.opcode = -1; qp->resp.drop_msg = 0; qp->resp.goto_error = 0; qp->resp.sent_psn_nak = 0; if (qp->resp.mr) { rxe_put(qp->resp.mr); qp->resp.mr = NULL; } cleanup_rd_atomic_resources(qp); /* reenable tasks */ rxe_enable_task(&qp->recv_task); rxe_enable_task(&qp->send_task); } /* move the qp to the error state */ void rxe_qp_error(struct rxe_qp *qp) { unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ 
rxe_sched_task(&qp->recv_task); rxe_sched_task(&qp->send_task); spin_unlock_irqrestore(&qp->state_lock, flags); } static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); qp->attr.sq_draining = 1; rxe_sched_task(&qp->send_task); spin_unlock_irqrestore(&qp->state_lock, flags); } /* caller should hold qp->state_lock */ static int __qp_chk_state(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { enum ib_qp_state cur_state; enum ib_qp_state new_state; cur_state = (mask & IB_QP_CUR_STATE) ? attr->cur_qp_state : qp->attr.qp_state; new_state = (mask & IB_QP_STATE) ? attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) return -EINVAL; if (mask & IB_QP_STATE && cur_state == IB_QPS_SQD) { if (qp->attr.sq_draining && new_state != IB_QPS_ERR) return -EINVAL; } return 0; } static const char *const qps2str[] = { [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR", [IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE", [IB_QPS_ERR] = "ERR", }; /* called by the modify qp verb */ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, struct ib_udata *udata) { int err; if (mask & IB_QP_CUR_STATE) qp->attr.cur_qp_state = attr->qp_state; if (mask & IB_QP_STATE) { unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); err = __qp_chk_state(qp, attr, mask); if (!err) { qp->attr.qp_state = attr->qp_state; rxe_dbg_qp(qp, "state -> %s\n", qps2str[attr->qp_state]); } spin_unlock_irqrestore(&qp->state_lock, flags); if (err) return err; switch (attr->qp_state) { case IB_QPS_RESET: rxe_qp_reset(qp); break; case IB_QPS_SQD: rxe_qp_sqd(qp, attr, mask); break; case IB_QPS_ERR: rxe_qp_error(qp); break; default: break; } } if (mask & IB_QP_MAX_QP_RD_ATOMIC) { int max_rd_atomic = attr->max_rd_atomic ? 
roundup_pow_of_two(attr->max_rd_atomic) : 0; qp->attr.max_rd_atomic = max_rd_atomic; atomic_set(&qp->req.rd_atomic, max_rd_atomic); } if (mask & IB_QP_MAX_DEST_RD_ATOMIC) { int max_dest_rd_atomic = attr->max_dest_rd_atomic ? roundup_pow_of_two(attr->max_dest_rd_atomic) : 0; qp->attr.max_dest_rd_atomic = max_dest_rd_atomic; free_rd_atomic_resources(qp); err = alloc_rd_atomic_resources(qp, max_dest_rd_atomic); if (err) return err; } if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY) qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify; if (mask & IB_QP_ACCESS_FLAGS) qp->attr.qp_access_flags = attr->qp_access_flags; if (mask & IB_QP_PKEY_INDEX) qp->attr.pkey_index = attr->pkey_index; if (mask & IB_QP_PORT) qp->attr.port_num = attr->port_num; if (mask & IB_QP_QKEY) qp->attr.qkey = attr->qkey; if (mask & IB_QP_AV) rxe_init_av(&attr->ah_attr, &qp->pri_av); if (mask & IB_QP_ALT_PATH) { rxe_init_av(&attr->alt_ah_attr, &qp->alt_av); qp->attr.alt_port_num = attr->alt_port_num; qp->attr.alt_pkey_index = attr->alt_pkey_index; qp->attr.alt_timeout = attr->alt_timeout; } if (mask & IB_QP_PATH_MTU) { qp->attr.path_mtu = attr->path_mtu; qp->mtu = ib_mtu_enum_to_int(attr->path_mtu); } if (mask & IB_QP_TIMEOUT) { qp->attr.timeout = attr->timeout; if (attr->timeout == 0) { qp->qp_timeout_jiffies = 0; } else { /* According to the spec, timeout = 4.096 * 2 ^ attr->timeout [us] */ int j = nsecs_to_jiffies(4096ULL << attr->timeout); qp->qp_timeout_jiffies = j ? 
j : 1; } } if (mask & IB_QP_RETRY_CNT) { qp->attr.retry_cnt = attr->retry_cnt; qp->comp.retry_cnt = attr->retry_cnt; rxe_dbg_qp(qp, "set retry count = %d\n", attr->retry_cnt); } if (mask & IB_QP_RNR_RETRY) { qp->attr.rnr_retry = attr->rnr_retry; qp->comp.rnr_retry = attr->rnr_retry; rxe_dbg_qp(qp, "set rnr retry count = %d\n", attr->rnr_retry); } if (mask & IB_QP_RQ_PSN) { qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK); qp->resp.psn = qp->attr.rq_psn; rxe_dbg_qp(qp, "set resp psn = 0x%x\n", qp->resp.psn); } if (mask & IB_QP_MIN_RNR_TIMER) { qp->attr.min_rnr_timer = attr->min_rnr_timer; rxe_dbg_qp(qp, "set min rnr timer = 0x%x\n", attr->min_rnr_timer); } if (mask & IB_QP_SQ_PSN) { qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK); qp->req.psn = qp->attr.sq_psn; qp->comp.psn = qp->attr.sq_psn; rxe_dbg_qp(qp, "set req psn = 0x%x\n", qp->req.psn); } if (mask & IB_QP_PATH_MIG_STATE) qp->attr.path_mig_state = attr->path_mig_state; if (mask & IB_QP_DEST_QPN) qp->attr.dest_qp_num = attr->dest_qp_num; return 0; } /* called by the query qp verb */ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { unsigned long flags; *attr = qp->attr; attr->rq_psn = qp->resp.psn; attr->sq_psn = qp->req.psn; attr->cap.max_send_wr = qp->sq.max_wr; attr->cap.max_send_sge = qp->sq.max_sge; attr->cap.max_inline_data = qp->sq.max_inline; if (!qp->srq) { attr->cap.max_recv_wr = qp->rq.max_wr; attr->cap.max_recv_sge = qp->rq.max_sge; } rxe_av_to_attr(&qp->pri_av, &attr->ah_attr); rxe_av_to_attr(&qp->alt_av, &attr->alt_ah_attr); /* Applications that get this state typically spin on it. 
* Yield the processor */ spin_lock_irqsave(&qp->state_lock, flags); attr->cur_qp_state = qp_state(qp); if (qp->attr.sq_draining) { spin_unlock_irqrestore(&qp->state_lock, flags); cond_resched(); } else { spin_unlock_irqrestore(&qp->state_lock, flags); } return 0; } int rxe_qp_chk_destroy(struct rxe_qp *qp) { /* See IBA o10-2.2.3 * An attempt to destroy a QP while attached to a mcast group * will fail immediately. */ if (atomic_read(&qp->mcg_num)) { rxe_dbg_qp(qp, "Attempt to destroy while attached to multicast group\n"); return -EBUSY; } return 0; } /* called when the last reference to the qp is dropped */ static void rxe_qp_do_cleanup(struct work_struct *work) { struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); qp->valid = 0; spin_unlock_irqrestore(&qp->state_lock, flags); qp->qp_timeout_jiffies = 0; /* In the function timer_setup, .function is initialized. If .function * is NULL, it indicates the function timer_setup is not called, the * timer is not initialized. Or else, the timer is initialized. 
*/ if (qp_type(qp) == IB_QPT_RC && qp->retrans_timer.function && qp->rnr_nak_timer.function) { timer_delete_sync(&qp->retrans_timer); timer_delete_sync(&qp->rnr_nak_timer); } if (qp->recv_task.func) rxe_cleanup_task(&qp->recv_task); if (qp->send_task.func) rxe_cleanup_task(&qp->send_task); /* flush out any receive wr's or pending requests */ rxe_sender(qp); rxe_receiver(qp); if (qp->sq.queue) rxe_queue_cleanup(qp->sq.queue); if (qp->srq) rxe_put(qp->srq); if (qp->rq.queue) rxe_queue_cleanup(qp->rq.queue); if (qp->scq) { atomic_dec(&qp->scq->num_wq); rxe_put(qp->scq); } if (qp->rcq) { atomic_dec(&qp->rcq->num_wq); rxe_put(qp->rcq); } if (qp->pd) rxe_put(qp->pd); if (qp->resp.mr) rxe_put(qp->resp.mr); free_rd_atomic_resources(qp); if (qp->sk) { if (qp_type(qp) == IB_QPT_RC) sk_dst_reset(qp->sk->sk); kernel_sock_shutdown(qp->sk, SHUT_RDWR); sock_release(qp->sk); } } /* called when the last reference to the qp is dropped */ void rxe_qp_cleanup(struct rxe_pool_elem *elem) { struct rxe_qp *qp = container_of(elem, typeof(*qp), elem); execute_in_process_context(rxe_qp_do_cleanup, &qp->cleanup_work); }
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 // SPDX-License-Identifier: GPL-2.0-only /* * scsi.c Copyright (C) 1992 Drew Eckhardt * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale * Copyright (C) 2002, 2003 Christoph Hellwig * * generic mid-level SCSI driver * Initial versions: Drew Eckhardt * Subsequent revisions: Eric Youngdale * * <drew@colorado.edu> * * Bug correction thanks go to : * Rik Faith <faith@cs.unc.edu> * Tommy Thorn <tthorn> * Thomas Wuensche <tw@fgb1.fgb.mw.tu-muenchen.de> * * Modified by Eric Youngdale eric@andante.org or ericy@gnu.ai.mit.edu to * add scatter-gather, multiple outstanding request, and other * enhancements. * * Native multichannel, wide scsi, /proc/scsi and hot plugging * support added by Michael Neuffer <mike@i-connect.net> * * Added request_module("scsi_hostadapter") for kerneld: * (Put an "alias scsi_hostadapter your_hostadapter" in /etc/modprobe.conf) * Bjorn Ekwall <bj0rn@blox.se> * (changed to kmod) * * Major improvements to the timeout, abort, and reset processing, * as well as performance modifications for large queue depths by * Leonard N. Zubkoff <lnz@dandelion.com> * * Converted cli() code to spinlocks, Ingo Molnar * * Jiffies wrap fixes (host->resetting), 3 Dec 1998 Andrea Arcangeli * * out_of_space hacks, D. 
#ifdef CONFIG_SCSI_LOGGING
/**
 * scsi_log_send - log a command that is being sent
 * @cmd: the command to log
 *
 * Prints nothing unless scsi_logging_level is non-zero and the ML QUEUE
 * log level is above 1. From level 2 upwards (levels 3 and 4 behave the
 * same as 2) the command address and the command itself are printed.
 */
void scsi_log_send(struct scsi_cmnd *cmd)
{
	unsigned int mlqueue_level;

	if (likely(!scsi_logging_level))
		return;

	mlqueue_level = SCSI_LOG_LEVEL(SCSI_LOG_MLQUEUE_SHIFT,
				       SCSI_LOG_MLQUEUE_BITS);
	if (mlqueue_level <= 1)
		return;

	scmd_printk(KERN_INFO, cmd, "Send: scmd 0x%p\n", cmd);
	scsi_print_command(cmd);
}

/**
 * scsi_log_completion - log the completion of a command
 * @cmd: the completed command
 * @disposition: disposition of the command; anything other than SUCCESS
 *	counts as a failure for logging purposes
 *
 * Behaviour by ML COMPLETE log level:
 *
 * 0: nothing is logged.
 * 1: disposition, result and the command are logged, plus sense data when
 *    the status is CHECK CONDITION, but only when @cmd->result is non-zero
 *    or @disposition is not SUCCESS.
 * 2: same as 1 but for every completion.
 * 3: same as 2.
 * 4: same as 3 plus the host busy and failed counters.
 */
void scsi_log_completion(struct scsi_cmnd *cmd, int disposition)
{
	unsigned int mlcomplete_level;

	if (likely(!scsi_logging_level))
		return;

	mlcomplete_level = SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT,
					  SCSI_LOG_MLCOMPLETE_BITS);
	if (mlcomplete_level == 0)
		return;

	/* At level 1 only failures are interesting. */
	if (mlcomplete_level == 1 && !cmd->result && disposition == SUCCESS)
		return;

	scsi_print_result(cmd, "Done", disposition);
	scsi_print_command(cmd);

	if (scsi_status_is_check_condition(cmd->result))
		scsi_print_sense(cmd);

	if (mlcomplete_level > 3)
		scmd_printk(KERN_INFO, cmd,
			    "scsi host busy %d failed %d\n",
			    scsi_host_busy(cmd->device->host),
			    cmd->device->host->host_failed);
}
#endif
*/ if (good_bytes == old_good_bytes) good_bytes -= scsi_get_resid(cmd); } scsi_io_completion(cmd, good_bytes); } /* * 4096 is big enough for saturating fast SCSI LUNs. */ int scsi_device_max_queue_depth(struct scsi_device *sdev) { return min_t(int, sdev->host->can_queue, 4096); } /** * scsi_change_queue_depth - change a device's queue depth * @sdev: SCSI Device in question * @depth: number of commands allowed to be queued to the driver * * Sets the device queue depth and returns the new value. */ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) { if (!sdev->budget_map.map) return -EINVAL; depth = min_t(int, depth, scsi_device_max_queue_depth(sdev)); if (depth > 0) { sdev->queue_depth = depth; wmb(); } if (sdev->request_queue) blk_set_queue_depth(sdev->request_queue, depth); sbitmap_resize(&sdev->budget_map, sdev->queue_depth); return sdev->queue_depth; } EXPORT_SYMBOL(scsi_change_queue_depth); /** * scsi_track_queue_full - track QUEUE_FULL events to adjust queue depth * @sdev: SCSI Device in question * @depth: Current number of outstanding SCSI commands on this device, * not counting the one returned as QUEUE_FULL. * * Description: This function will track successive QUEUE_FULL events on a * specific SCSI device to determine if and when there is a * need to adjust the queue depth on the device. * * Returns: * * 0 - No change needed * * >0 - Adjust queue depth to this new depth, * * -1 - Drop back to untagged operation using host->cmd_per_lun as the * untagged command depth * * Lock Status: None held on entry * * Notes: Low level drivers may call this at any time and we will do * "The Right Thing." We are interrupt context safe. */ int scsi_track_queue_full(struct scsi_device *sdev, int depth) { if (!sdev->budget_map.map) return 0; /* * Don't let QUEUE_FULLs on the same * jiffies count, they could all be from * same event. 
*/ if ((jiffies >> 4) == (sdev->last_queue_full_time >> 4)) return 0; sdev->last_queue_full_time = jiffies; if (sdev->last_queue_full_depth != depth) { sdev->last_queue_full_count = 1; sdev->last_queue_full_depth = depth; } else { sdev->last_queue_full_count++; } if (sdev->last_queue_full_count <= 10) return 0; return scsi_change_queue_depth(sdev, depth); } EXPORT_SYMBOL(scsi_track_queue_full); /** * scsi_vpd_inquiry - Request a device provide us with a VPD page * @sdev: The device to ask * @buffer: Where to put the result * @page: Which Vital Product Data to return * @len: The length of the buffer * * This is an internal helper function. You probably want to use * scsi_get_vpd_page instead. * * Returns size of the vpd page on success or a negative error number. */ static int scsi_vpd_inquiry(struct scsi_device *sdev, unsigned char *buffer, u8 page, unsigned len) { int result; unsigned char cmd[16]; if (len < 4) return -EINVAL; cmd[0] = INQUIRY; cmd[1] = 1; /* EVPD */ cmd[2] = page; cmd[3] = len >> 8; cmd[4] = len & 0xff; cmd[5] = 0; /* Control byte */ /* * I'm not convinced we need to try quite this hard to get VPD, but * all the existing users tried this hard. */ result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, len, 30 * HZ, 3, NULL); if (result) return -EIO; /* * Sanity check that we got the page back that we asked for and that * the page size is not 0. */ if (buffer[1] != page) return -EIO; result = get_unaligned_be16(&buffer[2]); if (!result) return -EIO; return result + 4; } enum scsi_vpd_parameters { SCSI_VPD_HEADER_SIZE = 4, SCSI_VPD_LIST_SIZE = 36, }; static int scsi_get_vpd_size(struct scsi_device *sdev, u8 page) { unsigned char vpd[SCSI_VPD_LIST_SIZE] __aligned(4); int result; if (sdev->no_vpd_size) return SCSI_DEFAULT_VPD_LEN; /* * Fetch the supported pages VPD and validate that the requested page * number is present. 
*/ if (page != 0) { result = scsi_vpd_inquiry(sdev, vpd, 0, sizeof(vpd)); if (result < SCSI_VPD_HEADER_SIZE) return 0; if (result > sizeof(vpd)) { dev_warn_once(&sdev->sdev_gendev, "%s: long VPD page 0 length: %d bytes\n", __func__, result); result = sizeof(vpd); } result -= SCSI_VPD_HEADER_SIZE; if (!memchr(&vpd[SCSI_VPD_HEADER_SIZE], page, result)) return 0; } /* * Fetch the VPD page header to find out how big the page * is. This is done to prevent problems on legacy devices * which can not handle allocation lengths as large as * potentially requested by the caller. */ result = scsi_vpd_inquiry(sdev, vpd, page, SCSI_VPD_HEADER_SIZE); if (result < 0) return 0; if (result < SCSI_VPD_HEADER_SIZE) { dev_warn_once(&sdev->sdev_gendev, "%s: short VPD page 0x%02x length: %d bytes\n", __func__, page, result); return 0; } return result; } /** * scsi_get_vpd_page - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return * @buf: where to store the VPD * @buf_len: number of bytes in the VPD buffer area * * SCSI devices may optionally supply Vital Product Data. Each 'page' * of VPD is defined in the appropriate SCSI document (eg SPC, SBC). * If the device supports this VPD page, this routine fills @buf * with the data from that page and return 0. If the VPD page is not * supported or its content cannot be retrieved, -EINVAL is returned. */ int scsi_get_vpd_page(struct scsi_device *sdev, u8 page, unsigned char *buf, int buf_len) { int result, vpd_len; if (!scsi_device_supports_vpd(sdev)) return -EINVAL; vpd_len = scsi_get_vpd_size(sdev, page); if (vpd_len <= 0) return -EINVAL; vpd_len = min(vpd_len, buf_len); /* * Fetch the actual page. Since the appropriate size was reported * by the device it is now safe to ask for something bigger. 
*/ memset(buf, 0, buf_len); result = scsi_vpd_inquiry(sdev, buf, page, vpd_len); if (result < 0) return -EINVAL; else if (result > vpd_len) dev_warn_once(&sdev->sdev_gendev, "%s: VPD page 0x%02x result %d > %d bytes\n", __func__, page, result, vpd_len); return 0; } EXPORT_SYMBOL_GPL(scsi_get_vpd_page); /** * scsi_get_vpd_buf - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return * * Returns %NULL upon failure. */ static struct scsi_vpd *scsi_get_vpd_buf(struct scsi_device *sdev, u8 page) { struct scsi_vpd *vpd_buf; int vpd_len, result; vpd_len = scsi_get_vpd_size(sdev, page); if (vpd_len <= 0) return NULL; retry_pg: /* * Fetch the actual page. Since the appropriate size was reported * by the device it is now safe to ask for something bigger. */ vpd_buf = kmalloc(sizeof(*vpd_buf) + vpd_len, GFP_KERNEL); if (!vpd_buf) return NULL; result = scsi_vpd_inquiry(sdev, vpd_buf->data, page, vpd_len); if (result < 0) { kfree(vpd_buf); return NULL; } if (result > vpd_len) { dev_warn_once(&sdev->sdev_gendev, "%s: VPD page 0x%02x result %d > %d bytes\n", __func__, page, result, vpd_len); vpd_len = result; kfree(vpd_buf); goto retry_pg; } vpd_buf->len = result; return vpd_buf; } static void scsi_update_vpd_page(struct scsi_device *sdev, u8 page, struct scsi_vpd __rcu **sdev_vpd_buf) { struct scsi_vpd *vpd_buf; vpd_buf = scsi_get_vpd_buf(sdev, page); if (!vpd_buf) return; mutex_lock(&sdev->inquiry_mutex); vpd_buf = rcu_replace_pointer(*sdev_vpd_buf, vpd_buf, lockdep_is_held(&sdev->inquiry_mutex)); mutex_unlock(&sdev->inquiry_mutex); if (vpd_buf) kfree_rcu(vpd_buf, rcu); } /** * scsi_attach_vpd - Attach Vital Product Data to a SCSI device structure * @sdev: The device to ask * * Attach the 'Device Identification' VPD page (0x83) and the * 'Unit Serial Number' VPD page (0x80) to a SCSI device * structure. This information can be used to identify the device * uniquely. 
*/ void scsi_attach_vpd(struct scsi_device *sdev) { int i; struct scsi_vpd *vpd_buf; if (!scsi_device_supports_vpd(sdev)) return; /* Ask for all the pages supported by this device */ vpd_buf = scsi_get_vpd_buf(sdev, 0); if (!vpd_buf) return; for (i = 4; i < vpd_buf->len; i++) { switch (vpd_buf->data[i]) { case 0x0: scsi_update_vpd_page(sdev, 0x0, &sdev->vpd_pg0); break; case 0x80: scsi_update_vpd_page(sdev, 0x80, &sdev->vpd_pg80); break; case 0x83: scsi_update_vpd_page(sdev, 0x83, &sdev->vpd_pg83); break; case 0x89: scsi_update_vpd_page(sdev, 0x89, &sdev->vpd_pg89); break; case 0xb0: scsi_update_vpd_page(sdev, 0xb0, &sdev->vpd_pgb0); break; case 0xb1: scsi_update_vpd_page(sdev, 0xb1, &sdev->vpd_pgb1); break; case 0xb2: scsi_update_vpd_page(sdev, 0xb2, &sdev->vpd_pgb2); break; case 0xb7: scsi_update_vpd_page(sdev, 0xb7, &sdev->vpd_pgb7); break; default: break; } } kfree(vpd_buf); } /** * scsi_report_opcode - Find out if a given command is supported * @sdev: scsi device to query * @buffer: scratch buffer (must be at least 20 bytes long) * @len: length of buffer * @opcode: opcode for the command to look up * @sa: service action for the command to look up * * Uses the REPORT SUPPORTED OPERATION CODES to check support for the * command identified with @opcode and @sa. If the command does not * have a service action, @sa must be 0. Returns -EINVAL if RSOC fails, * 0 if the command is not supported and 1 if the device claims to * support the command. 
*/ int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer, unsigned int len, unsigned char opcode, unsigned short sa) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; int result, request_len; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; if (sdev->no_report_opcodes || sdev->scsi_level < SCSI_SPC_3) return -EINVAL; /* RSOC header + size of command we are asking about */ request_len = 4 + COMMAND_SIZE(opcode); if (request_len > len) { dev_warn_once(&sdev->sdev_gendev, "%s: len %u bytes, opcode 0x%02x needs %u\n", __func__, len, opcode, request_len); return -EINVAL; } memset(cmd, 0, 16); cmd[0] = MAINTENANCE_IN; cmd[1] = MI_REPORT_SUPPORTED_OPERATION_CODES; if (!sa) { cmd[2] = 1; /* One command format */ cmd[3] = opcode; } else { cmd[2] = 3; /* One command format with service action */ cmd[3] = opcode; put_unaligned_be16(sa, &cmd[4]); } put_unaligned_be32(request_len, &cmd[6]); memset(buffer, 0, len); result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, request_len, 30 * HZ, 3, &exec_args); if (result < 0) return result; if (result && scsi_sense_valid(&sshdr) && sshdr.sense_key == ILLEGAL_REQUEST && (sshdr.asc == 0x20 || sshdr.asc == 0x24) && sshdr.ascq == 0x00) return -EINVAL; if ((buffer[1] & 3) == 3) /* Command supported */ return 1; return 0; } EXPORT_SYMBOL(scsi_report_opcode); #define SCSI_CDL_CHECK_BUF_LEN 64 static bool scsi_cdl_check_cmd(struct scsi_device *sdev, u8 opcode, u16 sa, unsigned char *buf) { int ret; u8 cdlp; /* Check operation code */ ret = scsi_report_opcode(sdev, buf, SCSI_CDL_CHECK_BUF_LEN, opcode, sa); if (ret <= 0) return false; if ((buf[1] & 0x03) != 0x03) return false; /* * See SPC-6, One_command parameter data format for * REPORT SUPPORTED OPERATION CODES. We have the following cases * depending on rwcdlp (buf[0] & 0x01) value: * - rwcdlp == 0: then cdlp indicates support for the A mode page when * it is equal to 1 and for the B mode page when it is * equal to 2. 
* - rwcdlp == 1: then cdlp indicates support for the T2A mode page * when it is equal to 1 and for the T2B mode page when * it is equal to 2. * Overall, to detect support for command duration limits, we only need * to check that cdlp is 1 or 2. */ cdlp = (buf[1] & 0x18) >> 3; return cdlp == 0x01 || cdlp == 0x02; } /** * scsi_cdl_check - Check if a SCSI device supports Command Duration Limits * @sdev: The device to check */ void scsi_cdl_check(struct scsi_device *sdev) { bool cdl_supported; unsigned char *buf; /* * Support for CDL was defined in SPC-5. Ignore devices reporting an * lower SPC version. This also avoids problems with old drives choking * on MAINTENANCE_IN / MI_REPORT_SUPPORTED_OPERATION_CODES with a * service action specified, as done in scsi_cdl_check_cmd(). */ if (sdev->scsi_level < SCSI_SPC_5) { sdev->cdl_supported = 0; return; } buf = kmalloc(SCSI_CDL_CHECK_BUF_LEN, GFP_KERNEL); if (!buf) { sdev->cdl_supported = 0; return; } /* Check support for READ_16, WRITE_16, READ_32 and WRITE_32 commands */ cdl_supported = scsi_cdl_check_cmd(sdev, READ_16, 0, buf) || scsi_cdl_check_cmd(sdev, WRITE_16, 0, buf) || scsi_cdl_check_cmd(sdev, VARIABLE_LENGTH_CMD, READ_32, buf) || scsi_cdl_check_cmd(sdev, VARIABLE_LENGTH_CMD, WRITE_32, buf); if (cdl_supported) { /* * We have CDL support: force the use of READ16/WRITE16. * READ32 and WRITE32 will be used for devices that support * the T10_PI_TYPE2_PROTECTION protection type. */ sdev->use_16_for_rw = 1; sdev->use_10_for_rw = 0; sdev->cdl_supported = 1; /* * If the device supports CDL, make sure that the current drive * feature status is consistent with the user controlled * cdl_enable state. 
*/ scsi_cdl_enable(sdev, sdev->cdl_enable); } else { sdev->cdl_supported = 0; } kfree(buf); } /** * scsi_cdl_enable - Enable or disable a SCSI device supports for Command * Duration Limits * @sdev: The target device * @enable: the target state */ int scsi_cdl_enable(struct scsi_device *sdev, bool enable) { char buf[64]; int ret; if (!sdev->cdl_supported) return -EOPNOTSUPP; /* * For ATA devices, CDL needs to be enabled with a SET FEATURES command. */ if (sdev->is_ata) { struct scsi_mode_data data; struct scsi_sense_hdr sshdr; char *buf_data; int len; ret = scsi_mode_sense(sdev, 0x08, 0x0a, 0xf2, buf, sizeof(buf), 5 * HZ, 3, &data, NULL); if (ret) return -EINVAL; /* Enable or disable CDL using the ATA feature page */ len = min_t(size_t, sizeof(buf), data.length - data.header_length - data.block_descriptor_length); buf_data = buf + data.header_length + data.block_descriptor_length; /* * If we want to enable CDL and CDL is already enabled on the * device, do nothing. This avoids needlessly resetting the CDL * statistics on the device as that is implied by the CDL enable * action. Similar to this, there is no need to do anything if * we want to disable CDL and CDL is already disabled. */ if (enable) { if ((buf_data[4] & 0x03) == 0x02) goto out; buf_data[4] &= ~0x03; buf_data[4] |= 0x02; } else { if ((buf_data[4] & 0x03) == 0x00) goto out; buf_data[4] &= ~0x03; } ret = scsi_mode_select(sdev, 1, 0, buf_data, len, 5 * HZ, 3, &data, &sshdr); if (ret) { if (ret > 0 && scsi_sense_valid(&sshdr)) scsi_print_sense_hdr(sdev, dev_name(&sdev->sdev_gendev), &sshdr); return ret; } } out: sdev->cdl_enable = enable; return 0; } /** * scsi_device_get - get an additional reference to a scsi_device * @sdev: device to get a reference to * * Description: Gets a reference to the scsi_device and increments the use count * of the underlying LLDD module. You must hold host_lock of the * parent Scsi_Host or already have a reference when calling this. 
* * This will fail if a device is deleted or cancelled, or when the LLD module * is in the process of being unloaded. */ int scsi_device_get(struct scsi_device *sdev) { if (sdev->sdev_state == SDEV_DEL || sdev->sdev_state == SDEV_CANCEL) goto fail; if (!try_module_get(sdev->host->hostt->module)) goto fail; if (!get_device(&sdev->sdev_gendev)) goto fail_put_module; return 0; fail_put_module: module_put(sdev->host->hostt->module); fail: return -ENXIO; } EXPORT_SYMBOL(scsi_device_get); /** * scsi_device_put - release a reference to a scsi_device * @sdev: device to release a reference on. * * Description: Release a reference to the scsi_device and decrements the use * count of the underlying LLDD module. The device is freed once the last * user vanishes. */ void scsi_device_put(struct scsi_device *sdev) { struct module *mod = sdev->host->hostt->module; put_device(&sdev->sdev_gendev); module_put(mod); } EXPORT_SYMBOL(scsi_device_put); /* helper for shost_for_each_device, see that for documentation */ struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *shost, struct scsi_device *prev) { struct list_head *list = (prev ? &prev->siblings : &shost->__devices); struct scsi_device *next = NULL; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); while (list->next != &shost->__devices) { next = list_entry(list->next, struct scsi_device, siblings); /* * Skip pseudo devices and also devices we can't get a * reference to. */ if (!scsi_device_is_pseudo_dev(next) && !scsi_device_get(next)) break; next = NULL; list = list->next; } spin_unlock_irqrestore(shost->host_lock, flags); if (prev) scsi_device_put(prev); return next; } EXPORT_SYMBOL(__scsi_iterate_devices); /** * starget_for_each_device - helper to walk all devices of a target * @starget: target whose devices we want to iterate over. * @data: Opaque passed to each function call. * @fn: Function to call on each device * * This traverses over each device of @starget. 
The devices have * a reference that must be released by scsi_host_put when breaking * out of the loop. */ void starget_for_each_device(struct scsi_target *starget, void *data, void (*fn)(struct scsi_device *, void *)) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct scsi_device *sdev; shost_for_each_device(sdev, shost) { if ((sdev->channel == starget->channel) && (sdev->id == starget->id)) fn(sdev, data); } } EXPORT_SYMBOL(starget_for_each_device); /** * __starget_for_each_device - helper to walk all devices of a target (UNLOCKED) * @starget: target whose devices we want to iterate over. * @data: parameter for callback @fn() * @fn: callback function that is invoked for each device * * This traverses over each device of @starget. It does _not_ * take a reference on the scsi_device, so the whole loop must be * protected by shost->host_lock. * * Note: The only reason why drivers would want to use this is because * they need to access the device list in irq context. Otherwise you * really want to use starget_for_each_device instead. **/ void __starget_for_each_device(struct scsi_target *starget, void *data, void (*fn)(struct scsi_device *, void *)) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct scsi_device *sdev; __shost_for_each_device(sdev, shost) { if ((sdev->channel == starget->channel) && (sdev->id == starget->id)) fn(sdev, data); } } EXPORT_SYMBOL(__starget_for_each_device); /** * __scsi_device_lookup_by_target - find a device given the target (UNLOCKED) * @starget: SCSI target pointer * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @lun for a given * @starget. The returned scsi_device does not have an additional * reference. You must hold the host's host_lock over this call and * any access to the returned scsi_device. A scsi_device in state * SDEV_DEL is skipped. 
/**
 * __scsi_device_lookup_by_target - find a device given the target (UNLOCKED)
 * @starget:	SCSI target pointer
 * @lun:	SCSI Logical Unit Number
 *
 * Description: Looks up the scsi_device with the specified @lun for a given
 * @starget.  The returned scsi_device does not have an additional
 * reference.  You must hold the host's host_lock over this call and
 * any access to the returned scsi_device. A scsi_device in state
 * SDEV_DEL is skipped.
 *
 * Note:  The only reason why drivers should use this is because
 * they need to access the device list in irq context.  Otherwise you
 * really want to use scsi_device_lookup_by_target instead.
 *
 * Return: the matching device, or NULL if there is none.
 **/
struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *starget,
						   u64 lun)
{
	struct scsi_device *sdev;

	list_for_each_entry(sdev, &starget->devices, same_target_siblings) {
		if (sdev->sdev_state != SDEV_DEL && sdev->lun == lun)
			return sdev;
	}

	return NULL;
}
EXPORT_SYMBOL(__scsi_device_lookup_by_target);

/**
 * scsi_device_lookup_by_target - find a device given the target
 * @starget:	SCSI target pointer
 * @lun:	SCSI Logical Unit Number
 *
 * Description: Looks up the scsi_device with the specified @lun for a given
 * @starget.  The returned scsi_device has an additional reference that
 * needs to be released with scsi_device_put once you're done with it.
 *
 * Return: the matching device, or NULL if there is none or a reference to
 * it could not be taken.
 **/
struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *starget,
						 u64 lun)
{
	struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
	struct scsi_device *match;
	unsigned long flags;

	spin_lock_irqsave(shost->host_lock, flags);
	match = __scsi_device_lookup_by_target(starget, lun);
	if (match) {
		/* A device we cannot pin is treated as not found. */
		if (scsi_device_get(match))
			match = NULL;
	}
	spin_unlock_irqrestore(shost->host_lock, flags);

	return match;
}
EXPORT_SYMBOL(scsi_device_lookup_by_target);
Otherwise you * really want to use scsi_device_lookup instead. **/ struct scsi_device *__scsi_device_lookup(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct scsi_device *sdev; list_for_each_entry(sdev, &shost->__devices, siblings) { if (sdev->sdev_state == SDEV_DEL) continue; if (sdev->channel == channel && sdev->id == id && sdev->lun ==lun) return sdev; } return NULL; } EXPORT_SYMBOL(__scsi_device_lookup); /** * scsi_device_lookup - find a device given the host * @shost: SCSI host pointer * @channel: SCSI channel (zero if only one channel) * @id: SCSI target number (physical unit number) * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @channel, @id, @lun * for a given host. The returned scsi_device has an additional reference that * needs to be released with scsi_device_put once you're done with it. **/ struct scsi_device *scsi_device_lookup(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct scsi_device *sdev; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); sdev = __scsi_device_lookup(shost, channel, id, lun); if (sdev && scsi_device_get(sdev)) sdev = NULL; spin_unlock_irqrestore(shost->host_lock, flags); return sdev; } EXPORT_SYMBOL(scsi_device_lookup); MODULE_DESCRIPTION("SCSI core"); MODULE_LICENSE("GPL"); module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels"); static int __init init_scsi(void) { int error; error = scsi_init_procfs(); if (error) goto cleanup_queue; error = scsi_init_devinfo(); if (error) goto cleanup_procfs; error = scsi_init_hosts(); if (error) goto cleanup_devlist; error = scsi_init_sysctl(); if (error) goto cleanup_hosts; error = scsi_sysfs_register(); if (error) goto cleanup_sysctl; scsi_netlink_init(); printk(KERN_NOTICE "SCSI subsystem initialized\n"); return 0; cleanup_sysctl: scsi_exit_sysctl(); cleanup_hosts: scsi_exit_hosts(); cleanup_devlist: scsi_exit_devinfo(); 
cleanup_procfs: scsi_exit_procfs(); cleanup_queue: scsi_exit_queue(); printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n", -error); return error; } static void __exit exit_scsi(void) { scsi_netlink_exit(); scsi_sysfs_unregister(); scsi_exit_sysctl(); scsi_exit_hosts(); scsi_exit_devinfo(); scsi_exit_procfs(); scsi_exit_queue(); } subsys_initcall(init_scsi); module_exit(exit_scsi);
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 
1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli */ #include "tp_meter.h" #include "main.h" #include <linux/atomic.h> #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/err.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/kthread.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/param.h> #include <linux/printk.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/timer.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "hard-interface.h" #include "log.h" #include "netlink.h" #include "originator.h" #include "send.h" /** * BATADV_TP_DEF_TEST_LENGTH - Default test length if not specified by the user * in milliseconds */ #define BATADV_TP_DEF_TEST_LENGTH 10000 /** * BATADV_TP_AWND - Advertised window by the receiver (in bytes) */ #define BATADV_TP_AWND 0x20000000 /** * BATADV_TP_RECV_TIMEOUT - Receiver activity timeout. 
If the receiver does not * get anything for such amount of milliseconds, the connection is killed */ #define BATADV_TP_RECV_TIMEOUT 1000 /** * BATADV_TP_MAX_RTO - Maximum sender timeout. If the sender RTO gets beyond * such amount of milliseconds, the receiver is considered unreachable and the * connection is killed */ #define BATADV_TP_MAX_RTO 30000 /** * BATADV_TP_FIRST_SEQ - First seqno of each session. The number is rather high * in order to immediately trigger a wrap around (test purposes) */ #define BATADV_TP_FIRST_SEQ ((u32)-1 - 2000) /** * BATADV_TP_PLEN - length of the payload (data after the batadv_unicast header) * to simulate */ #define BATADV_TP_PLEN (BATADV_TP_PACKET_LEN - ETH_HLEN - \ sizeof(struct batadv_unicast_packet)) static u8 batadv_tp_prerandom[4096] __read_mostly; /** * batadv_tp_session_cookie() - generate session cookie based on session ids * @session: TP session identifier * @icmp_uid: icmp pseudo uid of the tp session * * Return: 32 bit tp_meter session cookie */ static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid) { u32 cookie; cookie = icmp_uid << 16; cookie |= session[0] << 8; cookie |= session[1]; return cookie; } /** * batadv_tp_cwnd() - compute the new cwnd size * @base: base cwnd size value * @increment: the value to add to base to get the new size * @min: minimum cwnd value (usually MSS) * * Return the new cwnd size and ensure it does not exceed the Advertised * Receiver Window size. It is wrapped around safely. 
* For details refer to Section 3.1 of RFC5681 * * Return: new congestion window size in bytes */ static u32 batadv_tp_cwnd(u32 base, u32 increment, u32 min) { u32 new_size = base + increment; /* check for wrap-around */ if (new_size < base) new_size = (u32)ULONG_MAX; new_size = min_t(u32, new_size, BATADV_TP_AWND); return max_t(u32, new_size, min); } /** * batadv_tp_update_cwnd() - update the Congestion Windows * @tp_vars: the private data of the current TP meter session * @mss: maximum segment size of transmission * * 1) if the session is in Slow Start, the CWND has to be increased by 1 * MSS every unique received ACK * 2) if the session is in Congestion Avoidance, the CWND has to be * increased by MSS * MSS / CWND for every unique received ACK */ static void batadv_tp_update_cwnd(struct batadv_tp_vars *tp_vars, u32 mss) { spin_lock_bh(&tp_vars->cwnd_lock); /* slow start... */ if (tp_vars->cwnd <= tp_vars->ss_threshold) { tp_vars->dec_cwnd = 0; tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd, mss, mss); spin_unlock_bh(&tp_vars->cwnd_lock); return; } /* increment CWND at least of 1 (section 3.1 of RFC5681) */ tp_vars->dec_cwnd += max_t(u32, 1U << 3, ((mss * mss) << 6) / (tp_vars->cwnd << 3)); if (tp_vars->dec_cwnd < (mss << 3)) { spin_unlock_bh(&tp_vars->cwnd_lock); return; } tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd, mss, mss); tp_vars->dec_cwnd = 0; spin_unlock_bh(&tp_vars->cwnd_lock); } /** * batadv_tp_update_rto() - calculate new retransmission timeout * @tp_vars: the private data of the current TP meter session * @new_rtt: new roundtrip time in msec */ static void batadv_tp_update_rto(struct batadv_tp_vars *tp_vars, u32 new_rtt) { long m = new_rtt; /* RTT update * Details in Section 2.2 and 2.3 of RFC6298 * * It's tricky to understand. Don't lose hair please. 
* Inspired by tcp_rtt_estimator() tcp_input.c */ if (tp_vars->srtt != 0) { m -= (tp_vars->srtt >> 3); /* m is now error in rtt est */ tp_vars->srtt += m; /* rtt = 7/8 srtt + 1/8 new */ if (m < 0) m = -m; m -= (tp_vars->rttvar >> 2); tp_vars->rttvar += m; /* mdev ~= 3/4 rttvar + 1/4 new */ } else { /* first measure getting in */ tp_vars->srtt = m << 3; /* take the measured time to be srtt */ tp_vars->rttvar = m << 1; /* new_rtt / 2 */ } /* rto = srtt + 4 * rttvar. * rttvar is scaled by 4, therefore doesn't need to be multiplied */ tp_vars->rto = (tp_vars->srtt >> 3) + tp_vars->rttvar; } /** * batadv_tp_batctl_notify() - send client status result to client * @reason: reason for tp meter session stop * @dst: destination of tp_meter session * @bat_priv: the bat priv with all the mesh interface information * @start_time: start of transmission in jiffies * @total_sent: bytes acked to the receiver * @cookie: cookie of tp_meter session */ static void batadv_tp_batctl_notify(enum batadv_tp_meter_reason reason, const u8 *dst, struct batadv_priv *bat_priv, unsigned long start_time, u64 total_sent, u32 cookie) { u32 test_time; u8 result; u32 total_bytes; if (!batadv_tp_is_error(reason)) { result = BATADV_TP_REASON_COMPLETE; test_time = jiffies_to_msecs(jiffies - start_time); total_bytes = total_sent; } else { result = reason; test_time = 0; total_bytes = 0; } batadv_netlink_tpmeter_notify(bat_priv, dst, result, test_time, total_bytes, cookie); } /** * batadv_tp_batctl_error_notify() - send client error result to client * @reason: reason for tp meter session stop * @dst: destination of tp_meter session * @bat_priv: the bat priv with all the mesh interface information * @cookie: cookie of tp_meter session */ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, const u8 *dst, struct batadv_priv *bat_priv, u32 cookie) { batadv_tp_batctl_notify(reason, dst, bat_priv, 0, 0, cookie); } /** * batadv_tp_list_find() - find a tp_vars object in the global list * 
* @bat_priv: the bat priv with all the mesh interface information
 * @dst: the other endpoint MAC address to look for
 *
 * Look for a tp_vars object matching dst as end_point and return it after
 * having incremented the refcounter. Return NULL if not found
 *
 * Return: matching tp_vars or NULL when no tp_vars with @dst was found
 */
static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv,
						  const u8 *dst)
{
	struct batadv_tp_vars *entry, *found = NULL;

	rcu_read_lock();
	hlist_for_each_entry_rcu(entry, &bat_priv->tp_list, list) {
		if (!batadv_compare_eth(entry->other_end, dst))
			continue;

		/* most of the time this function is invoked during the normal
		 * process..it makes sense to pay more when the session is
		 * finished and to speed the process up during the measurement
		 */
		if (likely(kref_get_unless_zero(&entry->refcount))) {
			found = entry;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}

/**
 * batadv_tp_list_find_session() - find tp_vars session object in the global
 *  list
 * @bat_priv: the bat priv with all the mesh interface information
 * @dst: the other endpoint MAC address to look for
 * @session: session identifier
 *
 * Look for a tp_vars object matching dst as end_point, session as tp meter
 * session and return it after having increment the refcounter.
Return NULL * is not found * * Return: matching tp_vars or NULL when no tp_vars was found */ static struct batadv_tp_vars * batadv_tp_list_find_session(struct batadv_priv *bat_priv, const u8 *dst, const u8 *session) { struct batadv_tp_vars *pos, *tp_vars = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(pos, &bat_priv->tp_list, list) { if (!batadv_compare_eth(pos->other_end, dst)) continue; if (memcmp(pos->session, session, sizeof(pos->session)) != 0) continue; /* most of the time this function is invoked during the normal * process..it makes sense to pay more when the session is * finished and to speed the process up during the measurement */ if (unlikely(!kref_get_unless_zero(&pos->refcount))) continue; tp_vars = pos; break; } rcu_read_unlock(); return tp_vars; } /** * batadv_tp_vars_release() - release batadv_tp_vars from lists and queue for * free after rcu grace period * @ref: kref pointer of the batadv_tp_vars */ static void batadv_tp_vars_release(struct kref *ref) { struct batadv_tp_vars *tp_vars; struct batadv_tp_unacked *un, *safe; tp_vars = container_of(ref, struct batadv_tp_vars, refcount); /* lock should not be needed because this object is now out of any * context! 
*/ spin_lock_bh(&tp_vars->unacked_lock); list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { list_del(&un->list); kfree(un); } spin_unlock_bh(&tp_vars->unacked_lock); kfree_rcu(tp_vars, rcu); } /** * batadv_tp_vars_put() - decrement the batadv_tp_vars refcounter and possibly * release it * @tp_vars: the private data of the current TP meter session to be free'd */ static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars) { if (!tp_vars) return; kref_put(&tp_vars->refcount, batadv_tp_vars_release); } /** * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer * @bat_priv: the bat priv with all the mesh interface information * @tp_vars: the private data of the current TP meter session to cleanup */ static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv, struct batadv_tp_vars *tp_vars) { cancel_delayed_work(&tp_vars->finish_work); spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); hlist_del_rcu(&tp_vars->list); spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); /* drop list reference */ batadv_tp_vars_put(tp_vars); atomic_dec(&tp_vars->bat_priv->tp_num); /* kill the timer and remove its reference */ timer_delete_sync(&tp_vars->timer); /* the worker might have rearmed itself therefore we kill it again. 
Note * that if the worker should run again before invoking the following * timer_delete(), it would not re-arm itself once again because the status * is OFF now */ timer_delete(&tp_vars->timer); batadv_tp_vars_put(tp_vars); } /** * batadv_tp_sender_end() - print info about ended session and inform client * @bat_priv: the bat priv with all the mesh interface information * @tp_vars: the private data of the current TP meter session */ static void batadv_tp_sender_end(struct batadv_priv *bat_priv, struct batadv_tp_vars *tp_vars) { u32 session_cookie; batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Test towards %pM finished..shutting down (reason=%d)\n", tp_vars->other_end, tp_vars->reason); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Last timing stats: SRTT=%ums RTTVAR=%ums RTO=%ums\n", tp_vars->srtt >> 3, tp_vars->rttvar >> 2, tp_vars->rto); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Final values: cwnd=%u ss_threshold=%u\n", tp_vars->cwnd, tp_vars->ss_threshold); session_cookie = batadv_tp_session_cookie(tp_vars->session, tp_vars->icmp_uid); batadv_tp_batctl_notify(tp_vars->reason, tp_vars->other_end, bat_priv, tp_vars->start_time, atomic64_read(&tp_vars->tot_sent), session_cookie); } /** * batadv_tp_sender_shutdown() - let sender thread/timer stop gracefully * @tp_vars: the private data of the current TP meter session * @reason: reason for tp meter session stop */ static void batadv_tp_sender_shutdown(struct batadv_tp_vars *tp_vars, enum batadv_tp_meter_reason reason) { if (!atomic_dec_and_test(&tp_vars->sending)) return; tp_vars->reason = reason; } /** * batadv_tp_sender_finish() - stop sender session after test_length was reached * @work: delayed work reference of the related tp_vars */ static void batadv_tp_sender_finish(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_tp_vars *tp_vars; delayed_work = to_delayed_work(work); tp_vars = container_of(delayed_work, struct batadv_tp_vars, finish_work); batadv_tp_sender_shutdown(tp_vars, 
BATADV_TP_REASON_COMPLETE); } /** * batadv_tp_reset_sender_timer() - reschedule the sender timer * @tp_vars: the private TP meter data for this session * * Reschedule the timer using tp_vars->rto as delay */ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars) { /* most of the time this function is invoked while normal packet * reception... */ if (unlikely(atomic_read(&tp_vars->sending) == 0)) /* timer ref will be dropped in batadv_tp_sender_cleanup */ return; mod_timer(&tp_vars->timer, jiffies + msecs_to_jiffies(tp_vars->rto)); } /** * batadv_tp_sender_timeout() - timer that fires in case of packet loss * @t: address to timer_list inside tp_vars * * If fired it means that there was packet loss. * Switch to Slow Start, set the ss_threshold to half of the current cwnd and * reset the cwnd to 3*MSS */ static void batadv_tp_sender_timeout(struct timer_list *t) { struct batadv_tp_vars *tp_vars = timer_container_of(tp_vars, t, timer); struct batadv_priv *bat_priv = tp_vars->bat_priv; if (atomic_read(&tp_vars->sending) == 0) return; /* if the user waited long enough...shutdown the test */ if (unlikely(tp_vars->rto >= BATADV_TP_MAX_RTO)) { batadv_tp_sender_shutdown(tp_vars, BATADV_TP_REASON_DST_UNREACHABLE); return; } /* RTO exponential backoff * Details in Section 5.5 of RFC6298 */ tp_vars->rto <<= 1; spin_lock_bh(&tp_vars->cwnd_lock); tp_vars->ss_threshold = tp_vars->cwnd >> 1; if (tp_vars->ss_threshold < BATADV_TP_PLEN * 2) tp_vars->ss_threshold = BATADV_TP_PLEN * 2; batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: RTO fired during test towards %pM! cwnd=%u new ss_thr=%u, resetting last_sent to %u\n", tp_vars->other_end, tp_vars->cwnd, tp_vars->ss_threshold, atomic_read(&tp_vars->last_acked)); tp_vars->cwnd = BATADV_TP_PLEN * 3; spin_unlock_bh(&tp_vars->cwnd_lock); /* resend the non-ACKed packets.. 
*/ tp_vars->last_sent = atomic_read(&tp_vars->last_acked); wake_up(&tp_vars->more_bytes); batadv_tp_reset_sender_timer(tp_vars); } /** * batadv_tp_fill_prerandom() - Fill buffer with prefetched random bytes * @tp_vars: the private TP meter data for this session * @buf: Buffer to fill with bytes * @nbytes: amount of pseudorandom bytes */ static void batadv_tp_fill_prerandom(struct batadv_tp_vars *tp_vars, u8 *buf, size_t nbytes) { u32 local_offset; size_t bytes_inbuf; size_t to_copy; size_t pos = 0; spin_lock_bh(&tp_vars->prerandom_lock); local_offset = tp_vars->prerandom_offset; tp_vars->prerandom_offset += nbytes; tp_vars->prerandom_offset %= sizeof(batadv_tp_prerandom); spin_unlock_bh(&tp_vars->prerandom_lock); while (nbytes) { local_offset %= sizeof(batadv_tp_prerandom); bytes_inbuf = sizeof(batadv_tp_prerandom) - local_offset; to_copy = min(nbytes, bytes_inbuf); memcpy(&buf[pos], &batadv_tp_prerandom[local_offset], to_copy); pos += to_copy; nbytes -= to_copy; local_offset = 0; } } /** * batadv_tp_send_msg() - send a single message * @tp_vars: the private TP meter data for this session * @src: source mac address * @orig_node: the originator of the destination * @seqno: sequence number of this packet * @len: length of the entire packet * @session: session identifier * @uid: local ICMP "socket" index * @timestamp: timestamp in jiffies which is replied in ack * * Create and send a single TP Meter message. 
*
 * Return: 0 on success, BATADV_TP_REASON_CANT_SEND if the packet could not be
 * handed over for transmission, BATADV_TP_REASON_MEMORY_ERROR if the packet
 * couldn't be allocated
 */
static int batadv_tp_send_msg(struct batadv_tp_vars *tp_vars, const u8 *src,
			      struct batadv_orig_node *orig_node,
			      u32 seqno, size_t len, const u8 *session,
			      int uid, u32 timestamp)
{
	struct batadv_icmp_tp_packet *icmp;
	struct sk_buff *skb;
	int r;
	u8 *data;
	size_t data_len;

	skb = netdev_alloc_skb_ip_align(NULL, len + ETH_HLEN);
	if (unlikely(!skb))
		return BATADV_TP_REASON_MEMORY_ERROR;

	skb_reserve(skb, ETH_HLEN);
	icmp = skb_put(skb, sizeof(*icmp));

	/* fill the icmp header */
	ether_addr_copy(icmp->dst, orig_node->orig);
	ether_addr_copy(icmp->orig, src);
	icmp->version = BATADV_COMPAT_VERSION;
	icmp->packet_type = BATADV_ICMP;
	icmp->ttl = BATADV_TTL;
	icmp->msg_type = BATADV_TP;
	icmp->uid = uid;

	icmp->subtype = BATADV_TP_MSG;
	memcpy(icmp->session, session, sizeof(icmp->session));
	icmp->seqno = htonl(seqno);
	icmp->timestamp = htonl(timestamp);

	/* everything after the header is padding taken from the prerandom
	 * buffer; the caller has to pass a len of at least sizeof(*icmp)
	 */
	data_len = len - sizeof(*icmp);
	data = skb_put(skb, data_len);
	batadv_tp_fill_prerandom(tp_vars, data, data_len);

	r = batadv_send_skb_to_orig(skb, orig_node, NULL);
	if (r == NET_XMIT_SUCCESS)
		return 0;

	return BATADV_TP_REASON_CANT_SEND;
}

/**
 * batadv_tp_recv_ack() - ACK receiving function
 * @bat_priv: the bat priv with all the mesh interface information
 * @skb: the buffer containing the received packet
 *
 * Process a received TP ACK packet
 */
static void batadv_tp_recv_ack(struct batadv_priv *bat_priv,
			       const struct sk_buff *skb)
{
	struct batadv_hard_iface *primary_if = NULL;
	struct batadv_orig_node *orig_node = NULL;
	const struct batadv_icmp_tp_packet *icmp;
	struct batadv_tp_vars *tp_vars;
	const unsigned char *dev_addr;
	size_t packet_len, mss;
	u32 rtt, recv_ack, cwnd;

	packet_len = BATADV_TP_PLEN;
	mss = BATADV_TP_PLEN;
	packet_len += sizeof(struct batadv_unicast_packet);

	icmp = (struct batadv_icmp_tp_packet *)skb->data;

	/* find the tp_vars */
	tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig,
					      icmp->session);
	if (unlikely(!tp_vars))
		return;

	if (unlikely(atomic_read(&tp_vars->sending) == 0))
		goto out;

	/* old ACK? silently drop it.. */
	if (batadv_seq_before(ntohl(icmp->seqno),
			      (u32)atomic_read(&tp_vars->last_acked)))
		goto out;

	primary_if = batadv_primary_if_get_selected(bat_priv);
	if (unlikely(!primary_if))
		goto out;

	orig_node = batadv_orig_hash_find(bat_priv, icmp->orig);
	if (unlikely(!orig_node))
		goto out;

	/* update RTO with the new sampled RTT, if any */
	rtt = jiffies_to_msecs(jiffies) - ntohl(icmp->timestamp);
	if (icmp->timestamp && rtt)
		batadv_tp_update_rto(tp_vars, rtt);

	/* ACK for new data... reset the timer */
	batadv_tp_reset_sender_timer(tp_vars);

	recv_ack = ntohl(icmp->seqno);

	/* check if this ACK is a duplicate */
	if (atomic_read(&tp_vars->last_acked) == recv_ack) {
		atomic_inc(&tp_vars->dup_acks);
		if (atomic_read(&tp_vars->dup_acks) != 3)
			goto out;

		/* NOTE(review): plain ">=" comparison, unlike the
		 * batadv_seq_before() helper used for the other seqno checks
		 * in this function - confirm the intended behaviour across a
		 * seqno wrap-around
		 */
		if (recv_ack >= tp_vars->recover)
			goto out;

		/* if this is the third duplicate ACK do Fast Retransmit */
		batadv_tp_send_msg(tp_vars, primary_if->net_dev->dev_addr,
				   orig_node, recv_ack, packet_len,
				   icmp->session, icmp->uid,
				   jiffies_to_msecs(jiffies));

		spin_lock_bh(&tp_vars->cwnd_lock);

		/* Fast Recovery */
		tp_vars->fast_recovery = true;
		/* Set recover to the last outstanding seqno when Fast Recovery
		 * is entered. RFC6582, Section 3.2, step 1
		 */
		tp_vars->recover = tp_vars->last_sent;
		tp_vars->ss_threshold = tp_vars->cwnd >> 1;
		batadv_dbg(BATADV_DBG_TP_METER, bat_priv,
			   "Meter: Fast Recovery, (cur cwnd=%u) ss_thr=%u last_sent=%u recv_ack=%u\n",
			   tp_vars->cwnd, tp_vars->ss_threshold,
			   tp_vars->last_sent, recv_ack);
		tp_vars->cwnd = batadv_tp_cwnd(tp_vars->ss_threshold, 3 * mss,
					       mss);
		tp_vars->dec_cwnd = 0;
		tp_vars->last_sent = recv_ack;

		spin_unlock_bh(&tp_vars->cwnd_lock);
	} else {
		/* count the acked data */
		atomic64_add(recv_ack - atomic_read(&tp_vars->last_acked),
			     &tp_vars->tot_sent);
		/* reset the duplicate ACKs counter */
		atomic_set(&tp_vars->dup_acks, 0);

		if (tp_vars->fast_recovery) {
			/* partial ACK */
			if (batadv_seq_before(recv_ack, tp_vars->recover)) {
				/* this is another hole in the window. React
				 * immediately as specified by NewReno (see
				 * Section 3.2 of RFC6582 for details)
				 */
				dev_addr = primary_if->net_dev->dev_addr;
				batadv_tp_send_msg(tp_vars, dev_addr,
						   orig_node, recv_ack,
						   packet_len, icmp->session,
						   icmp->uid,
						   jiffies_to_msecs(jiffies));
				tp_vars->cwnd = batadv_tp_cwnd(tp_vars->cwnd,
							       mss, mss);
			} else {
				tp_vars->fast_recovery = false;
				/* set cwnd to the value of ss_threshold at the
				 * moment that Fast Recovery was entered.
				 * RFC6582, Section 3.2, step 3
				 */
				cwnd = batadv_tp_cwnd(tp_vars->ss_threshold, 0,
						      mss);
				tp_vars->cwnd = cwnd;
			}
			goto move_twnd;
		}

		/* only grow the window when at least one MSS was newly acked */
		if (recv_ack - atomic_read(&tp_vars->last_acked) >= mss)
			batadv_tp_update_cwnd(tp_vars, mss);
move_twnd:
		/* move the Transmit Window */
		atomic_set(&tp_vars->last_acked, recv_ack);
	}

	wake_up(&tp_vars->more_bytes);
out:
	batadv_hardif_put(primary_if);
	batadv_orig_node_put(orig_node);
	batadv_tp_vars_put(tp_vars);
}

/**
 * batadv_tp_avail() - check if congestion window is not full
 * @tp_vars: the private data of the current TP meter session
 * @payload_len: size of the payload of a single message
 *
 * Return: true when congestion window is not full, false otherwise
 */
static bool batadv_tp_avail(struct batadv_tp_vars *tp_vars,
			    size_t payload_len)
{
	u32 win_left, win_limit;

	/* the window ends cwnd bytes after the last acked seqno */
	win_limit = atomic_read(&tp_vars->last_acked) + tp_vars->cwnd;
	win_left = win_limit - tp_vars->last_sent;

	return win_left >= payload_len;
}

/**
 * batadv_tp_wait_available() - wait until congestion window becomes free or
 *  timeout is reached
 * @tp_vars: the private data of the current TP meter session
 * @plen: size of the payload of a single message
 *
 * Return: 0 if the condition evaluated to false after the timeout elapsed,
 *  1 if the condition evaluated to true after the timeout elapsed, the
 *  remaining jiffies (at least 1) if the condition evaluated to true before
 *  the timeout elapsed, or -ERESTARTSYS if it was interrupted by a signal.
*/ static int batadv_tp_wait_available(struct batadv_tp_vars *tp_vars, size_t plen) { int ret; ret = wait_event_interruptible_timeout(tp_vars->more_bytes, batadv_tp_avail(tp_vars, plen), HZ / 10); return ret; } /** * batadv_tp_send() - main sending thread of a tp meter session * @arg: address of the related tp_vars * * Return: nothing, this function never returns */ static int batadv_tp_send(void *arg) { struct batadv_tp_vars *tp_vars = arg; struct batadv_priv *bat_priv = tp_vars->bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_orig_node *orig_node = NULL; size_t payload_len, packet_len; int err = 0; if (unlikely(tp_vars->role != BATADV_TP_SENDER)) { err = BATADV_TP_REASON_DST_UNREACHABLE; tp_vars->reason = err; goto out; } orig_node = batadv_orig_hash_find(bat_priv, tp_vars->other_end); if (unlikely(!orig_node)) { err = BATADV_TP_REASON_DST_UNREACHABLE; tp_vars->reason = err; goto out; } primary_if = batadv_primary_if_get_selected(bat_priv); if (unlikely(!primary_if)) { err = BATADV_TP_REASON_DST_UNREACHABLE; tp_vars->reason = err; goto out; } /* assume that all the hard_interfaces have a correctly * configured MTU, so use the mesh_iface MTU as MSS. * This might not be true and in that case the fragmentation * should be used. 
* Now, try to send the packet as it is */ payload_len = BATADV_TP_PLEN; BUILD_BUG_ON(sizeof(struct batadv_icmp_tp_packet) > BATADV_TP_PLEN); batadv_tp_reset_sender_timer(tp_vars); /* queue the worker in charge of terminating the test */ queue_delayed_work(batadv_event_workqueue, &tp_vars->finish_work, msecs_to_jiffies(tp_vars->test_length)); while (atomic_read(&tp_vars->sending) != 0) { if (unlikely(!batadv_tp_avail(tp_vars, payload_len))) { batadv_tp_wait_available(tp_vars, payload_len); continue; } /* to emulate normal unicast traffic, add to the payload len * the size of the unicast header */ packet_len = payload_len + sizeof(struct batadv_unicast_packet); err = batadv_tp_send_msg(tp_vars, primary_if->net_dev->dev_addr, orig_node, tp_vars->last_sent, packet_len, tp_vars->session, tp_vars->icmp_uid, jiffies_to_msecs(jiffies)); /* something went wrong during the preparation/transmission */ if (unlikely(err && err != BATADV_TP_REASON_CANT_SEND)) { batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: %s() cannot send packets (%d)\n", __func__, err); /* ensure nobody else tries to stop the thread now */ if (atomic_dec_and_test(&tp_vars->sending)) tp_vars->reason = err; break; } /* right-shift the TWND */ if (!err) tp_vars->last_sent += payload_len; cond_resched(); } out: batadv_hardif_put(primary_if); batadv_orig_node_put(orig_node); batadv_tp_sender_end(bat_priv, tp_vars); batadv_tp_sender_cleanup(bat_priv, tp_vars); batadv_tp_vars_put(tp_vars); return 0; } /** * batadv_tp_start_kthread() - start new thread which manages the tp meter * sender * @tp_vars: the private data of the current TP meter session */ static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars) { struct task_struct *kthread; struct batadv_priv *bat_priv = tp_vars->bat_priv; u32 session_cookie; kref_get(&tp_vars->refcount); kthread = kthread_create(batadv_tp_send, tp_vars, "kbatadv_tp_meter"); if (IS_ERR(kthread)) { session_cookie = batadv_tp_session_cookie(tp_vars->session, 
tp_vars->icmp_uid); pr_err("batadv: cannot create tp meter kthread\n"); batadv_tp_batctl_error_notify(BATADV_TP_REASON_MEMORY_ERROR, tp_vars->other_end, bat_priv, session_cookie); /* drop reserved reference for kthread */ batadv_tp_vars_put(tp_vars); /* cleanup of failed tp meter variables */ batadv_tp_sender_cleanup(bat_priv, tp_vars); return; } wake_up_process(kthread); } /** * batadv_tp_start() - start a new tp meter session * @bat_priv: the bat priv with all the mesh interface information * @dst: the receiver MAC address * @test_length: test length in milliseconds * @cookie: session cookie */ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, u32 test_length, u32 *cookie) { struct batadv_tp_vars *tp_vars; u8 session_id[2]; u8 icmp_uid; u32 session_cookie; get_random_bytes(session_id, sizeof(session_id)); get_random_bytes(&icmp_uid, 1); session_cookie = batadv_tp_session_cookie(session_id, icmp_uid); *cookie = session_cookie; /* look for an already existing test towards this node */ spin_lock_bh(&bat_priv->tp_list_lock); tp_vars = batadv_tp_list_find(bat_priv, dst); if (tp_vars) { spin_unlock_bh(&bat_priv->tp_list_lock); batadv_tp_vars_put(tp_vars); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: test to or from the same node already ongoing, aborting\n"); batadv_tp_batctl_error_notify(BATADV_TP_REASON_ALREADY_ONGOING, dst, bat_priv, session_cookie); return; } if (!atomic_add_unless(&bat_priv->tp_num, 1, BATADV_TP_MAX_NUM)) { spin_unlock_bh(&bat_priv->tp_list_lock); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: too many ongoing sessions, aborting (SEND)\n"); batadv_tp_batctl_error_notify(BATADV_TP_REASON_TOO_MANY, dst, bat_priv, session_cookie); return; } tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC); if (!tp_vars) { spin_unlock_bh(&bat_priv->tp_list_lock); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: %s cannot allocate list elements\n", __func__); batadv_tp_batctl_error_notify(BATADV_TP_REASON_MEMORY_ERROR, dst, bat_priv, 
session_cookie); return; } /* initialize tp_vars */ ether_addr_copy(tp_vars->other_end, dst); kref_init(&tp_vars->refcount); tp_vars->role = BATADV_TP_SENDER; atomic_set(&tp_vars->sending, 1); memcpy(tp_vars->session, session_id, sizeof(session_id)); tp_vars->icmp_uid = icmp_uid; tp_vars->last_sent = BATADV_TP_FIRST_SEQ; atomic_set(&tp_vars->last_acked, BATADV_TP_FIRST_SEQ); tp_vars->fast_recovery = false; tp_vars->recover = BATADV_TP_FIRST_SEQ; /* initialise the CWND to 3*MSS (Section 3.1 in RFC5681). * For batman-adv the MSS is the size of the payload received by the * mesh_interface, hence its MTU */ tp_vars->cwnd = BATADV_TP_PLEN * 3; /* at the beginning initialise the SS threshold to the biggest possible * window size, hence the AWND size */ tp_vars->ss_threshold = BATADV_TP_AWND; /* RTO initial value is 3 seconds. * Details in Section 2.1 of RFC6298 */ tp_vars->rto = 1000; tp_vars->srtt = 0; tp_vars->rttvar = 0; atomic64_set(&tp_vars->tot_sent, 0); kref_get(&tp_vars->refcount); timer_setup(&tp_vars->timer, batadv_tp_sender_timeout, 0); tp_vars->bat_priv = bat_priv; tp_vars->start_time = jiffies; init_waitqueue_head(&tp_vars->more_bytes); spin_lock_init(&tp_vars->unacked_lock); INIT_LIST_HEAD(&tp_vars->unacked_list); spin_lock_init(&tp_vars->cwnd_lock); tp_vars->prerandom_offset = 0; spin_lock_init(&tp_vars->prerandom_lock); kref_get(&tp_vars->refcount); hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list); spin_unlock_bh(&bat_priv->tp_list_lock); tp_vars->test_length = test_length; if (!tp_vars->test_length) tp_vars->test_length = BATADV_TP_DEF_TEST_LENGTH; batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: starting throughput meter towards %pM (length=%ums)\n", dst, test_length); /* init work item for finished tp tests */ INIT_DELAYED_WORK(&tp_vars->finish_work, batadv_tp_sender_finish); /* start tp kthread. 
This way the write() call issued from userspace can * happily return and avoid to block */ batadv_tp_start_kthread(tp_vars); /* don't return reference to new tp_vars */ batadv_tp_vars_put(tp_vars); } /** * batadv_tp_stop() - stop currently running tp meter session * @bat_priv: the bat priv with all the mesh interface information * @dst: the receiver MAC address * @return_value: reason for tp meter session stop */ void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst, u8 return_value) { struct batadv_orig_node *orig_node; struct batadv_tp_vars *tp_vars; batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: stopping test towards %pM\n", dst); orig_node = batadv_orig_hash_find(bat_priv, dst); if (!orig_node) return; tp_vars = batadv_tp_list_find(bat_priv, orig_node->orig); if (!tp_vars) { batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: trying to interrupt an already over connection\n"); goto out; } batadv_tp_sender_shutdown(tp_vars, return_value); batadv_tp_vars_put(tp_vars); out: batadv_orig_node_put(orig_node); } /** * batadv_tp_reset_receiver_timer() - reset the receiver shutdown timer * @tp_vars: the private data of the current TP meter session * * start the receiver shutdown timer or reset it if already started */ static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars) { mod_timer(&tp_vars->timer, jiffies + msecs_to_jiffies(BATADV_TP_RECV_TIMEOUT)); } /** * batadv_tp_receiver_shutdown() - stop a tp meter receiver when timeout is * reached without received ack * @t: address to timer_list inside tp_vars */ static void batadv_tp_receiver_shutdown(struct timer_list *t) { struct batadv_tp_vars *tp_vars = timer_container_of(tp_vars, t, timer); struct batadv_tp_unacked *un, *safe; struct batadv_priv *bat_priv; bat_priv = tp_vars->bat_priv; /* if there is recent activity rearm the timer */ if (!batadv_has_timed_out(tp_vars->last_recv_time, BATADV_TP_RECV_TIMEOUT)) { /* reset the receiver shutdown timer */ 
batadv_tp_reset_receiver_timer(tp_vars); return; } batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Shutting down for inactivity (more than %dms) from %pM\n", BATADV_TP_RECV_TIMEOUT, tp_vars->other_end); spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); hlist_del_rcu(&tp_vars->list); spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); /* drop list reference */ batadv_tp_vars_put(tp_vars); atomic_dec(&bat_priv->tp_num); spin_lock_bh(&tp_vars->unacked_lock); list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { list_del(&un->list); kfree(un); } spin_unlock_bh(&tp_vars->unacked_lock); /* drop reference of timer */ batadv_tp_vars_put(tp_vars); } /** * batadv_tp_send_ack() - send an ACK packet * @bat_priv: the bat priv with all the mesh interface information * @dst: the mac address of the destination originator * @seq: the sequence number to ACK * @timestamp: the timestamp to echo back in the ACK * @session: session identifier * @socket_index: local ICMP socket identifier * * Return: 0 on success, a positive integer representing the reason of the * failure otherwise */ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst, u32 seq, __be32 timestamp, const u8 *session, int socket_index) { struct batadv_hard_iface *primary_if = NULL; struct batadv_orig_node *orig_node; struct batadv_icmp_tp_packet *icmp; struct sk_buff *skb; int r, ret; orig_node = batadv_orig_hash_find(bat_priv, dst); if (unlikely(!orig_node)) { ret = BATADV_TP_REASON_DST_UNREACHABLE; goto out; } primary_if = batadv_primary_if_get_selected(bat_priv); if (unlikely(!primary_if)) { ret = BATADV_TP_REASON_DST_UNREACHABLE; goto out; } skb = netdev_alloc_skb_ip_align(NULL, sizeof(*icmp) + ETH_HLEN); if (unlikely(!skb)) { ret = BATADV_TP_REASON_MEMORY_ERROR; goto out; } skb_reserve(skb, ETH_HLEN); icmp = skb_put(skb, sizeof(*icmp)); icmp->packet_type = BATADV_ICMP; icmp->version = BATADV_COMPAT_VERSION; icmp->ttl = BATADV_TTL; icmp->msg_type = BATADV_TP; ether_addr_copy(icmp->dst, 
orig_node->orig); ether_addr_copy(icmp->orig, primary_if->net_dev->dev_addr); icmp->uid = socket_index; icmp->subtype = BATADV_TP_ACK; memcpy(icmp->session, session, sizeof(icmp->session)); icmp->seqno = htonl(seq); icmp->timestamp = timestamp; /* send the ack */ r = batadv_send_skb_to_orig(skb, orig_node, NULL); if (unlikely(r < 0) || r == NET_XMIT_DROP) { ret = BATADV_TP_REASON_DST_UNREACHABLE; goto out; } ret = 0; out: batadv_orig_node_put(orig_node); batadv_hardif_put(primary_if); return ret; } /** * batadv_tp_handle_out_of_order() - store an out of order packet * @tp_vars: the private data of the current TP meter session * @skb: the buffer containing the received packet * * Store the out of order packet in the unacked list for late processing. This * packets are kept in this list so that they can be ACKed at once as soon as * all the previous packets have been received * * Return: true if the packed has been successfully processed, false otherwise */ static bool batadv_tp_handle_out_of_order(struct batadv_tp_vars *tp_vars, const struct sk_buff *skb) { const struct batadv_icmp_tp_packet *icmp; struct batadv_tp_unacked *un, *new; u32 payload_len; bool added = false; new = kmalloc_obj(*new, GFP_ATOMIC); if (unlikely(!new)) return false; icmp = (struct batadv_icmp_tp_packet *)skb->data; new->seqno = ntohl(icmp->seqno); payload_len = skb->len - sizeof(struct batadv_unicast_packet); new->len = payload_len; spin_lock_bh(&tp_vars->unacked_lock); /* if the list is empty immediately attach this new object */ if (list_empty(&tp_vars->unacked_list)) { list_add(&new->list, &tp_vars->unacked_list); goto out; } /* otherwise loop over the list and either drop the packet because this * is a duplicate or store it at the right position. * * The iteration is done in the reverse way because it is likely that * the last received packet (the one being processed now) has a bigger * seqno than all the others already stored. 
*/ list_for_each_entry_reverse(un, &tp_vars->unacked_list, list) { /* check for duplicates */ if (new->seqno == un->seqno) { if (new->len > un->len) un->len = new->len; kfree(new); added = true; break; } /* look for the right position */ if (batadv_seq_before(new->seqno, un->seqno)) continue; /* as soon as an entry having a bigger seqno is found, the new * one is attached _after_ it. In this way the list is kept in * ascending order */ list_add_tail(&new->list, &un->list); added = true; break; } /* received packet with smallest seqno out of order; add it to front */ if (!added) list_add(&new->list, &tp_vars->unacked_list); out: spin_unlock_bh(&tp_vars->unacked_lock); return true; } /** * batadv_tp_ack_unordered() - update number received bytes in current stream * without gaps * @tp_vars: the private data of the current TP meter session */ static void batadv_tp_ack_unordered(struct batadv_tp_vars *tp_vars) { struct batadv_tp_unacked *un, *safe; u32 to_ack; /* go through the unacked packet list and possibly ACK them as * well */ spin_lock_bh(&tp_vars->unacked_lock); list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { /* the list is ordered, therefore it is possible to stop as soon * there is a gap between the last acked seqno and the seqno of * the packet under inspection */ if (batadv_seq_before(tp_vars->last_recv, un->seqno)) break; to_ack = un->seqno + un->len - tp_vars->last_recv; if (batadv_seq_before(tp_vars->last_recv, un->seqno + un->len)) tp_vars->last_recv += to_ack; list_del(&un->list); kfree(un); } spin_unlock_bh(&tp_vars->unacked_lock); } /** * batadv_tp_init_recv() - return matching or create new receiver tp_vars * @bat_priv: the bat priv with all the mesh interface information * @icmp: received icmp tp msg * * Return: corresponding tp_vars or NULL on errors */ static struct batadv_tp_vars * batadv_tp_init_recv(struct batadv_priv *bat_priv, const struct batadv_icmp_tp_packet *icmp) { struct batadv_tp_vars *tp_vars; 
spin_lock_bh(&bat_priv->tp_list_lock); tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, icmp->session); if (tp_vars) goto out_unlock; if (!atomic_add_unless(&bat_priv->tp_num, 1, BATADV_TP_MAX_NUM)) { batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: too many ongoing sessions, aborting (RECV)\n"); goto out_unlock; } tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC); if (!tp_vars) goto out_unlock; ether_addr_copy(tp_vars->other_end, icmp->orig); tp_vars->role = BATADV_TP_RECEIVER; memcpy(tp_vars->session, icmp->session, sizeof(tp_vars->session)); tp_vars->last_recv = BATADV_TP_FIRST_SEQ; tp_vars->bat_priv = bat_priv; kref_init(&tp_vars->refcount); spin_lock_init(&tp_vars->unacked_lock); INIT_LIST_HEAD(&tp_vars->unacked_list); kref_get(&tp_vars->refcount); hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list); kref_get(&tp_vars->refcount); timer_setup(&tp_vars->timer, batadv_tp_receiver_shutdown, 0); batadv_tp_reset_receiver_timer(tp_vars); out_unlock: spin_unlock_bh(&bat_priv->tp_list_lock); return tp_vars; } /** * batadv_tp_recv_msg() - process a single data message * @bat_priv: the bat priv with all the mesh interface information * @skb: the buffer containing the received packet * * Process a received TP MSG packet */ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv, const struct sk_buff *skb) { const struct batadv_icmp_tp_packet *icmp; struct batadv_tp_vars *tp_vars; size_t packet_size; u32 seqno; icmp = (struct batadv_icmp_tp_packet *)skb->data; seqno = ntohl(icmp->seqno); /* check if this is the first seqno. This means that if the * first packet is lost, the tp meter does not work anymore! 
*/ if (seqno == BATADV_TP_FIRST_SEQ) { tp_vars = batadv_tp_init_recv(bat_priv, icmp); if (!tp_vars) { batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: seqno != BATADV_TP_FIRST_SEQ cannot initiate connection\n"); goto out; } } else { tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, icmp->session); if (!tp_vars) { batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Unexpected packet from %pM!\n", icmp->orig); goto out; } } if (unlikely(tp_vars->role != BATADV_TP_RECEIVER)) { batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: dropping packet: not expected (role=%u)\n", tp_vars->role); goto out; } tp_vars->last_recv_time = jiffies; /* if the packet is a duplicate, it may be the case that an ACK has been * lost. Resend the ACK */ if (batadv_seq_before(seqno, tp_vars->last_recv)) goto send_ack; /* if the packet is out of order enqueue it */ if (ntohl(icmp->seqno) != tp_vars->last_recv) { /* exit immediately (and do not send any ACK) if the packet has * not been enqueued correctly */ if (!batadv_tp_handle_out_of_order(tp_vars, skb)) goto out; /* send a duplicate ACK */ goto send_ack; } /* if everything was fine count the ACKed bytes */ packet_size = skb->len - sizeof(struct batadv_unicast_packet); tp_vars->last_recv += packet_size; /* check if this ordered message filled a gap.... */ batadv_tp_ack_unordered(tp_vars); send_ack: /* send the ACK. 
If the received packet was out of order, the ACK that * is going to be sent is a duplicate (the sender will count them and * possibly enter Fast Retransmit as soon as it has reached 3) */ batadv_tp_send_ack(bat_priv, icmp->orig, tp_vars->last_recv, icmp->timestamp, icmp->session, icmp->uid); out: batadv_tp_vars_put(tp_vars); } /** * batadv_tp_meter_recv() - main TP Meter receiving function * @bat_priv: the bat priv with all the mesh interface information * @skb: the buffer containing the received packet */ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_icmp_tp_packet *icmp; icmp = (struct batadv_icmp_tp_packet *)skb->data; switch (icmp->subtype) { case BATADV_TP_MSG: batadv_tp_recv_msg(bat_priv, skb); break; case BATADV_TP_ACK: batadv_tp_recv_ack(bat_priv, skb); break; default: batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Received unknown TP Metric packet type %u\n", icmp->subtype); } consume_skb(skb); } /** * batadv_tp_meter_init() - initialize global tp_meter structures */ void __init batadv_tp_meter_init(void) { get_random_bytes(batadv_tp_prerandom, sizeof(batadv_tp_prerandom)); }
116 115 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 // SPDX-License-Identifier: GPL-2.0-only #include <linux/types.h> #include <linux/netfilter.h> #include <net/tcp.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_seqadj.h> int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_conn_seqadj *seqadj; struct nf_ct_seqadj *this_way; if (off == 0) return 0; set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); seqadj = nfct_seqadj(ct); this_way = &seqadj->seq[dir]; this_way->offset_before = off; this_way->offset_after = off; return 0; } EXPORT_SYMBOL_GPL(nf_ct_seqadj_init); int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, __be32 seq, s32 off) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_ct_seqadj *this_way; if (off == 0) return 0; if (unlikely(!seqadj)) { WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n"); return 0; } set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); spin_lock_bh(&ct->lock); this_way = &seqadj->seq[dir]; if (this_way->offset_before == 
this_way->offset_after || before(this_way->correction_pos, ntohl(seq))) { this_way->correction_pos = ntohl(seq); this_way->offset_before = this_way->offset_after; this_way->offset_after += off; } spin_unlock_bh(&ct->lock); return 0; } EXPORT_SYMBOL_GPL(nf_ct_seqadj_set); void nf_ct_tcp_seqadj_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off) { const struct tcphdr *th; if (nf_ct_protonum(ct) != IPPROTO_TCP) return; th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); nf_ct_seqadj_set(ct, ctinfo, th->seq, off); } EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set); /* Adjust one found SACK option including checksum correction */ static void nf_ct_sack_block_adjust(struct sk_buff *skb, struct tcphdr *tcph, unsigned int sackoff, unsigned int sackend, struct nf_ct_seqadj *seq) { while (sackoff < sackend) { struct tcp_sack_block_wire *sack; __be32 new_start_seq, new_end_seq; sack = (void *)skb->data + sackoff; if (after(ntohl(sack->start_seq) - seq->offset_before, seq->correction_pos)) new_start_seq = htonl(ntohl(sack->start_seq) - seq->offset_after); else new_start_seq = htonl(ntohl(sack->start_seq) - seq->offset_before); if (after(ntohl(sack->end_seq) - seq->offset_before, seq->correction_pos)) new_end_seq = htonl(ntohl(sack->end_seq) - seq->offset_after); else new_end_seq = htonl(ntohl(sack->end_seq) - seq->offset_before); pr_debug("sack_adjust: start_seq: %u->%u, end_seq: %u->%u\n", ntohl(sack->start_seq), ntohl(new_start_seq), ntohl(sack->end_seq), ntohl(new_end_seq)); inet_proto_csum_replace4(&tcph->check, skb, sack->start_seq, new_start_seq, false); inet_proto_csum_replace4(&tcph->check, skb, sack->end_seq, new_end_seq, false); sack->start_seq = new_start_seq; sack->end_seq = new_end_seq; sackoff += sizeof(*sack); } } /* TCP SACK sequence number adjustment */ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { struct tcphdr *tcph = (void 
*)skb->data + protoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); unsigned int dir, optoff, optend; optoff = protoff + sizeof(struct tcphdr); optend = protoff + tcph->doff * 4; if (skb_ensure_writable(skb, optend)) return 0; tcph = (void *)skb->data + protoff; dir = CTINFO2DIR(ctinfo); while (optoff < optend) { /* Usually: option, length. */ unsigned char *op = skb->data + optoff; switch (op[0]) { case TCPOPT_EOL: return 1; case TCPOPT_NOP: optoff++; continue; default: /* no partial options */ if (optoff + 1 == optend || optoff + op[1] > optend || op[1] < 2) return 0; if (op[0] == TCPOPT_SACK && op[1] >= 2+TCPOLEN_SACK_PERBLOCK && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) nf_ct_sack_block_adjust(skb, tcph, optoff + 2, optoff+op[1], &seqadj->seq[!dir]); optoff += op[1]; } } return 1; } /* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct tcphdr *tcph; __be32 newseq, newack; s32 seqoff, ackoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way, *other_way; int res = 1; this_way = &seqadj->seq[dir]; other_way = &seqadj->seq[!dir]; if (skb_ensure_writable(skb, protoff + sizeof(*tcph))) return 0; tcph = (void *)skb->data + protoff; spin_lock_bh(&ct->lock); if (after(ntohl(tcph->seq), this_way->correction_pos)) seqoff = this_way->offset_after; else seqoff = this_way->offset_before; newseq = htonl(ntohl(tcph->seq) + seqoff); inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); pr_debug("Adjusting sequence number from %u->%u\n", ntohl(tcph->seq), ntohl(newseq)); tcph->seq = newseq; if (!tcph->ack) goto out; if (after(ntohl(tcph->ack_seq) - other_way->offset_before, other_way->correction_pos)) ackoff = other_way->offset_after; else ackoff = other_way->offset_before; newack = htonl(ntohl(tcph->ack_seq) - ackoff); 
inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, false); pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), ntohl(newack)); tcph->ack_seq = newack; res = nf_ct_sack_adjust(skb, protoff, ct, ctinfo); out: spin_unlock_bh(&ct->lock); return res; } EXPORT_SYMBOL_GPL(nf_ct_seq_adjust); s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way; if (!seqadj) return 0; this_way = &seqadj->seq[dir]; return after(seq, this_way->correction_pos) ? this_way->offset_after : this_way->offset_before; } EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
137 1802 1798 1803 76 3557 3563 3565 3555 3569 3561 3066 2799 283 1542 1548 1644 1452 2841 256 3060 3064 3060 23 23 1704 1703 1704 1676 12 11 465 1264 1625 77 1704 1703 1690 1693 372 504 136 381 38119 38437 355 1351 1349 196 197 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 
470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/file_table.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/eventpoll.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
#include <linux/capability.h>
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/task_work.h>
#include <linux/swap.h>
#include <linux/kmemleak.h>

#include <linux/atomic.h>

#include "internal.h"

/* sysctl tunables... */
static struct files_stat_struct files_stat = {
	.max_files = NR_FILE
};

/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __ro_after_init;
static struct kmem_cache *bfilp_cachep __ro_after_init;

/* Number of accounted struct file objects (see FMODE_NOACCOUNT) */
static struct percpu_counter nr_files __cacheline_aligned_in_smp;

/* Container for backing file with optional user path */
struct backing_file {
	struct file file;
	union {
		struct path user_path;
		freeptr_t bf_freeptr;
	};
};

/* Map a struct file embedded in a struct backing_file to its container */
#define backing_file(f) container_of(f, struct backing_file, file)

/* Return the user path stored in the backing_file container of @f */
const struct path *backing_file_user_path(const struct file *f)
{
	return &backing_file(f)->user_path;
}
EXPORT_SYMBOL_GPL(backing_file_user_path);

/* Store a copy of @path as the user path of the backing file @f */
void backing_file_set_user_path(struct file *f, const struct path *path)
{
	backing_file(f)->user_path = *path;
}
EXPORT_SYMBOL_GPL(backing_file_set_user_path);

/*
 * Final release of a struct file: free the security blob, un-account the
 * file unless it was allocated with FMODE_NOACCOUNT, drop the credential
 * reference and return the object to the cache it was allocated from.
 * Backing files also drop their user path first.
 */
static inline void file_free(struct file *f)
{
	security_file_free(f);
	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
		percpu_counter_dec(&nr_files);
	put_cred(f->f_cred);
	if (unlikely(f->f_mode & FMODE_BACKING)) {
		path_put(backing_file_user_path(f));
		kmem_cache_free(bfilp_cachep, backing_file(f));
	} else {
		kmem_cache_free(filp_cachep, f);
	}
}

/*
 * Return the total number of open files in the system
 */
static long get_nr_files(void)
{
	return percpu_counter_read_positive(&nr_files);
}

/*
 * Return the maximum number of open files in the system
 */
unsigned long get_max_files(void)
{
	return files_stat.max_files;
}
EXPORT_SYMBOL_GPL(get_max_files);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * Handle nr_files sysctl
 *
 * Refreshes files_stat.nr_files from the exact per-cpu counter sum before
 * handing the request to the generic unsigned long vector handler.
 */
static int proc_nr_files(const struct ctl_table *table, int write, void *buffer,
			 size_t *lenp, loff_t *ppos)
{
	files_stat.nr_files = percpu_counter_sum_positive(&nr_files);
	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/* Entries registered under the "fs" sysctl directory */
static const struct ctl_table fs_stat_sysctls[] = {
	{
		.procname	= "file-nr",
		.data		= &files_stat,
		.maxlen		= sizeof(files_stat),
		.mode		= 0444,
		.proc_handler	= proc_nr_files,
	},
	{
		.procname	= "file-max",
		.data		= &files_stat.max_files,
		.maxlen		= sizeof(files_stat.max_files),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= SYSCTL_LONG_ZERO,
		.extra2		= SYSCTL_LONG_MAX,
	},
	{
		.procname	= "nr_open",
		.data		= &sysctl_nr_open,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
		.extra1		= &sysctl_nr_open_min,
		.extra2		= &sysctl_nr_open_max,
	},
};

/*
 * Register the table above and, when binfmt_misc is enabled, the
 * "fs/binfmt_misc" sysctl mount point. The returned header is never
 * released, so it is marked for kmemleak as not being a leak.
 */
static int __init init_fs_stat_sysctls(void)
{
	register_sysctl_init("fs", fs_stat_sysctls);
	if (IS_ENABLED(CONFIG_BINFMT_MISC)) {
		struct ctl_table_header *hdr;

		hdr = register_sysctl_mount_point("fs/binfmt_misc");
		kmemleak_not_leak(hdr);
	}
	return 0;
}
fs_initcall(init_fs_stat_sysctls);
#endif

/*
 * Initialize a freshly allocated struct file.
 *
 * Takes a reference on @cred, allocates the security blob and resets every
 * field the rest of this file relies on. On failure the credential reference
 * is dropped again and the error from security_file_alloc() is returned; the
 * caller remains responsible for freeing @f.
 */
static int init_file(struct file *f, int flags, const struct cred *cred)
{
	int error;

	f->f_cred = get_cred(cred);
	error = security_file_alloc(f);
	if (unlikely(error)) {
		put_cred(f->f_cred);
		return error;
	}

	spin_lock_init(&f->f_lock);
	/*
	 * Note that f_pos_lock is only used for files raising
	 * FMODE_ATOMIC_POS and directories. Other files such as pipes
	 * don't need it and since f_pos_lock is in a union may reuse
	 * the space for other purposes. They are expected to initialize
	 * the respective member when opening the file.
	 */
	mutex_init(&f->f_pos_lock);
	memset(&f->__f_path, 0, sizeof(f->f_path));
	memset(&f->f_ra, 0, sizeof(f->f_ra));

	f->f_flags	= flags;
	f->f_mode	= OPEN_FMODE(flags);
	/*
	 * Disable permission and pre-content events for all files by default.
	 * They may be enabled later by fsnotify_open_perm_and_set_mode().
	 */
	file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM);

	f->f_op		= NULL;
	f->f_mapping	= NULL;
	f->private_data = NULL;
	f->f_inode	= NULL;
	f->f_owner	= NULL;
#ifdef CONFIG_EPOLL
	f->f_ep		= NULL;
#endif

	f->f_iocb_flags = 0;
	f->f_pos	= 0;
	f->f_wb_err	= 0;
	f->f_sb_err	= 0;

	/*
	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_ref last. While
	 * fget-rcu pattern users need to be able to handle spurious
	 * refcount bumps we should reinitialize the reused file first.
	 */
	file_ref_init(&f->f_ref, 1);
	return 0;
}

/* Find an unused file structure and return a pointer to it.
 * Returns an error pointer if some error happened e.g. we over file
 * structures limit, run out of memory or operation is not permitted.
 *
 * Be very careful using this.  You are responsible for
 * getting write access to any mount that you might assign
 * to this filp, if it is opened for write.  If this is not
 * done, you will imbalance in the mount's writer count
 * and a warning at __fput() time.
 */
struct file *alloc_empty_file(int flags, const struct cred *cred)
{
	/* highest file count for which the limit message was already printed */
	static long old_max;
	struct file *f;
	int error;

	/*
	 * Privileged users can go above max_files
	 */
	if (unlikely(get_nr_files() >= files_stat.max_files) &&
	    !capable(CAP_SYS_ADMIN)) {
		/*
		 * percpu_counters are inaccurate.  Do an expensive check before
		 * we go and fail.
		 */
		if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
			goto over;
	}

	f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
	if (unlikely(!f))
		return ERR_PTR(-ENOMEM);

	error = init_file(f, flags, cred);
	if (unlikely(error)) {
		kmem_cache_free(filp_cachep, f);
		return ERR_PTR(error);
	}

	percpu_counter_inc(&nr_files);

	return f;

over:
	/* Ran out of filps - report that */
	if (get_nr_files() > old_max) {
		pr_info("VFS: file-max limit %lu reached\n", get_max_files());
		old_max = get_nr_files();
	}
	return ERR_PTR(-ENFILE);
}

/*
 * Variant of alloc_empty_file() that doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocated file must not be
 * installed into file tables or such.
 */
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
	struct file *f;
	int error;

	f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
	if (unlikely(!f))
		return ERR_PTR(-ENOMEM);

	error = init_file(f, flags, cred);
	if (unlikely(error)) {
		kmem_cache_free(filp_cachep, f);
		return ERR_PTR(error);
	}

	/* tells file_free() not to decrement nr_files for this file */
	f->f_mode |= FMODE_NOACCOUNT;

	return f;
}

/*
 * Variant of alloc_empty_file() that allocates a backing_file container
 * and doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocated file must not be
 * installed into file tables or such.
 */
struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
{
	struct backing_file *ff;
	int error;

	ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
	if (unlikely(!ff))
		return ERR_PTR(-ENOMEM);

	error = init_file(&ff->file, flags, cred);
	if (unlikely(error)) {
		kmem_cache_free(bfilp_cachep, ff);
		return ERR_PTR(error);
	}

	/* FMODE_BACKING routes file_free() to the bfilp cache */
	ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
	return &ff->file;
}
EXPORT_SYMBOL_GPL(alloc_empty_backing_file);

/**
 * file_init_path - initialize a 'struct file' based on path
 *
 * @file: the file to set up
 * @path: the (dentry, vfsmount) pair for the new file
 * @fop: the 'struct file_operations' for the new file
 *
 * Copies @path into @file without taking additional references, derives the
 * inode, mapping and error-sampling state from it, sets the FMODE_LSEEK /
 * FMODE_CAN_READ / FMODE_CAN_WRITE capability bits from the methods present
 * in @fop and marks the file FMODE_OPENED.
 */
static void file_init_path(struct file *file, const struct path *path,
			   const struct file_operations *fop)
{
	file->__f_path = *path;
	file->f_inode = path->dentry->d_inode;
	file->f_mapping = path->dentry->d_inode->i_mapping;
	file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
	file->f_sb_err = file_sample_sb_err(file);
	if (fop->llseek)
		file->f_mode |= FMODE_LSEEK;
	if ((file->f_mode & FMODE_READ) &&
	     likely(fop->read || fop->read_iter))
		file->f_mode |= FMODE_CAN_READ;
	if ((file->f_mode & FMODE_WRITE) &&
	     likely(fop->write || fop->write_iter))
		file->f_mode |= FMODE_CAN_WRITE;
	file->f_iocb_flags = iocb_flags(file);
	file->f_mode |= FMODE_OPENED;
	file->f_op = fop;
	/* read-only opens are counted on the inode */
	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
		i_readcount_inc(path->dentry->d_inode);
}

/**
 * alloc_file - allocate and initialize a 'struct file'
 *
 * @path: the (dentry, vfsmount) pair for the new file
 * @flags: O_... flags with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 *
 * The file is allocated with the credentials of the current task. On failure
 * the error pointer from alloc_empty_file() is returned and @path is left
 * untouched.
 */
static struct file *alloc_file(const struct path *path, int flags,
		const struct file_operations *fop)
{
	struct file *file;

	file = alloc_empty_file(flags, current_cred());
	if (!IS_ERR(file))
		file_init_path(file, path, fop);
	return file;
}

/*
 * Build a path for a pseudo file: allocate a pseudo dentry named @name on the
 * superblock of @mnt, take a reference on @mnt and attach @inode to the
 * dentry. Returns 0 on success or -ENOMEM when the dentry cannot be allocated.
 */
static inline int alloc_path_pseudo(const char *name, struct inode *inode,
				    struct vfsmount *mnt, struct path *path)
{
	path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name));
	if (!path->dentry)
		return -ENOMEM;
	path->mnt = mntget(mnt);
	d_instantiate(path->dentry, inode);
	return 0;
}

/*
 * Allocate an accounted struct file for @inode on the pseudo filesystem
 * mounted at @mnt. Returns the file or an error pointer.
 */
struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
			       const char *name, int flags,
			       const struct file_operations *fops)
{
	int ret;
	struct path path;
	struct file *file;

	ret = alloc_path_pseudo(name, inode, mnt, &path);
	if (ret)
		return ERR_PTR(ret);

	file = alloc_file(&path, flags, fops);
	if (IS_ERR(file)) {
		/*
		 * NOTE(review): presumably path_put() drops the inode
		 * reference that was handed to the dentry, so an extra one is
		 * taken first to leave the caller's reference intact - confirm
		 */
		ihold(inode);
		path_put(&path);
		return file;
	}
	/*
	 * Disable all fsnotify events for pseudo files by default.
	 * They may be enabled by caller with file_set_fsnotify_mode().
	 */
	file_set_fsnotify_mode(file, FMODE_NONOTIFY);
	return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);

/*
 * Same as alloc_file_pseudo(), but the file is allocated with
 * alloc_empty_file_noaccount() and therefore not counted in nr_files.
 */
struct file *alloc_file_pseudo_noaccount(struct inode *inode,
					 struct vfsmount *mnt, const char *name,
					 int flags,
					 const struct file_operations *fops)
{
	int ret;
	struct path path;
	struct file *file;

	ret = alloc_path_pseudo(name, inode, mnt, &path);
	if (ret)
		return ERR_PTR(ret);

	file = alloc_empty_file_noaccount(flags, current_cred());
	if (IS_ERR(file)) {
		ihold(inode);
		path_put(&path);
		return file;
	}
	file_init_path(file, &path, fops);
	/*
	 * Disable all fsnotify events for pseudo files by default.
	 * They may be enabled by caller with file_set_fsnotify_mode().
	 */
	file_set_fsnotify_mode(file, FMODE_NONOTIFY);
	return file;
}
EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);

/*
 * Allocate a new file on the same path as @base. On success the new file
 * takes its own reference on the path and shares @base's f_mapping.
 */
struct file *alloc_file_clone(struct file *base, int flags,
				const struct file_operations *fops)
{
	struct file *f;

	f = alloc_file(&base->f_path, flags, fops);
	if (!IS_ERR(f)) {
		path_get(&f->f_path);
		f->f_mapping = base->f_mapping;
	}
	return f;
}

/* the real guts of fput() - releasing the last reference to file */
static void __fput(struct file *file)
{
	/* snapshot what is still needed after ->release() has run */
	struct dentry *dentry = file->f_path.dentry;
	struct vfsmount *mnt = file->f_path.mnt;
	struct inode *inode = file->f_inode;
	fmode_t mode = file->f_mode;

	/* never fully opened: nothing to tear down except the object itself */
	if (unlikely(!(file->f_mode & FMODE_OPENED)))
		goto out;

	might_sleep();

	fsnotify_close(file);
	/*
	 * The function eventpoll_release() should be the first called
	 * in the file cleanup chain.
	 */
	eventpoll_release(file);
	locks_remove_file(file);

	security_file_release(file);
	if (unlikely(file->f_flags & FASYNC)) {
		if (file->f_op->fasync)
			file->f_op->fasync(-1, file, 0);
	}
	if (file->f_op->release)
		file->f_op->release(inode, file);
	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
		     !(mode & FMODE_PATH))) {
		cdev_put(inode->i_cdev);
	}
	fops_put(file->f_op);
	file_f_owner_release(file);
	put_file_access(file);
	dput(dentry);
	if (unlikely(mode & FMODE_NEED_UNMOUNT))
		dissolve_on_fput(mnt);
	mntput(mnt);
out:
	file_free(file);
}

/* Files whose final __fput() could not be queued as task work */
static LLIST_HEAD(delayed_fput_list);

/* Work handler: run __fput() on everything queued on delayed_fput_list */
static void delayed_fput(struct work_struct *unused)
{
	struct llist_node *node = llist_del_all(&delayed_fput_list);
	struct file *f, *t;

	llist_for_each_entry_safe(f, t, node, f_llist)
		__fput(f);
}

/* task_work callback: run __fput() on the file embedding @work */
static void ____fput(struct callback_head *work)
{
	__fput(container_of(work, struct file, f_task_work));
}

static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);

/*
 * If kernel thread really needs to have the final fput() it has done
 * to complete, call this.  The only user right now is the boot - we
 * *do* need to make sure our writes to binaries on initramfs has
 * not left us with opened struct file waiting for __fput() - execve()
 * won't work without that.  Please, don't add more callers without
 * very good reasons; in particular, never call that with locks
 * held and never call that from a thread that might need to do
 * some work on any kind of umount.
 */
void flush_delayed_fput(void)
{
	delayed_fput(NULL);
	flush_delayed_work(&delayed_fput_work);
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);

/*
 * Dispose of a file whose last reference has just been dropped.
 *
 * Files that are neither backing files nor opened are freed on the spot.
 * Otherwise __fput() is deferred: as task work on the current task when not
 * in interrupt context and not a kernel thread, else (or if queueing the
 * task work fails) through the delayed_fput work item.
 */
static void __fput_deferred(struct file *file)
{
	struct task_struct *task = current;

	if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
		file_free(file);
		return;
	}

	if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
		init_task_work(&file->f_task_work, ____fput);
		if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
			return;
		/*
		 * After this task has run exit_task_work(),
		 * task_work_add() will fail.  Fall through to delayed
		 * fput to avoid leaking *file.
		 */
	}

	/* only schedule the work when this entry made the list non-empty */
	if (llist_add(&file->f_llist, &delayed_fput_list))
		schedule_delayed_work(&delayed_fput_work, 1);
}

/* Drop a reference to @file; the last put defers the teardown */
void fput(struct file *file)
{
	if (unlikely(file_ref_put(&file->f_ref)))
		__fput_deferred(file);
}
EXPORT_SYMBOL(fput);

/*
 * synchronous analog of fput(); for kernel threads that might be needed
 * in some umount() (and thus can't use flush_delayed_fput() without
 * risking deadlocks), need to wait for completion of __fput() and know
 * for this specific struct file it won't involve anything that would
 * need them.  Use only if you really need it - at the very least,
 * don't blindly convert fput() by kernel thread to that.
 */
void __fput_sync(struct file *file)
{
	if (file_ref_put(&file->f_ref))
		__fput(file);
}
EXPORT_SYMBOL(__fput_sync);

/*
 * Equivalent to __fput_sync(), but optimized for being called with the last
 * reference.
 *
 * See file_ref_put_close() for details.
 */
void fput_close_sync(struct file *file)
{
	if (likely(file_ref_put_close(&file->f_ref)))
		__fput(file);
}

/*
 * Equivalent to fput(), but optimized for being called with the last
 * reference.
 *
 * See file_ref_put_close() for details.
 */
void fput_close(struct file *file)
{
	if (file_ref_put_close(&file->f_ref))
		__fput_deferred(file);
}

/*
 * Create the "filp" and "bfilp" slab caches and the nr_files counter.
 *
 * Both caches are SLAB_TYPESAFE_BY_RCU and place the allocator's free
 * pointer at an explicit offset inside the object (f_freeptr for plain
 * files, bf_freeptr for backing files).
 */
void __init files_init(void)
{
	struct kmem_cache_args args = {
		.use_freeptr_offset = true,
		.freeptr_offset = offsetof(struct file, f_freeptr),
	};

	filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args,
				SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);

	args.freeptr_offset = offsetof(struct backing_file, bf_freeptr);
	bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file),
				&args, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}

/*
 * One file with associated inode and dcache is very roughly 1K. Per default
 * do not use more than 10% of our memory for files.
 */
void __init files_maxfiles_init(void)
{
	unsigned long n;
	unsigned long nr_pages = totalram_pages();
	/* reserve 1.5x the pages already in use, capped below the total */
	unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;

	memreserve = min(memreserve, nr_pages - 1);
	n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;

	/* never go below the compile-time default NR_FILE */
	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
}
31 11 13 23 21 18 26 2 23 10 10 10 25 26 26 26 26 26 15 11 26 3 2 11 11 11 11 11 6 5 5 11 102 102 95 31 31 31 19 13 66 77 77 77 19 30 29 62 11 1 10 9 2 7 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPV6 GSO/GRO offload support * Linux INET6 implementation 
*/

#include <linux/kernel.h>
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/printk.h>

#include <net/protocol.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/gro.h>
#include <net/gso.h>

#include "ip6_offload.h"
#include "tcpv6_offload.c"

/* GRO side: walk the chain of extension headers starting at offset @off
 * (first header type @proto) for as long as the registered offload for the
 * current type carries INET6_PROTO_GSO_EXTHDR. The walk stops early if a
 * header cannot be made available via skb_gro_header().
 *
 * On exit the GRO offset is advanced past everything that was walked.
 * Returns the protocol number at which the walk stopped.
 */
static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto)
{
	const struct net_offload *ops = NULL;
	struct ipv6_opt_hdr *opth;

	for (;;) {
		int len;

		ops = rcu_dereference(inet6_offloads[proto]);

		if (unlikely(!ops))
			break;

		if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
			break;

		/* First make the fixed part available to learn the length... */
		opth = skb_gro_header(skb, off + sizeof(*opth), off);
		if (unlikely(!opth))
			break;

		len = ipv6_optlen(opth);

		/* ...then the complete header, re-reading the pointer. */
		opth = skb_gro_header(skb, off + len, off);
		if (unlikely(!opth))
			break;
		proto = opth->nexthdr;

		off += len;
	}

	skb_gro_pull(skb, off - skb_gro_receive_network_offset(skb));
	return proto;
}

/* GSO side: pull extension headers off the front of @skb for as long as the
 * registered offload for the current type carries INET6_PROTO_GSO_EXTHDR and
 * the header can be pulled. Returns the protocol number at which the walk
 * stopped.
 */
static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
{
	const struct net_offload *ops = NULL;

	for (;;) {
		struct ipv6_opt_hdr *opth;
		int len;

		ops = rcu_dereference(inet6_offloads[proto]);

		if (unlikely(!ops))
			break;

		if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
			break;

		/* 8 bytes are required before the length can be read. */
		if (unlikely(!pskb_may_pull(skb, 8)))
			break;

		opth = (void *)skb->data;
		len = ipv6_optlen(opth);

		if (unlikely(!pskb_may_pull(skb, len)))
			break;

		/* skb->data is re-read after the second pskb_may_pull(). */
		opth = (void *)skb->data;
		proto = opth->nexthdr;
		__skb_pull(skb, len);
	}

	return proto;
}

/* GSO segmentation entry point for IPv6.
 *
 * Pulls the IPv6 header and extension headers, hands the skb to the
 * gso_segment callback registered for the resulting protocol, then fixes up
 * payload_len (and, for UDP fragmentation, the fragment header) in every
 * resulting segment.
 *
 * Returns the segment list, NULL if the callback returned NULL, or an
 * ERR_PTR(): -EINVAL when the IPv6 header cannot be pulled,
 * -EPROTONOSUPPORT when no suitable callback ran, the error from
 * ip6_find_1stfragopt(), or whatever error the callback returned.
 */
static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
	netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	struct ipv6hdr *ipv6h;
	const struct net_offload *ops;
	int proto;
	struct frag_hdr *fptr;
	unsigned int payload_len;
	u8 *prevhdr;
	int offset = 0;
	bool encap, udpfrag;
	int nhoff;
	bool gso_partial;

	skb_reset_network_header(skb);
	/* Distance from MAC header to network header, reused per segment. */
	nhoff = skb_network_header(skb) - skb_mac_header(skb);
	if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
		goto out;

	/* A non-zero encap_level means an outer header was already handled. */
	encap = SKB_GSO_CB(skb)->encap_level > 0;
	if (encap)
		features &= skb->dev->hw_enc_features;
	SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h);

	ipv6h = ipv6_hdr(skb);
	__skb_pull(skb, sizeof(*ipv6h));
	segs = ERR_PTR(-EPROTONOSUPPORT);

	proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);

	/* Decide whether the segments need a fragment header fixed up. */
	if (skb->encapsulation &&
	    skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
		udpfrag = proto == IPPROTO_UDP && encap &&
			  (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
	else
		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation &&
			  (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);

	ops = rcu_dereference(inet6_offloads[proto]);
	if (likely(ops && ops->callbacks.gso_segment)) {
		if (!skb_reset_transport_header_careful(skb))
			goto out;

		segs = ops->callbacks.gso_segment(skb, features);
		/* NULL result: restore the network header offset on @skb. */
		if (!segs)
			skb->network_header = skb_mac_header(skb) + nhoff - skb->head;
	}

	if (IS_ERR_OR_NULL(segs))
		goto out;

	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);

	/* From here on @skb is reused as the segment list cursor. */
	for (skb = segs; skb; skb = skb->next) {
		ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
		if (gso_partial && skb_is_gso(skb))
			payload_len = skb_shinfo(skb)->gso_size +
				      SKB_GSO_CB(skb)->data_offset +
				      skb->head - (unsigned char *)(ipv6h + 1);
		else
			payload_len = skb->len - nhoff - sizeof(*ipv6h);
		ipv6h->payload_len = htons(payload_len);
		skb->network_header = (u8 *)ipv6h - skb->head;
		skb_reset_mac_len(skb);

		if (udpfrag) {
			/* A non-negative return is used as the offset of the
			 * fragment header from the IPv6 header.
			 */
			int err = ip6_find_1stfragopt(skb, &prevhdr);
			if (err < 0) {
				kfree_skb_list(segs);
				return ERR_PTR(err);
			}
			fptr = (struct frag_hdr *)((u8 *)ipv6h + err);
			fptr->frag_off = htons(offset);
			/* Every segment but the last gets "more fragments". */
			if (skb->next)
				fptr->frag_off |= htons(IP6_MF);
			offset += (ntohs(ipv6h->payload_len) -
				   sizeof(struct frag_hdr));
		}
		if (encap)
			skb_reset_inner_headers(skb);
	}

out:
	return segs;
}

/* Return the total length of all the extension hdrs, following the same
 * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs.
*/
static int ipv6_exthdrs_len(struct ipv6hdr *iph,
			    const struct net_offload **opps)
{
	/* Start at the IPv6 header itself; the first step skips over it. */
	struct ipv6_opt_hdr *opth = (void *)iph;
	int len = 0, proto, optlen = sizeof(*iph);

	proto = iph->nexthdr;
	for (;;) {
		/* *opps is left pointing at the offload where the walk stops
		 * (NULL if none is registered for that protocol).
		 */
		*opps = rcu_dereference(inet6_offloads[proto]);
		if (unlikely(!(*opps)))
			break;
		if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR))
			break;

		opth = (void *)opth + optlen;
		optlen = ipv6_optlen(opth);
		len += optlen;
		proto = opth->nexthdr;
	}
	return len;
}

/* GRO receive handler for IPv6.
 *
 * Validates the IPv6 header, skips extension headers if the next header has
 * no gro_receive callback of its own, clears same_flow on every held packet
 * in @head whose IPv6 header (and extension headers) do not match, and then
 * dispatches to the upper-layer gro_receive handler.
 *
 * Returns whatever the upper-layer handler returned, or NULL when the packet
 * could not be processed here.
 */
INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
							 struct sk_buff *skb)
{
	const struct net_offload *ops;
	struct sk_buff *pp = NULL;
	struct sk_buff *p;
	struct ipv6hdr *iph;
	unsigned int nlen;
	unsigned int hlen;
	unsigned int off;
	u16 flush = 1;
	int proto;

	off = skb_gro_offset(skb);
	hlen = off + sizeof(*iph);
	iph = skb_gro_header(skb, hlen, off);
	if (unlikely(!iph))
		goto out;

	/* Remember where this network header starts, indexed by encap_mark. */
	NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off;

	/* payload_len disagreeing with the skb length bumps the flush count. */
	flush += ntohs(iph->payload_len) != skb->len - hlen;

	proto = iph->nexthdr;
	ops = rcu_dereference(inet6_offloads[proto]);
	if (!ops || !ops->callbacks.gro_receive) {
		/* No direct handler: try again past the extension headers. */
		proto = ipv6_gro_pull_exthdrs(skb, hlen, proto);

		ops = rcu_dereference(inet6_offloads[proto]);
		if (!ops || !ops->callbacks.gro_receive)
			goto out;

		iph = skb_gro_network_header(skb);
	} else {
		skb_gro_pull(skb, sizeof(*iph));
	}

	skb_set_transport_header(skb, skb_gro_offset(skb));

	NAPI_GRO_CB(skb)->proto = proto;

	/* Undo the initial 1: from here flush only reflects the length check. */
	flush--;
	/* Length of the IPv6 header plus any extension headers pulled. */
	nlen = skb_gro_offset(skb) - off;

	list_for_each_entry(p, head, list) {
		const struct ipv6hdr *iph2;
		__be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */

		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		iph2 = (struct ipv6hdr *)(p->data + off);
		first_word = *(__be32 *)iph ^ *(__be32 *)iph2;

		/* All fields must match except length and Traffic Class.
		 * XXX skbs on the gro_list have all been parsed and pulled
		 * already so we don't need to compare nlen
		 * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops)))
		 * memcmp() alone below is sufficient, right?
		 */
		if ((first_word & htonl(0xF00FFFFF)) ||
		    !ipv6_addr_equal(&iph->saddr, &iph2->saddr) ||
		    !ipv6_addr_equal(&iph->daddr, &iph2->daddr) ||
		    iph->nexthdr != iph2->nexthdr) {
not_same_flow:
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
		/* Extension headers present: they must match byte for byte. */
		if (unlikely(nlen > sizeof(struct ipv6hdr))) {
			if (memcmp(iph + 1, iph2 + 1,
				   nlen - sizeof(struct ipv6hdr)))
				goto not_same_flow;
		}
	}

	NAPI_GRO_CB(skb)->flush |= flush;

	skb_gro_postpull_rcsum(skb, iph, nlen);

	if (unlikely(gro_recursion_inc_test(skb))) {
		flush = 1;
		goto out;
	}

	/* TCP (and built-in UDP) are called directly; anything else goes
	 * through the registered callback.
	 */
	if (likely(proto == IPPROTO_TCP))
		pp = tcp6_gro_receive(head, skb);
#if IS_BUILTIN(CONFIG_IPV6)
	else if (likely(proto == IPPROTO_UDP))
		pp = udp6_gro_receive(head, skb);
#endif
	else
		pp = ops->callbacks.gro_receive(head, skb);

out:
	skb_gro_flush_final(skb, pp, flush);

	return pp;
}

/* GRO receive for an IPv6 packet carried inside a tunnel. Only one level of
 * encapsulation is accepted: if encap_mark is already set the packet is
 * flagged for flush and not aggregated.
 */
static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head,
					      struct sk_buff *skb)
{
	/* Common GRO receive for SIT and IP6IP6 */

	if (NAPI_GRO_CB(skb)->encap_mark) {
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	NAPI_GRO_CB(skb)->encap_mark = 1;

	return ipv6_gro_receive(head, skb);
}

/* Same single-encapsulation guard as sit_ip6ip6_gro_receive(), but the inner
 * packet is handed to inet_gro_receive().
 */
static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
					  struct sk_buff *skb)
{
	/* GRO receive for IP4IP6 (inner packet goes to inet_gro_receive()) */

	if (NAPI_GRO_CB(skb)->encap_mark) {
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	NAPI_GRO_CB(skb)->encap_mark = 1;

	return inet_gro_receive(head, skb);
}

/* GRO completion for IPv6: rewrite the payload length of the merged packet,
 * skip past the IPv6 header and extension headers, and call the upper-layer
 * gro_complete handler.
 *
 * Returns the handler's result, or -ENOSYS when no handler is available.
 */
INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
{
	const struct net_offload *ops;
	struct ipv6hdr *iph;
	int err = -ENOSYS;

	if (skb->encapsulation) {
		skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6));
		skb_set_inner_network_header(skb, nhoff);
	}

	iph = (struct ipv6hdr *)(skb->data + nhoff);
	ipv6_set_payload_len(iph, skb->len - nhoff - sizeof(*iph));

	/* ipv6_exthdrs_len() also stores the upper-layer offload in ops. */
	nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops);

	if (likely(ops == &net_hotdata.tcpv6_offload))
		return tcp6_gro_complete(skb, nhoff);
#if IS_BUILTIN(CONFIG_IPV6)
	if (ops == &net_hotdata.udpv6_offload)
		return udp6_gro_complete(skb, nhoff);
#endif

	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
		goto out;

	err = ops->callbacks.gro_complete(skb, nhoff);

out:
	return err;
}

/* The three *_gro_complete() wrappers below mark the skb as encapsulated,
 * tag gso_type with the matching tunnel flag and defer to the inner
 * protocol's gro_complete handler.
 */
static int sit_gro_complete(struct sk_buff *skb, int nhoff)
{
	skb->encapsulation = 1;
	skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
	return ipv6_gro_complete(skb, nhoff);
}

static int ip6ip6_gro_complete(struct sk_buff *skb, int nhoff)
{
	skb->encapsulation = 1;
	skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
	return ipv6_gro_complete(skb, nhoff);
}

static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff)
{
	skb->encapsulation = 1;
	skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
	return inet_gro_complete(skb, nhoff);
}

/* The three *_gso_segment() wrappers below refuse (-EINVAL) any skb whose
 * gso_type lacks the expected tunnel flag before segmenting the inner packet.
 */
static struct sk_buff *sit_gso_segment(struct sk_buff *skb,
				       netdev_features_t features)
{
	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4))
		return ERR_PTR(-EINVAL);

	return ipv6_gso_segment(skb, features);
}

static struct sk_buff *ip4ip6_gso_segment(struct sk_buff *skb,
					  netdev_features_t features)
{
	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6))
		return ERR_PTR(-EINVAL);

	return inet_gso_segment(skb, features);
}

static struct sk_buff *ip6ip6_gso_segment(struct sk_buff *skb,
					  netdev_features_t features)
{
	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6))
		return ERR_PTR(-EINVAL);

	return ipv6_gso_segment(skb, features);
}

/* Offload callback tables, registered in ipv6_offload_init(). */
static const struct net_offload sit_offload = {
	.callbacks = {
		.gso_segment	= sit_gso_segment,
		.gro_receive    = sit_ip6ip6_gro_receive,
		.gro_complete   = sit_gro_complete,
	},
};

static const struct net_offload ip4ip6_offload = {
	.callbacks = {
		.gso_segment	= ip4ip6_gso_segment,
		.gro_receive    = ip4ip6_gro_receive,
		.gro_complete   = ip4ip6_gro_complete,
	},
};

static const struct net_offload ip6ip6_offload = {
	.callbacks = {
		.gso_segment	= ip6ip6_gso_segment,
		.gro_receive    = sit_ip6ip6_gro_receive,
		.gro_complete   = ip6ip6_gro_complete,
	},
};

/* Register the IPv6 packet offload and the tunnel offloads. Failures of the
 * TCP and extension-header registrations are logged but not propagated; the
 * function always returns 0.
 */
static int __init ipv6_offload_init(void)
{

	if (tcpv6_offload_init() < 0)
		pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
	if (ipv6_exthdrs_offload_init() < 0)
		pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__);

	net_hotdata.ipv6_packet_offload = (struct packet_offload) {
		.type = cpu_to_be16(ETH_P_IPV6),
		.callbacks = {
			.gso_segment = ipv6_gso_segment,
			.gro_receive = ipv6_gro_receive,
			.gro_complete = ipv6_gro_complete,
		},
	};
	dev_add_offload(&net_hotdata.ipv6_packet_offload);

	/* sit goes into the IPv4 table, the other two into the IPv6 table. */
	inet_add_offload(&sit_offload, IPPROTO_IPV6);
	inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6);
	inet6_add_offload(&ip4ip6_offload, IPPROTO_IPIP);

	return 0;
}

fs_initcall(ipv6_offload_init);
16 257 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PKEYS_H
#define _ASM_X86_PKEYS_H

/*
 * If more than 16 keys are ever supported, a thorough audit
 * will be necessary to ensure that the types that store key
 * numbers and masks have sufficient capacity.
 */
/* 16 keys when X86_FEATURE_OSPKE is enabled, otherwise a single key. */
#define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1)

extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
		unsigned long init_val);

/* True when the X86_FEATURE_OSPKE CPU feature is enabled. */
static inline bool arch_pkeys_enabled(void)
{
	return cpu_feature_enabled(X86_FEATURE_OSPKE);
}

/*
 * Try to dedicate one of the protection keys to be used as an
 * execute-only protection key.
*/ extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return ARCH_DEFAULT_PKEY; return __execute_only_pkey(mm); } extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey); static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return 0; return __arch_override_mprotect_pkey(vma, prot, pkey); } #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) #define mm_set_pkey_allocated(mm, pkey) do { \ mm_pkey_allocation_map(mm) |= (1U << pkey); \ } while (0) #define mm_set_pkey_free(mm, pkey) do { \ mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ } while (0) static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { /* * "Allocated" pkeys are those that have been returned * from pkey_alloc() or pkey 0 which is allocated * implicitly when the mm is created. */ if (pkey < 0) return false; if (pkey >= arch_max_pkey()) return false; /* * The exec-only pkey is set in the allocation map, but * is not available to any of the user interfaces like * mprotect_pkey(). */ if (pkey == mm->context.execute_only_pkey) return false; return mm_pkey_allocation_map(mm) & (1U << pkey); } /* * Returns a positive, 4-bit key on success, or -1 on failure. */ static inline int mm_pkey_alloc(struct mm_struct *mm) { /* * Note: this is the one and only place we make sure * that the pkey is valid as far as the hardware is * concerned. The rest of the kernel trusts that * only good, valid pkeys come out of here. */ u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); int ret; /* * Are we out of pkeys? We must handle this specially * because ffz() behavior is undefined if there are no * zeros. 
*/ if (mm_pkey_allocation_map(mm) == all_pkeys_mask) return -1; ret = ffz(mm_pkey_allocation_map(mm)); mm_set_pkey_allocated(mm, ret); return ret; } static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { if (!mm_pkey_is_allocated(mm, pkey)) return -EINVAL; mm_set_pkey_free(mm, pkey); return 0; } static inline int vma_pkey(struct vm_area_struct *vma) { unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3; return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT; } #endif /*_ASM_X86_PKEYS_H */
15 17 1 2 15 3 3 1 36 2 16 16 1 17 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 // SPDX-License-Identifier: GPL-2.0-only #include <linux/net.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/types.h> #include <net/pkt_sched.h> #include "sch_mqprio_lib.h" /* Returns true if the intervals [a, b) and [c, d) overlap. */ static bool intervals_overlap(int a, int b, int c, int d) { int left = max(a, c), right = min(b, d); return left < right; } static int mqprio_validate_queue_counts(struct net_device *dev, const struct tc_mqprio_qopt *qopt, bool allow_overlapping_txqs, struct netlink_ext_ack *extack) { int i, j; for (i = 0; i < qopt->num_tc; i++) { unsigned int last = qopt->offset[i] + qopt->count[i]; if (!qopt->count[i]) { NL_SET_ERR_MSG_FMT_MOD(extack, "No queues for TC %d", i); return -EINVAL; } /* Verify the queue count is in tx range being equal to the * real_num_tx_queues indicates the last queue is in use. 
*/ if (qopt->offset[i] >= dev->real_num_tx_queues || last > dev->real_num_tx_queues) { NL_SET_ERR_MSG_FMT_MOD(extack, "Queues %d:%d for TC %d exceed the %d TX queues available", qopt->count[i], qopt->offset[i], i, dev->real_num_tx_queues); return -EINVAL; } if (allow_overlapping_txqs) continue; /* Verify that the offset and counts do not overlap */ for (j = i + 1; j < qopt->num_tc; j++) { if (intervals_overlap(qopt->offset[i], last, qopt->offset[j], qopt->offset[j] + qopt->count[j])) { NL_SET_ERR_MSG_FMT_MOD(extack, "TC %d queues %d@%d overlap with TC %d queues %d@%d", i, qopt->count[i], qopt->offset[i], j, qopt->count[j], qopt->offset[j]); return -EINVAL; } } } return 0; } int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt, bool validate_queue_counts, bool allow_overlapping_txqs, struct netlink_ext_ack *extack) { int i, err; /* Verify num_tc is not out of max range */ if (qopt->num_tc > TC_MAX_QUEUE) { NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range"); return -EINVAL; } /* Verify priority mapping uses valid tcs */ for (i = 0; i <= TC_BITMASK; i++) { if (qopt->prio_tc_map[i] >= qopt->num_tc) { NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping"); return -EINVAL; } } if (validate_queue_counts) { err = mqprio_validate_queue_counts(dev, qopt, allow_overlapping_txqs, extack); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(mqprio_validate_qopt); void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt) { int tc, num_tc = netdev_get_num_tc(dev); qopt->num_tc = num_tc; memcpy(qopt->prio_tc_map, dev->prio_tc_map, sizeof(qopt->prio_tc_map)); for (tc = 0; tc < num_tc; tc++) { qopt->count[tc] = dev->tc_to_txq[tc].count; qopt->offset[tc] = dev->tc_to_txq[tc].offset; } } EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct); void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE], struct tc_mqprio_qopt_offload *mqprio) { unsigned long preemptible_tcs = 0; int tc; for (tc = 0; tc < 
TC_QOPT_MAX_QUEUE; tc++) if (fp[tc] == TC_FP_PREEMPTIBLE) preemptible_tcs |= BIT(tc); mqprio->preemptible_tcs = preemptible_tcs; } EXPORT_SYMBOL_GPL(mqprio_fp_to_offload); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Shared mqprio qdisc code currently between taprio and mqprio");
1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226
// SPDX-License-Identifier: GPL-2.0-only
/*
 * LED support for the input layer
 *
 * Copyright 2010-2015 Samuel Thibault <samuel.thibault@ens-lyon.org>
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/leds.h>
#include <linux/input.h>

/* Default trigger names are only filled in when the subsystem providing the
 * trigger is enabled; otherwise the LED gets no default trigger.
 */
#if IS_ENABLED(CONFIG_VT)
#define VT_TRIGGER(_name)	.trigger = _name
#else
#define VT_TRIGGER(_name)	.trigger = NULL
#endif

#if IS_ENABLED(CONFIG_SND_CTL_LED)
#define AUDIO_TRIGGER(_name)	.trigger = _name
#else
#define AUDIO_TRIGGER(_name)	.trigger = NULL
#endif

/* Per-LED-code name suffix and default trigger, indexed by LED_* constant.
 * Codes without an entry have a NULL name and are skipped by this driver.
 */
static const struct {
	const char *name;
	const char *trigger;
} input_led_info[LED_CNT] = {
	[LED_NUML]	= { "numlock", VT_TRIGGER("kbd-numlock") },
	[LED_CAPSL]	= { "capslock", VT_TRIGGER("kbd-capslock") },
	[LED_SCROLLL]	= { "scrolllock", VT_TRIGGER("kbd-scrolllock") },
	[LED_COMPOSE]	= { "compose" },
	[LED_KANA]	= { "kana", VT_TRIGGER("kbd-kanalock") },
	[LED_SLEEP]	= { "sleep" } ,
	[LED_SUSPEND]	= { "suspend" },
	[LED_MUTE]	= { "mute", AUDIO_TRIGGER("audio-mute") },
	[LED_MISC]	= { "misc" },
	[LED_MAIL]	= { "mail" },
	[LED_CHARGING]	= { "charging" },
};

/* One LED class device backed by a single LED code of an input device. */
struct input_led {
	struct led_classdev cdev;
	struct input_handle *handle;
	unsigned int code; /* One of LED_* constants */
};

/* Per-input-device state: the handle plus one input_led per exposed LED. */
struct input_leds {
	struct input_handle handle;
	unsigned int num_leds;
	struct input_led leds[] __counted_by(num_leds);
};

/* Report max_brightness if the LED's bit is set in the input device's LED
 * state, 0 otherwise.
 */
static enum led_brightness input_leds_brightness_get(struct led_classdev *cdev)
{
	struct input_led *led = container_of(cdev, struct input_led, cdev);
	struct input_dev *input = led->handle->dev;

	return test_bit(led->code, input->led) ? cdev->max_brightness : 0;
}

/* Turn the LED on or off by injecting an EV_LED event; any non-zero
 * brightness is treated as "on".
 */
static void input_leds_brightness_set(struct led_classdev *cdev,
				      enum led_brightness brightness)
{
	struct input_led *led = container_of(cdev, struct input_led, cdev);

	input_inject_event(led->handle, EV_LED, led->code, !!brightness);
}

/* Event callback; this handler does nothing with incoming events. */
static void input_leds_event(struct input_handle *handle, unsigned int type,
			     unsigned int code, int value)
{
}

/* Count the LED codes advertised by @dev that have a name in input_led_info. */
static int input_leds_get_count(struct input_dev *dev)
{
	unsigned int led_code;
	int count = 0;

	for_each_set_bit(led_code, dev->ledbit, LED_CNT)
		if (input_led_info[led_code].name)
			count++;

	return count;
}

/* Attach to @dev: allocate the per-device state, register and open the input
 * handle, then register one LED class device per named LED code.
 *
 * Returns 0 on success, -ENXIO if the device has no LED this driver knows,
 * -ENOMEM on allocation failure, or the error from the input/LED core. On
 * failure everything set up so far is torn down again.
 */
static int input_leds_connect(struct input_handler *handler,
			      struct input_dev *dev,
			      const struct input_device_id *id)
{
	struct input_leds *leds;
	struct input_led *led;
	unsigned int num_leds;
	unsigned int led_code;
	int led_no;
	int error;

	num_leds = input_leds_get_count(dev);
	if (!num_leds)
		return -ENXIO;

	leds = kzalloc_flex(*leds, leds, num_leds);
	if (!leds)
		return -ENOMEM;

	leds->num_leds = num_leds;

	leds->handle.dev = dev;
	leds->handle.handler = handler;
	leds->handle.name = "leds";
	leds->handle.private = leds;

	error = input_register_handle(&leds->handle);
	if (error)
		goto err_free_mem;

	error = input_open_device(&leds->handle);
	if (error)
		goto err_unregister_handle;

	/* led_no counts the LEDs registered so far; the unwind path relies
	 * on it to know how many to unregister.
	 */
	led_no = 0;
	for_each_set_bit(led_code, dev->ledbit, LED_CNT) {
		if (!input_led_info[led_code].name)
			continue;

		led = &leds->leds[led_no];
		led->handle = &leds->handle;
		led->code = led_code;

		/* Class device name is "<input device name>::<led name>". */
		led->cdev.name = kasprintf(GFP_KERNEL, "%s::%s",
					   dev_name(&dev->dev),
					   input_led_info[led_code].name);
		if (!led->cdev.name) {
			error = -ENOMEM;
			goto err_unregister_leds;
		}

		led->cdev.max_brightness = 1;
		led->cdev.brightness_get = input_leds_brightness_get;
		led->cdev.brightness_set = input_leds_brightness_set;
		led->cdev.default_trigger = input_led_info[led_code].trigger;

		error = led_classdev_register(&dev->dev, &led->cdev);
		if (error) {
			dev_err(&dev->dev, "failed to register LED %s: %d\n",
				led->cdev.name, error);
			/* This LED was never registered: free its name here,
			 * the loop below only handles registered ones.
			 */
			kfree(led->cdev.name);
			goto err_unregister_leds;
		}

		led_no++;
	}

	return 0;

err_unregister_leds:
	/* Walk back over the LEDs that were successfully registered. */
	while (--led_no >= 0) {
		struct input_led *led = &leds->leds[led_no];

		led_classdev_unregister(&led->cdev);
		kfree(led->cdev.name);
	}

	input_close_device(&leds->handle);

err_unregister_handle:
	input_unregister_handle(&leds->handle);

err_free_mem:
	kfree(leds);
	return error;
}

/* Undo input_leds_connect(): unregister every LED and free its name, then
 * close and unregister the handle and free the per-device state.
 */
static void input_leds_disconnect(struct input_handle *handle)
{
	struct input_leds *leds = handle->private;
	int i;

	for (i = 0; i < leds->num_leds; i++) {
		struct input_led *led = &leds->leds[i];

		led_classdev_unregister(&led->cdev);
		kfree(led->cdev.name);
	}

	input_close_device(handle);
	input_unregister_handle(handle);

	kfree(leds);
}

/* Match any input device that advertises EV_LED. */
static const struct input_device_id input_leds_ids[] = {
	{
		.flags = INPUT_DEVICE_ID_MATCH_EVBIT,
		.evbit = { BIT_MASK(EV_LED) },
	},
	{ },
};
MODULE_DEVICE_TABLE(input, input_leds_ids);

static struct input_handler input_leds_handler = {
	.event =	input_leds_event,
	.connect =	input_leds_connect,
	.disconnect =	input_leds_disconnect,
	.name =		"leds",
	.id_table =	input_leds_ids,
};

/* Module entry point: register the input handler. */
static int __init input_leds_init(void)
{
	return input_register_handler(&input_leds_handler);
}
module_init(input_leds_init);

/* Module exit point: unregister the input handler. */
static void __exit input_leds_exit(void)
{
	input_unregister_handler(&input_leds_handler);
}
module_exit(input_leds_exit);

MODULE_AUTHOR("Samuel Thibault <samuel.thibault@ens-lyon.org>");
MODULE_AUTHOR("Dmitry Torokhov <dmitry.torokhov@gmail.com>");
MODULE_DESCRIPTION("Input -> LEDs Bridge");
MODULE_LICENSE("GPL v2");
3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/sfp.h> #include "netlink.h" #include "common.h" struct eeprom_req_info { struct ethnl_req_info base; u32 offset; u32 length; u8 page; u8 bank; u8 i2c_address; }; struct eeprom_reply_data { struct ethnl_reply_data base; u32 length; u8 *data; }; #define MODULE_EEPROM_REQINFO(__req_base) \ container_of(__req_base, struct eeprom_req_info, base) #define MODULE_EEPROM_REPDATA(__reply_base) \ container_of(__reply_base, struct eeprom_reply_data, base) static int fallback_set_params(struct eeprom_req_info *request, struct ethtool_modinfo *modinfo, struct ethtool_eeprom *eeprom) { u32 offset = request->offset; u32 length = request->length; if (request->page) offset = request->page * ETH_MODULE_EEPROM_PAGE_LEN + offset; if (modinfo->type == ETH_MODULE_SFF_8472 && request->i2c_address == 0x51) offset += ETH_MODULE_EEPROM_PAGE_LEN * 2; if (offset >= modinfo->eeprom_len) return -EINVAL; eeprom->cmd = ETHTOOL_GMODULEEEPROM; eeprom->len = length; eeprom->offset = offset; return 0; } static int eeprom_fallback(struct eeprom_req_info 
*request, struct eeprom_reply_data *reply) { struct net_device *dev = reply->base.dev; struct ethtool_modinfo modinfo = {0}; struct ethtool_eeprom eeprom = {0}; u8 *data; int err; modinfo.cmd = ETHTOOL_GMODULEINFO; err = ethtool_get_module_info_call(dev, &modinfo); if (err < 0) return err; err = fallback_set_params(request, &modinfo, &eeprom); if (err < 0) return err; data = kmalloc(eeprom.len, GFP_KERNEL); if (!data) return -ENOMEM; err = ethtool_get_module_eeprom_call(dev, &eeprom, data); if (err < 0) goto err_out; reply->data = data; reply->length = eeprom.len; return 0; err_out: kfree(data); return err; } static int get_module_eeprom_by_page(struct net_device *dev, struct ethtool_module_eeprom *page_data, struct netlink_ext_ack *extack) { const struct ethtool_ops *ops = dev->ethtool_ops; if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Module firmware flashing is in progress"); return -EBUSY; } if (dev->sfp_bus) return sfp_get_module_eeprom_by_page(dev->sfp_bus, page_data, extack); if (ops->get_module_eeprom_by_page) return ops->get_module_eeprom_by_page(dev, page_data, extack); return -EOPNOTSUPP; } static int eeprom_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); struct ethtool_module_eeprom page_data = {0}; struct net_device *dev = reply_base->dev; int ret; page_data.offset = request->offset; page_data.length = request->length; page_data.i2c_address = request->i2c_address; page_data.page = request->page; page_data.bank = request->bank; page_data.data = kmalloc(page_data.length, GFP_KERNEL); if (!page_data.data) return -ENOMEM; ret = ethnl_ops_begin(dev); if (ret) goto err_free; ret = get_module_eeprom_by_page(dev, &page_data, info->extack); if (ret < 0) goto err_ops; reply->length = ret; reply->data = page_data.data; 
ethnl_ops_complete(dev); return 0; err_ops: ethnl_ops_complete(dev); err_free: kfree(page_data.data); if (ret == -EOPNOTSUPP) return eeprom_fallback(request, reply); return ret; } static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, struct netlink_ext_ack *extack) { struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_info); if (!tb[ETHTOOL_A_MODULE_EEPROM_OFFSET] || !tb[ETHTOOL_A_MODULE_EEPROM_LENGTH] || !tb[ETHTOOL_A_MODULE_EEPROM_PAGE] || !tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]) return -EINVAL; request->i2c_address = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]); request->offset = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_OFFSET]); request->length = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_LENGTH]); /* The following set of conditions limit the API to only dump 1/2 * EEPROM page without crossing low page boundary located at offset 128. * This means user may only request dumps of length limited to 128 from * either low 128 bytes or high 128 bytes. * For pages higher than 0 only high 128 bytes are accessible. 
*/ request->page = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_PAGE]); if (request->page && request->offset < ETH_MODULE_EEPROM_PAGE_LEN) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_PAGE], "reading from lower half page is allowed for page 0 only"); return -EINVAL; } if (request->offset < ETH_MODULE_EEPROM_PAGE_LEN && request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], "reading cross half page boundary is illegal"); return -EINVAL; } else if (request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN * 2) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], "reading cross page boundary is illegal"); return -EINVAL; } if (tb[ETHTOOL_A_MODULE_EEPROM_BANK]) request->bank = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_BANK]); return 0; } static int eeprom_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); return nla_total_size(sizeof(u8) * request->length); /* _EEPROM_DATA */ } static int eeprom_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); return nla_put(skb, ETHTOOL_A_MODULE_EEPROM_DATA, reply->length, reply->data); } static void eeprom_cleanup_data(struct ethnl_reply_data *reply_base) { struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); kfree(reply->data); } const struct ethnl_request_ops ethnl_module_eeprom_request_ops = { .request_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, .reply_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, .hdr_attr = ETHTOOL_A_MODULE_EEPROM_HEADER, .req_info_size = sizeof(struct eeprom_req_info), .reply_data_size = sizeof(struct eeprom_reply_data), .parse_request = eeprom_parse_request, .prepare_data = eeprom_prepare_data, .reply_size = eeprom_reply_size, .fill_reply = eeprom_fill_reply, 
.cleanup_data = eeprom_cleanup_data, }; const struct nla_policy ethnl_module_eeprom_get_policy[] = { [ETHTOOL_A_MODULE_EEPROM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_MODULE_EEPROM_OFFSET] = NLA_POLICY_MAX(NLA_U32, ETH_MODULE_EEPROM_PAGE_LEN * 2 - 1), [ETHTOOL_A_MODULE_EEPROM_LENGTH] = NLA_POLICY_RANGE(NLA_U32, 1, ETH_MODULE_EEPROM_PAGE_LEN), [ETHTOOL_A_MODULE_EEPROM_PAGE] = { .type = NLA_U8 }, [ETHTOOL_A_MODULE_EEPROM_BANK] = { .type = NLA_U8 }, [ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS] = NLA_POLICY_RANGE(NLA_U8, 0, ETH_MODULE_MAX_I2C_ADDRESS), };
14 14 6 6 2 1 14 14 6 9 3 2 35 35 1 2 237 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/module.h>
#include <linux/gfp.h>
#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/rstreason.h>

/**
 *  tcp_clamp_rto_to_user_timeout() - bound the RTO by the remaining user timeout
 *  @sk:  The current socket
 *
 *  Returns: icsk_rto when no user timeout is configured; 1 when the time
 *  elapsed since tp->retrans_stamp already reaches the user timeout;
 *  otherwise the smaller of icsk_rto and the remaining user timeout
 *  converted with msecs_to_jiffies().
 */
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 elapsed, user_timeout;
	s32 remaining;

	user_timeout = READ_ONCE(icsk->icsk_user_timeout);
	if (!user_timeout)
		return icsk->icsk_rto;

	elapsed = tcp_time_stamp_ts(tp) - tp->retrans_stamp;
	/* Scale down when the socket keeps usec-resolution timestamps. */
	if (tp->tcp_usec_ts)
		elapsed /= USEC_PER_MSEC;

	/* Signed on purpose: a negative value means the deadline has passed. */
	remaining = user_timeout - elapsed;
	if (remaining <= 0)
		return 1; /* user timeout has passed; fire ASAP */

	return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining));
}

/**
 *  tcp_clamp_probe0_to_user_timeout() - bound a probe0 interval by the user timeout
 *  @sk:    The current socket
 *  @when:  Proposed interval; it is compared against jiffies-based values
 *
 *  Returns: @when unchanged if no user timeout is configured or no probe
 *  timestamp has been recorded; otherwise the smaller of @when and the
 *  time left before the user timeout, the latter raised to at least
 *  TCP_TIMEOUT_MIN.
 */
u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u32 remaining, user_timeout;
	s32 elapsed;

	user_timeout = READ_ONCE(icsk->icsk_user_timeout);
	if (!user_timeout || !icsk->icsk_probes_tstamp)
		return when;

	elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
	if (unlikely(elapsed < 0))
		elapsed = 0;
	/* NOTE(review): remaining is u32, so this subtraction wraps if elapsed
	 * exceeds the converted user timeout. tcp_probe_timer() aborts on the
	 * same deadline, which presumably keeps that case away - confirm.
	 */
	remaining = msecs_to_jiffies(user_timeout) - elapsed;
	remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);

	return min_t(u32, remaining, when);
}

/**
 *  tcp_write_err() - close socket and save error info
 *  @sk:  The socket the error has appeared on.
 *
 *  Reports sk_err_soft if it is non-zero, ETIMEDOUT otherwise, and bumps
 *  the LINUX_MIB_TCPABORTONTIMEOUT counter.
 *
 *  Returns: Nothing (void)
 */

static void tcp_write_err(struct sock *sk)
{
	tcp_done_with_error(sk, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT);
	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
}

/**
 *  tcp_out_of_resources() - Close socket if out of resources
 *  @sk:        pointer to current socket
 *  @do_reset:  send a last packet with reset flag
 *
 *  Do not allow orphaned sockets to eat all our resources.
 *  This is direct violation of TCP specs, but it is required
 *  to prevent DoS attacks. It is called when a retransmission timeout
 *  or zero probe timeout occurs on orphaned socket.
 *
 *  Also close if our net namespace is exiting; in that case there is no
 *  hope of ever communicating again since all netns interfaces are already
 *  down (or about to be down), and we need to release our dst references,
 *  which have been moved to the netns loopback interface, so the namespace
 *  can finish exiting.  This condition is only possible if we are a kernel
 *  socket, as those do not hold references to the namespace.
 *
 *  The criteria are still not confirmed experimentally and may change.
 *  We kill the socket, if:
 *  1. If number of orphaned sockets exceeds an administratively configured
 *     limit.
 *  2. If we have strong memory pressure.
 *  3. If our net namespace is exiting.
 *
 *  Returns: 1 if the socket was closed here, 0 otherwise.
 */
static int tcp_out_of_resources(struct sock *sk, bool do_reset)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int shift = 0;

	/* If peer does not open window for long time, or did not transmit
	 * anything for long time, penalize it. */
	if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*tcp_rto_max(sk) || !do_reset)
		shift++;

	/* If some dubious ICMP arrived, penalize even more. */
	if (READ_ONCE(sk->sk_err_soft))
		shift++;

	if (tcp_check_oom(sk, shift)) {
		/* Catch exceptional cases, when connection requires reset.
		 *      1. Last segment was sent recently. */
		if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
		    /*  2. Window is closed. */
		    (!tp->snd_wnd && !tp->packets_out))
			do_reset = true;
		if (do_reset)
			tcp_send_active_reset(sk, GFP_ATOMIC,
					      SK_RST_REASON_TCP_ABORT_ON_MEMORY);
		tcp_done(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
		return 1;
	}

	if (!check_net(sock_net(sk))) {
		/* Not possible to send reset; just close */
		tcp_done(sk);
		return 1;
	}

	return 0;
}

/**
 *  tcp_orphan_retries() - Returns maximal number of retries on an orphaned socket
 *  @sk:    Pointer to the current socket.
 *  @alive: bool, socket alive state
 *
 *  Returns: the sysctl_tcp_orphan_retries value, forced to 0 when a soft
 *  error is recorded and the socket is not alive, and replaced by 8 when
 *  the result is 0 but the socket is alive.
 */
static int tcp_orphan_retries(struct sock *sk, bool alive)
{
	int retries = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries); /* May be zero. */

	/* We know from an ICMP that something is wrong. */
	if (READ_ONCE(sk->sk_err_soft) && !alive)
		retries = 0;

	/* However, if socket sent something recently, select some safe
	 * number of retries. 8 corresponds to >100 seconds with minimal
	 * RTO of 200msec. */
	if (retries == 0 && alive)
		retries = 8;
	return retries;
}

/*
 * React to a suspected path-MTU black hole.
 *
 * Does nothing when sysctl_tcp_mtu_probing is zero. The first call on a
 * socket only enables MTU probing and records the time. Later calls halve
 * the MSS derived from icsk_mtup.search_low, cap it at sysctl_tcp_base_mss,
 * keep it at or above sysctl_tcp_mtu_probe_floor and sysctl_tcp_min_snd_mss,
 * and store the matching MTU back into search_low. In both enabled cases
 * the MSS is re-synced from icsk_pmtu_cookie.
 */
static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
{
	const struct net *net = sock_net(sk);
	int mss;

	/* Black hole detection */
	if (!READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing))
		return;

	if (!icsk->icsk_mtup.enabled) {
		icsk->icsk_mtup.enabled = 1;
		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
	} else {
		mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
		mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss);
		mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_mtu_probe_floor));
		mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss));
		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
	}
	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}

/*
 * Model how long @boundary backed-off retransmissions take, starting from
 * @rto_base.
 *
 * While the backoff is still doubling, the total is the geometric sum
 * rto_base * (2^(boundary + 1) - 1), written as (2 << boundary) - 1.
 * Beyond linear_backoff_thresh doublings, every further step adds
 * tcp_rto_max(sk). The result is converted with jiffies_to_msecs().
 */
static unsigned int tcp_model_timeout(struct sock *sk,
				      unsigned int boundary,
				      unsigned int rto_base)
{
	unsigned int linear_backoff_thresh, timeout;

	linear_backoff_thresh = ilog2(tcp_rto_max(sk) / rto_base);
	if (boundary <= linear_backoff_thresh)
		timeout = ((2 << boundary) - 1) * rto_base;
	else
		timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
			(boundary - linear_backoff_thresh) * tcp_rto_max(sk);
	return jiffies_to_msecs(timeout);
}

/**
 *  retransmits_timed_out() - returns true if this connection has timed out
 *  @sk:       The current socket
 *  @boundary: max number of retransmissions
 *  @timeout:  A custom timeout value.
 *             If set to 0 the default timeout is calculated and used.
 *             Using TCP_RTO_MIN and the number of unsuccessful retransmits.
 *
 * The default "timeout" value this function can calculate and use
 * is equivalent to the timeout of a TCP Connection
 * after "boundary" unsuccessful, exponentially backed-off
 * retransmissions with an initial RTO of TCP_RTO_MIN.
 *
 * In SYN_SENT and SYN_RECV the base RTO is tcp_timeout_init(sk) instead of
 * TCP_RTO_MIN. Always returns false while icsk_retransmits is zero.
 */
static bool retransmits_timed_out(struct sock *sk,
				  unsigned int boundary,
				  unsigned int timeout)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int start_ts, delta;

	if (!inet_csk(sk)->icsk_retransmits)
		return false;

	start_ts = tp->retrans_stamp;
	if (likely(timeout == 0)) {
		unsigned int rto_base = TCP_RTO_MIN;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			rto_base = tcp_timeout_init(sk);
		timeout = tcp_model_timeout(sk, boundary, rto_base);
	}

	if (tp->tcp_usec_ts) {
		/* delta may be off by up to a jiffy due to timer granularity. */
		delta = tp->tcp_mstamp - start_ts + jiffies_to_usecs(1);
		return (s32)(delta - timeout * USEC_PER_MSEC) >= 0;
	}
	/* The s32 cast keeps the comparison valid across timestamp wrap. */
	return (s32)(tcp_time_stamp_ts(tp) - start_ts - timeout) >= 0;
}

/*
 * A write timeout has occurred. Process the after effects.
 *
 * Returns 1 when the socket was terminated here (via tcp_write_err() or
 * tcp_out_of_resources()), in which case the caller must not retransmit;
 * returns 0 when retransmission should go on.
 */
static int tcp_write_timeout(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	bool expired = false, do_reset;
	int retry_until, max_retransmits;

	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		if (icsk->icsk_retransmits)
			__dst_negative_advice(sk);
		/* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */
		retry_until = READ_ONCE(icsk->icsk_syn_retries) ? :
			READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);

		max_retransmits = retry_until;
		/* Only an active open gets the extra linear-timeout budget. */
		if (sk->sk_state == TCP_SYN_SENT)
			max_retransmits += READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts);

		expired = icsk->icsk_retransmits >= max_retransmits;
	} else {
		if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) {
			/* Black hole detection */
			tcp_mtu_probing(icsk, sk);

			__dst_negative_advice(sk);
		}

		retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2);
		if (sock_flag(sk, SOCK_DEAD)) {
			/* Orphan: "alive" while the RTO has not reached its cap. */
			const bool alive = icsk->icsk_rto < tcp_rto_max(sk);

			retry_until = tcp_orphan_retries(sk, alive);
			do_reset = alive ||
				!retransmits_timed_out(sk, retry_until, 0);

			if (tcp_out_of_resources(sk, do_reset))
				return 1;
		}
	}
	/* A non-zero user timeout replaces the modelled default here. */
	if (!expired)
		expired = retransmits_timed_out(sk, retry_until,
						READ_ONCE(icsk->icsk_user_timeout));
	tcp_fastopen_active_detect_blackhole(sk, expired);
	mptcp_active_detect_blackhole(sk, expired);

	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
				  icsk->icsk_retransmits,
				  icsk->icsk_rto, (int)expired);

	if (expired) {
		/* Has it gone just too far? */
		tcp_write_err(sk);
		return 1;
	}

	if (sk_rethink_txhash(sk)) {
		tp->timeout_rehash++;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
	}

	return 0;
}

/*
 * Delayed-ACK timer work. Called with BH disabled.
 *
 * Does nothing in CLOSE or LISTEN. A pending compressed ACK is flushed
 * first. Otherwise, if the delack timer is flagged and has really expired,
 * the flag is cleared and, when an ACK is still scheduled, the ATO is
 * adjusted and the ACK is sent. If the expiry lies in the future the timer
 * is simply re-armed.
 */
void tcp_delack_timer_handler(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
		return;

	/* Handling the sack compression case */
	if (tp->compressed_ack) {
		tcp_mstamp_refresh(tp);
		tcp_sack_compress_send_ack(sk);
		return;
	}

	if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
		return;

	if (time_after(icsk_delack_timeout(icsk), jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_delack_timer,
			       icsk_delack_timeout(icsk));
		return;
	}
	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;

	if (inet_csk_ack_scheduled(sk)) {
		if (!inet_csk_in_pingpong_mode(sk)) {
			/* Delayed ACK missed: inflate ATO. */
			icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, icsk->icsk_rto);
		} else {
			/* Delayed ACK missed: leave pingpong mode and
			 * deflate ATO.
			 */
			inet_csk_exit_pingpong_mode(sk);
			icsk->icsk_ack.ato = TCP_ATO_MIN;
		}
		tcp_mstamp_refresh(tp);
		tcp_send_ack(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
	}
}


/**
 *  tcp_delack_timer() - The TCP delayed ACK timeout handler
 *  @t:  Pointer to the timer. The owning connection sock, and from it the
 *       socket, are recovered with timer_container_of().
 *
 *  This function gets (indirectly) called when the kernel timer for a TCP packet
 *  of this socket expires. Calls tcp_delack_timer_handler() to do the actual work.
 *  If the socket is owned by user context, the work is deferred instead.
 *
 *  Returns: Nothing (void)
 */
static void tcp_delack_timer(struct timer_list *t)
{
	struct inet_connection_sock *icsk =
			timer_container_of(icsk, t, icsk_delack_timer);
	struct sock *sk = &icsk->icsk_inet.sk;

	/* Avoid taking socket spinlock if there is no ACK to send.
	 * The compressed_ack check is racy, but a separate hrtimer
	 * will take care of it eventually.
	 */
	if (!(smp_load_acquire(&icsk->icsk_ack.pending) & ICSK_ACK_TIMER) &&
	    !READ_ONCE(tcp_sk(sk)->compressed_ack))
		goto out;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		tcp_delack_timer_handler(sk);
	} else {
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
		/* delegate our work to tcp_release_cb() */
		/* Take a reference only if the deferred bit was newly set. */
		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
			sock_hold(sk);
	}
	bh_unlock_sock(sk);
out:
	sock_put(sk);
}

/*
 * Zero-window probe timer work.
 *
 * Resets the probe bookkeeping and returns when packets are in flight or
 * there is nothing at the send head. Otherwise it either aborts the
 * connection with tcp_write_err() (user timeout reached, orphan limits hit,
 * or too many probes sent) or sends another probe.
 */
static void tcp_probe_timer(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct sk_buff *skb = tcp_send_head(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int max_probes;

	if (tp->packets_out || !skb) {
		WRITE_ONCE(icsk->icsk_probes_out, 0);
		icsk->icsk_probes_tstamp = 0;
		return;
	}

	/* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
	 * long as the receiver continues to respond probes. We support this by
	 * default and reset icsk_probes_out with incoming ACKs. But if the
	 * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we
	 * kill the socket when the retry count and the time exceeds the
	 * corresponding system limit. We also implement similar policy when
	 * we use RTO to probe window in tcp_retransmit_timer().
	 */
	if (!icsk->icsk_probes_tstamp) {
		icsk->icsk_probes_tstamp = tcp_jiffies32;
	} else {
		u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);

		if (user_timeout &&
		    (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
		     msecs_to_jiffies(user_timeout))
			goto abort;
	}
	max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
	if (sock_flag(sk, SOCK_DEAD)) {
		unsigned int rto_max = tcp_rto_max(sk);
		const bool alive = inet_csk_rto_backoff(icsk, rto_max) < rto_max;

		max_probes = tcp_orphan_retries(sk, alive);
		if (!alive && icsk->icsk_backoff >= max_probes)
			goto abort;
		if (tcp_out_of_resources(sk, true))
			return;
	}

	if (icsk->icsk_probes_out >= max_probes) {
abort:		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
}

/*
 * Account for one more RTO event.
 *
 * When icsk_retransmits is still zero, this counts a new recovery in
 * total_rto_recoveries and records rto_stamp. icsk_retransmits and
 * total_rto are incremented every time.
 */
static void tcp_update_rto_stats(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (!icsk->icsk_retransmits) {
		tp->total_rto_recoveries++;
		tp->rto_stamp = tcp_time_stamp_ms(tp);
	}
	WRITE_ONCE(icsk->icsk_retransmits, icsk->icsk_retransmits + 1);
	tp->total_rto++;
}

/*
 *	Timer for Fast Open socket to retransmit SYNACK. Note that the
 *	sk here is the child socket, not the parent (listener) socket.
 *
 *	Gives up with tcp_write_err() once req->num_timeout reaches the retry
 *	limit; otherwise retransmits the SYNACK and re-arms the retransmit
 *	timer with req->timeout shifted left by the new num_timeout.
 */
static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int max_retries;

	tcp_syn_ack_timeout(req);

	/* Add one more retry for fastopen.
	 * Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
	 *
	 * Note: "+ 1" binds tighter than "?:", so the extra retry is only
	 * added to the sysctl fallback, not to a per-socket icsk_syn_retries.
	 */
	max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
		READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;

	if (req->num_timeout >= max_retries) {
		tcp_write_err(sk);
		return;
	}
	/* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */
	if (icsk->icsk_retransmits == 1)
		tcp_enter_loss(sk);
	/* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
	 * returned from rtx_syn_ack() to make it more persistent like
	 * regular retransmit because if the child socket has been accepted
	 * it's not good to give up too easily.
	 */
	tcp_rtx_synack(sk, req);
	if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok)
		tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND;
	req->num_timeout++;
	tcp_update_rto_stats(sk);
	if (!tp->retrans_stamp)
		tp->retrans_stamp = tcp_time_stamp_ts(tp);
	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
			  req->timeout << req->num_timeout, false);
}

/*
 * Decide whether a connection stuck retransmitting into a zero window
 * should be given up.
 *
 * @rtx_delta is compared directly with the user timeout and is also fed to
 * msecs_to_jiffies(), so it is a duration in milliseconds. @skb is not
 * used by the body.
 *
 * Returns true when @rtx_delta exceeds a configured user timeout, or when
 * nothing has been received for longer than the limit (twice tcp_rto_max(),
 * capped by the user timeout) and the retransmissions have also lasted
 * longer than that limit.
 */
static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
				     const struct sk_buff *skb,
				     u32 rtx_delta)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
	const struct tcp_sock *tp = tcp_sk(sk);
	int timeout = tcp_rto_max(sk) * 2;
	s32 rcv_delta;

	if (user_timeout) {
		/* If user application specified a TCP_USER_TIMEOUT,
		 * it does not want win 0 packets to 'reset the timer'
		 * while retransmits are not making progress.
		 */
		if (rtx_delta > user_timeout)
			return true;
		timeout = min_t(u32, timeout, msecs_to_jiffies(user_timeout));
	}
	/* Note: timer interrupt might have been delayed by at least one jiffy,
	 * and tp->rcv_tstamp might very well have been written recently.
	 * rcv_delta can thus be negative.
	 */
	rcv_delta = tcp_timeout_expires(sk) - tp->rcv_tstamp;
	if (rcv_delta <= timeout)
		return false;

	return msecs_to_jiffies(rtx_delta) > timeout;
}

/**
 *  tcp_retransmit_timer() - The TCP retransmit timeout handler
 *  @sk:  Pointer to the current socket.
 *
 *  This function gets called when the kernel timer for a TCP packet
 *  of this socket expires.
*
 *  It handles retransmission, timer adjustment and other necessary measures.
 *
 *  Returns: Nothing (void)
 */
void tcp_retransmit_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock *req;
	struct sk_buff *skb;

	req = rcu_dereference_protected(tp->fastopen_rsk,
					lockdep_sock_is_held(sk));
	if (req) {
		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
			     sk->sk_state != TCP_FIN_WAIT1);
		tcp_fastopen_synack_timer(sk, req);
		/* Before we receive ACK to our SYN-ACK don't retransmit
		 * anything else (e.g., data or FIN segments).
		 */
		return;
	}

	if (!tp->packets_out)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
		/* Receiver dastardly shrinks window. Our retransmits
		 * become zero probes, but we should not timeout this
		 * connection. If the socket is an orphan, time it out,
		 * we cannot allow such beasts to hang infinitely.
		 */
		struct inet_sock *inet = inet_sk(sk);
		u32 rtx_delta;

		/* Time since the first retransmit, or since the head skb was
		 * sent when no retransmit stamp is set; scaled below when the
		 * socket keeps usec-resolution timestamps.
		 */
		rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?:
				tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
		if (tp->tcp_usec_ts)
			rtx_delta /= USEC_PER_MSEC;

		if (sk->sk_family == AF_INET) {
			net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
				&inet->inet_daddr, ntohs(inet->inet_dport),
				inet->inet_num, tp->snd_una, tp->snd_nxt,
				jiffies_to_msecs(jiffies - tp->rcv_tstamp),
				rtx_delta);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (sk->sk_family == AF_INET6) {
			net_dbg_ratelimited("Probing zero-window on %pI6:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
				&sk->sk_v6_daddr, ntohs(inet->inet_dport),
				inet->inet_num, tp->snd_una, tp->snd_nxt,
				jiffies_to_msecs(jiffies - tp->rcv_tstamp),
				rtx_delta);
		}
#endif
		if (tcp_rtx_probe0_timed_out(sk, skb, rtx_delta)) {
			tcp_write_err(sk);
			goto out;
		}
		tcp_enter_loss(sk);
		tcp_retransmit_skb(sk, skb, 1);
		__sk_dst_reset(sk);
		goto out_reset_timer;
	}

	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);

	if (tcp_write_timeout(sk))
		goto out;

	/* First timeout of this episode: record which state failed to recover. */
	if (icsk->icsk_retransmits == 0) {
		int mib_idx = 0;

		if (icsk->icsk_ca_state == TCP_CA_Recovery) {
			if (tcp_is_sack(tp))
				mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
			else
				mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
		} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
			mib_idx = LINUX_MIB_TCPLOSSFAILURES;
		} else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
			   tp->sacked_out) {
			if (tcp_is_sack(tp))
				mib_idx = LINUX_MIB_TCPSACKFAILURES;
			else
				mib_idx = LINUX_MIB_TCPRENOFAILURES;
		}
		if (mib_idx)
			__NET_INC_STATS(sock_net(sk), mib_idx);
	}

	tcp_enter_loss(sk);

	tcp_update_rto_stats(sk);
	if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
		/* Retransmission failed because of local congestion,
		 * Let senders fight for local resources conservatively.
		 */
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				     TCP_RESOURCE_PROBE_INTERVAL,
				     false);
		goto out;
	}

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */

out_reset_timer:
	/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
	 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
	 * might be increased if the stream oscillates between thin and thick,
	 * thus the old value might already be too high compared to the value
	 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
	 * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
	 * exponential backoff behaviour to avoid continue hammering
	 * linear-timeout retransmissions into a black hole
	 */
	if (sk->sk_state == TCP_ESTABLISHED &&
	    (tp->thin_lto || READ_ONCE(net->ipv4.sysctl_tcp_thin_linear_timeouts)) &&
	    tcp_stream_is_thin(tp) &&
	    icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
		icsk->icsk_backoff = 0;
		icsk->icsk_rto = clamp(__tcp_set_rto(tp),
				       tcp_rto_min(sk),
				       tcp_rto_max(sk));
	} else if (sk->sk_state != TCP_SYN_SENT ||
		   tp->total_rto >
		   READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) {
		/* Use normal (exponential) backoff unless linear timeouts are
		 * activated.
		 */
		icsk->icsk_backoff++;
		icsk->icsk_rto = min(icsk->icsk_rto << 1, tcp_rto_max(sk));
	}
	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
			     tcp_clamp_rto_to_user_timeout(sk), false);
	if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0))
		__sk_dst_reset(sk);

out:;
}

/* Called with bottom-half processing disabled.
 * Called by tcp_write_timer() and tcp_release_cb().
 *
 * Returns at once in CLOSE or LISTEN, or when no event is pending. If the
 * recorded expiry is still in the future the timer is re-armed; otherwise
 * the pending event is dispatched. For RETRANS and PROBE0 icsk_pending is
 * cleared with a store-release before the handler runs.
 */
void tcp_write_timer_handler(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	int event;

	if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
	    !icsk->icsk_pending)
		return;

	if (time_after(tcp_timeout_expires(sk), jiffies)) {
		sk_reset_timer(sk, &sk->tcp_retransmit_timer,
			       tcp_timeout_expires(sk));
		return;
	}
	tcp_mstamp_refresh(tcp_sk(sk));
	event = icsk->icsk_pending;

	switch (event) {
	case ICSK_TIME_REO_TIMEOUT:
		tcp_rack_reo_timeout(sk);
		break;
	case ICSK_TIME_LOSS_PROBE:
		tcp_send_loss_probe(sk);
		break;
	case ICSK_TIME_RETRANS:
		smp_store_release(&icsk->icsk_pending, 0);
		tcp_retransmit_timer(sk);
		break;
	case ICSK_TIME_PROBE0:
		smp_store_release(&icsk->icsk_pending, 0);
		tcp_probe_timer(sk);
		break;
	}
}

/*
 * Timer callback for the socket's tcp_retransmit_timer.
 *
 * Runs tcp_write_timer_handler() under the BH socket lock, or defers the
 * work through TCP_WRITE_TIMER_DEFERRED when the socket is owned by user
 * context. Always ends with sock_put().
 */
static void tcp_write_timer(struct timer_list *t)
{
	struct sock *sk = timer_container_of(sk, t, tcp_retransmit_timer);

	/* Avoid locking the socket when there is no pending event. */
	if (!smp_load_acquire(&inet_csk(sk)->icsk_pending))
		goto out;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		tcp_write_timer_handler(sk);
	} else {
		/* delegate our work to tcp_release_cb() */
		/* Take a reference only if the deferred bit was newly set. */
		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
			sock_hold(sk);
	}
	bh_unlock_sock(sk);
out:
	sock_put(sk);
}

/* Count a SYN-ACK timeout in the request's netns (LINUX_MIB_TCPTIMEOUTS). */
void tcp_syn_ack_timeout(const struct request_sock *req)
{
	struct net *net = read_pnet(&inet_rsk(req)->ireq_net);

	__NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS);
}

/* (Re)arm the keepalive timer to fire @len jiffies from now. */
void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
	sk_reset_timer(sk, &inet_csk(sk)->icsk_keepalive_timer, jiffies + len);
}

/* Stop the keepalive timer of @sk. */
static void tcp_delete_keepalive_timer(struct sock *sk)
{
	sk_stop_timer(sk, &inet_csk(sk)->icsk_keepalive_timer);
}

/*
 * Switch keepalive on or off for @sk.
 *
 * Ignored in CLOSE and LISTEN. A non-zero @val arms the keepalive timer
 * unless SOCK_KEEPOPEN is already set; a zero @val stops the timer.
 */
void tcp_set_keepalive(struct sock *sk, int val)
{
	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
		return;

	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
		tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
	else if (!val)
		tcp_delete_keepalive_timer(sk);
}
EXPORT_IPV6_MOD_GPL(tcp_set_keepalive);

/*
 * Keepalive timer callback.
 *
 * Besides sending keepalive probes, it also handles orphaned FIN_WAIT2
 * sockets: those are moved to time-wait when some linger time remains
 * beyond TCP_TIMEWAIT_LEN, and are reset and closed otherwise.
 * Every exit path drops the socket lock and calls sock_put().
 */
static void tcp_keepalive_timer(struct timer_list *t)
{
	struct inet_connection_sock *icsk =
		timer_container_of(icsk, t, icsk_keepalive_timer);
	struct sock *sk = &icsk->icsk_inet.sk;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 elapsed;

	/* Only process if socket is not in use. */
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		tcp_reset_keepalive_timer(sk, HZ/20);
		goto out;
	}

	if (sk->sk_state == TCP_LISTEN) {
		pr_err("Hmm... keepalive on a LISTEN ???\n");
		goto out;
	}

	tcp_mstamp_refresh(tp);
	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
		if (READ_ONCE(tp->linger2) >= 0) {
			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;

			if (tmo > 0) {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
		tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_STATE);
		goto death;
	}

	if (!sock_flag(sk, SOCK_KEEPOPEN) ||
	    ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)))
		goto out;

	elapsed = keepalive_time_when(tp);

	/* It is alive without keepalive 8) */
	if (tp->packets_out || !tcp_write_queue_empty(sk))
		goto resched;

	elapsed = keepalive_time_elapsed(tp);

	if (elapsed >= keepalive_time_when(tp)) {
		u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);

		/* If the TCP_USER_TIMEOUT option is enabled, use that
		 * to determine when to timeout instead.
		 */
		if ((user_timeout != 0 &&
		    elapsed >= msecs_to_jiffies(user_timeout) &&
		    icsk->icsk_probes_out > 0) ||
		    (user_timeout == 0 &&
		    icsk->icsk_probes_out >= keepalive_probes(tp))) {
			tcp_send_active_reset(sk, GFP_ATOMIC,
					      SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT);
			tcp_write_err(sk);
			goto out;
		}
		if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
			WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1);
			elapsed = keepalive_intvl_when(tp);
		} else {
			/* If keepalive was lost due to local congestion,
			 * try harder.
			 */
			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
		}
	} else {
		/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
		elapsed = keepalive_time_when(tp) - elapsed;
	}

resched:
	tcp_reset_keepalive_timer(sk, elapsed);
	goto out;

death:
	tcp_done(sk);

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/*
 * hrtimer callback for the compressed-ACK timer.
 *
 * Sends the pending ACK when the socket is not owned by user context,
 * otherwise defers through TCP_DELACK_TIMER_DEFERRED. Always returns
 * HRTIMER_NORESTART.
 */
static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
{
	struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer);
	struct sock *sk = (struct sock *)tp;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		if (tp->compressed_ack) {
			/* Since we have to send one ack finally,
			 * subtract one from tp->compressed_ack to keep
			 * LINUX_MIB_TCPACKCOMPRESSED accurate.
			 */
			tp->compressed_ack--;
			tcp_mstamp_refresh(tp);
			tcp_send_ack(sk);
		}
	} else {
		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
				      &sk->sk_tsq_flags))
			sock_hold(sk);
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return HRTIMER_NORESTART;
}

/*
 * Set up all per-socket TCP timers: the write, delayed-ACK and keepalive
 * timers via inet_csk_init_xmit_timers(), plus the pacing and
 * compressed-ACK hrtimers (both CLOCK_MONOTONIC, pinned, soft mode;
 * absolute for pacing, relative for compressed ACK).
 */
void tcp_init_xmit_timers(struct sock *sk)
{
	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
				  &tcp_keepalive_timer);
	hrtimer_setup(&tcp_sk(sk)->pacing_timer, tcp_pace_kick, CLOCK_MONOTONIC,
		      HRTIMER_MODE_ABS_PINNED_SOFT);

	hrtimer_setup(&tcp_sk(sk)->compressed_ack_timer, tcp_compressed_ack_kick,
		      CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_SOFT);
}
7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 /* * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. 
*
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/rdma_user_ioctl.h>
#include <rdma/uverbs_ioctl.h>
#include "rdma_core.h"
#include "uverbs.h"

/*
 * Header of one separately allocated chunk handed out by _uverbs_alloc()
 * when the bundle's internal buffer cannot satisfy a request. Chunks are
 * chained through @next; the caller's memory is the trailing @data array.
 */
struct bundle_alloc_head {
	struct_group_tagged(bundle_alloc_head_hdr, hdr,
		struct bundle_alloc_head *next;
	);
	u8 data[];
};

/*
 * Private wrapper around a uverbs_attr_bundle.
 *
 * Code in this file converts between the bundle header and this structure
 * with container_of() on the @bundle member.
 */
struct bundle_priv {
	/* Must be first */
	struct bundle_alloc_head_hdr alloc_head;
	/* Head of the list of chunks allocated outside internal_buffer. */
	struct bundle_alloc_head *allocated_mem;
	/* Bytes of internal_buffer usable by / already handed out by
	 * _uverbs_alloc().
	 */
	size_t internal_avail;
	size_t internal_used;

	struct radix_tree_root *radix;
	const struct uverbs_api_ioctl_method *method_elm;
	void __rcu **radix_slots;
	unsigned long radix_slots_len;
	u32 method_key;

	/* user_attrs points into user memory; uattrs is a kernel-side array
	 * indexed the same way (see uverbs_set_output()).
	 */
	struct ib_uverbs_attr __user *user_attrs;
	struct ib_uverbs_attr *uattrs;

	DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN);
	DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN);
	DECLARE_BITMAP(uobj_hw_obj_valid, UVERBS_API_ATTR_BKEY_LEN);

	/*
	 * Must be last. bundle ends in a flex array which overlaps
	 * internal_buffer.
	 */
	struct uverbs_attr_bundle_hdr bundle;
	u64 internal_buffer[32];
};

/*
 * Each method has an absolute minimum amount of memory it needs to allocate,
 * precompute that amount and determine if the onstack memory can be used or
 * if allocation is needed.
*/
void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm,
			      unsigned int num_attrs)
{
	/* Only used as sizeof() operands, never dereferenced. */
	struct bundle_priv *pbundle;
	struct uverbs_attr_bundle *bundle;
	size_t min_bytes;

	/*
	 * Minimum footprint: the fixed part of bundle_priv, one attrs[] slot
	 * per key of the method and a kernel copy of every user attribute.
	 */
	min_bytes = offsetof(struct bundle_priv, internal_buffer);
	min_bytes += sizeof(*bundle->attrs) * method_elm->key_bitmap_len;
	min_bytes += sizeof(*pbundle->uattrs) * num_attrs;

	/* The on-stack bundle_priv is usable when the minimum fits in it. */
	method_elm->use_stack = min_bytes <= sizeof(*pbundle);

	/* Heap size: 256 spare bytes, rounded to the internal_buffer unit. */
	method_elm->bundle_size =
		ALIGN(min_bytes + 256, sizeof(*pbundle->internal_buffer));

	/* Do not want order-2 allocations for this. */
	WARN_ON_ONCE(method_elm->bundle_size > PAGE_SIZE);
}

/**
 * _uverbs_alloc() - Quickly allocate memory for use with a bundle
 * @bundle: The bundle
 * @size: Number of bytes to allocate
 * @flags: Allocator flags
 *
 * The bundle allocator is intended for allocations that are connected with
 * processing the system call related to the bundle. The allocated memory is
 * always freed once the system call completes, and cannot be freed any other
 * way.
 *
 * This tries to use a small pool of pre-allocated memory for performance.
*/
__malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size,
			     gfp_t flags)
{
	struct bundle_priv *pbundle =
		container_of(&bundle->hdr, struct bundle_priv, bundle);
	size_t new_used;
	void *res;

	if (check_add_overflow(size, pbundle->internal_used, &new_used))
		return ERR_PTR(-EOVERFLOW);

	/*
	 * Does not fit in the internal pool: fall back to a separate
	 * allocation and chain it so bundle_destroy() can free it.
	 */
	if (new_used > pbundle->internal_avail) {
		struct bundle_alloc_head *buf;

		buf = kvmalloc_flex(*buf, data, size, flags);
		if (!buf)
			return ERR_PTR(-ENOMEM);
		buf->next = pbundle->allocated_mem;
		pbundle->allocated_mem = buf;
		return buf->data;
	}

	/* Carve the request out of the internal pool, keeping u64 alignment */
	res = (void *)pbundle->internal_buffer + pbundle->internal_used;
	pbundle->internal_used =
		ALIGN(new_used, sizeof(*pbundle->internal_buffer));
	if (want_init_on_alloc(flags))
		memset(res, 0, size);
	return res;
}
EXPORT_SYMBOL(_uverbs_alloc);

/*
 * Return true when every byte of the attribute payload past the first @len
 * bytes is zero. A payload longer than the inline data field lives in user
 * memory and is checked there; otherwise the inline data field is checked.
 */
static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr,
				   u16 len)
{
	if (uattr->len > sizeof_field(struct ib_uverbs_attr, data))
		return ib_is_buffer_cleared(u64_to_user_ptr(uattr->data) + len,
					    uattr->len - len);

	return !memchr_inv((const void *)&uattr->data + len,
			   0, uattr->len - len);
}

/*
 * Set UVERBS_ATTR_F_VALID_OUTPUT in the flags of the user-space copy of the
 * attribute that @attr was built from. Returns 0 or -EFAULT.
 */
static int uverbs_set_output(const struct uverbs_attr_bundle *bundle,
			     const struct uverbs_attr *attr)
{
	struct bundle_priv *pbundle =
		container_of(&bundle->hdr, struct bundle_priv, bundle);
	u16 flags;

	flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags |
		UVERBS_ATTR_F_VALID_OUTPUT;
	if (put_user(flags,
		     &pbundle->user_attrs[attr->ptr_attr.uattr_idx].flags))
		return -EFAULT;
	return 0;
}

/*
 * Convert a user supplied array of u32 ids into an array of uobject pointers
 * stored in @attr. On failure attr->len holds the number of objects that were
 * successfully looked up; the bkey is flagged in spec_finalize in either case
 * so that bundle_destroy() finalizes exactly those objects.
 */
static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
				     const struct uverbs_api_attr *attr_uapi,
				     struct uverbs_objs_arr_attr *attr,
				     struct ib_uverbs_attr *uattr,
				     u32 attr_bkey)
{
	struct uverbs_attr_bundle *bundle =
		container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr);
	const struct uverbs_attr_spec *spec = &attr_uapi->spec;
	size_t array_len;
	u32 *idr_vals;
	int ret = 0;
	size_t i;

	if (uattr->attr_data.reserved)
		return -EINVAL;

	/* The payload must be a whole number of u32 ids */
	if (uattr->len % sizeof(u32))
		return -EINVAL;

	array_len = uattr->len / sizeof(u32);
	if (array_len < spec->u2.objs_arr.min_len ||
	    array_len > spec->u2.objs_arr.max_len)
		return -EINVAL;

	attr->uobjects =
		uverbs_alloc(bundle,
			     array_size(array_len, sizeof(*attr->uobjects)));
	if (IS_ERR(attr->uobjects))
		return PTR_ERR(attr->uobjects);

	/*
	 * Since idr is 4B and *uobjects is >= 4B, we can use attr->uobjects
	 * to store idrs array and avoid additional memory allocation. The
	 * idrs array is offset to the end of the uobjects array so we will be
	 * able to read idr and replace with a pointer.
	 */
	idr_vals = (u32 *)(attr->uobjects + array_len) - array_len;

	/* Payloads larger than the inline data field live in user memory */
	if (uattr->len > sizeof(uattr->data)) {
		ret = copy_from_user(idr_vals, u64_to_user_ptr(uattr->data),
				     uattr->len);
		if (ret)
			return -EFAULT;
	} else {
		memcpy(idr_vals, &uattr->data, uattr->len);
	}

	for (i = 0; i != array_len; i++) {
		attr->uobjects[i] = uverbs_get_uobject_from_file(
			spec->u2.objs_arr.obj_type, spec->u2.objs_arr.access,
			idr_vals[i], bundle);
		if (IS_ERR(attr->uobjects[i])) {
			ret = PTR_ERR(attr->uobjects[i]);
			break;
		}
	}

	/* Only the first i entries hold valid uobject pointers */
	attr->len = i;
	__set_bit(attr_bkey, pbundle->spec_finalize);
	return ret;
}

/*
 * Finalize every uobject recorded in an IDRs array attribute, i.e. the first
 * attr->len entries of attr->uobjects.
 */
static void uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi,
				   struct uverbs_objs_arr_attr *attr,
				   bool commit,
				   struct uverbs_attr_bundle *attrs)
{
	const struct uverbs_attr_spec *spec = &attr_uapi->spec;
	size_t i;

	for (i = 0; i != attr->len; i++)
		uverbs_finalize_object(attr->uobjects[i],
				       spec->u2.objs_arr.access, false, commit,
				       attrs);
}

/*
 * Validate one user attribute against its spec and fill in the matching
 * bundle->attrs[attr_bkey] entry. The ENUM_IN, PTR_IN and PTR_OUT cases
 * deliberately fall through into each other: each adds its own checks and
 * then shares the common pointer handling. Returns 0 or a negative errno.
 */
static int uverbs_process_attr(struct bundle_priv *pbundle,
			       const struct uverbs_api_attr *attr_uapi,
			       struct ib_uverbs_attr *uattr, u32 attr_bkey)
{
	const struct uverbs_attr_spec *spec = &attr_uapi->spec;
	struct uverbs_attr_bundle *bundle =
		container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr);
	struct uverbs_attr *e = &bundle->attrs[attr_bkey];
	/* Spec used for the length checks; an enum redirects it below */
	const struct uverbs_attr_spec *val_spec = spec;
	struct uverbs_obj_attr *o_attr;

	switch (spec->type) {
	case UVERBS_ATTR_TYPE_ENUM_IN:
		if (uattr->attr_data.enum_data.elem_id >= spec->u.enum_def.num_elems)
			return -EOPNOTSUPP;

		if (uattr->attr_data.enum_data.reserved)
			return -EINVAL;

		val_spec = &spec->u2.enum_def.ids[uattr->attr_data.enum_data.elem_id];

		/* Currently we only support PTR_IN based enums */
		if (val_spec->type != UVERBS_ATTR_TYPE_PTR_IN)
			return -EOPNOTSUPP;

		e->ptr_attr.enum_id = uattr->attr_data.enum_data.elem_id;
		fallthrough;
	case UVERBS_ATTR_TYPE_PTR_IN:
		/* Ensure that any data provided by userspace beyond the known
		 * struct is zero. Userspace that knows how to use some future
		 * longer struct will fail here if used with an old kernel and
		 * non-zero content, making ABI compat/discovery simpler.
		 */
		if (uattr->len > val_spec->u.ptr.len &&
		    val_spec->zero_trailing &&
		    !uverbs_is_attr_cleared(uattr, val_spec->u.ptr.len))
			return -EOPNOTSUPP;

		fallthrough;
	case UVERBS_ATTR_TYPE_PTR_OUT:
		if (uattr->len < val_spec->u.ptr.min_len ||
		    (!val_spec->zero_trailing &&
		     uattr->len > val_spec->u.ptr.len))
			return -EINVAL;

		/* For enums the attr_data union carries enum_data instead */
		if (spec->type != UVERBS_ATTR_TYPE_ENUM_IN &&
		    uattr->attr_data.reserved)
			return -EINVAL;

		e->ptr_attr.uattr_idx = uattr - pbundle->uattrs;
		e->ptr_attr.len = uattr->len;

		if (val_spec->alloc_and_copy && !uverbs_attr_ptr_is_inline(e)) {
			void *p;

			/* Snapshot the user buffer into bundle memory */
			p = uverbs_alloc(bundle, uattr->len);
			if (IS_ERR(p))
				return PTR_ERR(p);

			e->ptr_attr.ptr = p;

			if (copy_from_user(p, u64_to_user_ptr(uattr->data),
					   uattr->len))
				return -EFAULT;
		} else {
			e->ptr_attr.data = uattr->data;
		}
		break;

	case UVERBS_ATTR_TYPE_IDR:
	case UVERBS_ATTR_TYPE_FD:
		if (uattr->attr_data.reserved)
			return -EINVAL;

		if (uattr->len != 0)
			return -EINVAL;

		o_attr = &e->obj_attr;
		o_attr->attr_elm = attr_uapi;

		/*
		 * The type of uattr->data is u64 for UVERBS_ATTR_TYPE_IDR and
		 * s64 for UVERBS_ATTR_TYPE_FD. We can cast the u64 to s64
		 * here without caring about truncation as we know that the
		 * IDR implementation today rejects negative IDs
		 */
		o_attr->uobject = uverbs_get_uobject_from_file(
			spec->u.obj.obj_type, spec->u.obj.access,
			uattr->data_s64, bundle);
		if (IS_ERR(o_attr->uobject))
			return PTR_ERR(o_attr->uobject);
		/* From here on bundle_destroy() must finalize the uobject */
		__set_bit(attr_bkey, pbundle->uobj_finalize);

		if (spec->u.obj.access == UVERBS_ACCESS_NEW) {
			unsigned int uattr_idx = uattr - pbundle->uattrs;
			s64 id = o_attr->uobject->id;

			/* Copy the allocated id to the user-space */
			if (put_user(id, &pbundle->user_attrs[uattr_idx].data))
				return -EFAULT;
		}

		break;

	case UVERBS_ATTR_TYPE_RAW_FD:
		if (uattr->attr_data.reserved || uattr->len != 0 ||
		    uattr->data_s64 < INT_MIN || uattr->data_s64 > INT_MAX)
			return -EINVAL;
		/* _uverbs_get_const_signed() is the accessor */
		e->ptr_attr.data = uattr->data_s64;
		break;

	case UVERBS_ATTR_TYPE_IDRS_ARRAY:
		return uverbs_process_idrs_array(pbundle, attr_uapi,
						 &e->objs_arr_attr, uattr,
						 attr_bkey);
	default:
		return -EOPNOTSUPP;
	}

	return 0;
}

/*
 * We search the radix tree with the method prefix and now we want to fast
 * search the suffix bits to get a particular attribute pointer. It is not
 * totally clear to me if this breaks the radix tree encapsulation or not, but
 * it uses the iter data to determine if the method iter points at the same
 * chunk that will store the attribute, if so it just derefs it directly. By
 * construction in most kernel configs the method and attrs will all fit in a
 * single radix chunk, so in most cases this will have no search. Other cases
 * this falls back to a full search.
*/
static void __rcu **uapi_get_attr_for_method(struct bundle_priv *pbundle,
					     u32 attr_key)
{
	void __rcu **slot;

	/* Fast path: the attribute sits in the chunk found for the method */
	if (likely(attr_key < pbundle->radix_slots_len)) {
		void *entry;

		slot = pbundle->radix_slots + attr_key;
		entry = rcu_dereference_raw(*slot);
		if (likely(!radix_tree_is_internal_node(entry) && entry))
			return slot;
	}

	/* Slow path: full lookup with the method prefix OR'ed in */
	return radix_tree_lookup_slot(pbundle->radix,
				      pbundle->method_key | attr_key);
}

/*
 * Look up the kernel description of one user attribute and process it.
 * Attributes the kernel does not know are ignored unless user-space flagged
 * them mandatory; an attribute id that was already seen is rejected.
 */
static int uverbs_set_attr(struct bundle_priv *pbundle,
			   struct ib_uverbs_attr *uattr)
{
	u32 attr_key = uapi_key_attr(uattr->attr_id);
	u32 attr_bkey = uapi_bkey_attr(attr_key);
	const struct uverbs_api_attr *attr;
	void __rcu **slot;
	int ret;

	slot = uapi_get_attr_for_method(pbundle, attr_key);
	if (!slot) {
		/*
		 * Kernel does not support the attribute but user-space says it
		 * is mandatory
		 */
		if (uattr->flags & UVERBS_ATTR_F_MANDATORY)
			return -EPROTONOSUPPORT;
		return 0;
	}
	attr = rcu_dereference_protected(*slot, true);

	/* Reject duplicate attributes from user-space */
	if (test_bit(attr_bkey, pbundle->bundle.attr_present))
		return -EINVAL;

	ret = uverbs_process_attr(pbundle, attr, uattr, attr_bkey);
	if (ret)
		return ret;

	/* Only attributes that were processed successfully count as present */
	__set_bit(attr_bkey, pbundle->bundle.attr_present);

	return 0;
}

/*
 * Copy the attribute array from user-space, validate every attribute, check
 * that all mandatory attributes are present and call the method handler.
 * Returns the handler result or a negative errno from the preparation steps;
 * cleanup of anything acquired here is left to bundle_destroy().
 */
static int ib_uverbs_run_method(struct bundle_priv *pbundle,
				unsigned int num_attrs)
{
	int (*handler)(struct uverbs_attr_bundle *attrs);
	struct uverbs_attr_bundle *bundle =
		container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr);
	size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs);
	unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey;
	unsigned int i;
	int ret;

	/* See uverbs_disassociate_api() */
	handler = srcu_dereference(
		pbundle->method_elm->handler,
		&pbundle->bundle.ufile->device->disassociate_srcu);
	if (!handler)
		return -EIO;

	pbundle->uattrs = uverbs_alloc(bundle, uattrs_size);
	if (IS_ERR(pbundle->uattrs))
		return PTR_ERR(pbundle->uattrs);
	if (copy_from_user(pbundle->uattrs, pbundle->user_attrs, uattrs_size))
		return -EFAULT;

	for (i = 0; i != num_attrs; i++) {
		ret = uverbs_set_attr(pbundle, &pbundle->uattrs[i]);
		if (unlikely(ret))
			return ret;
	}

	/* User space did not provide all the mandatory attributes */
	if (unlikely(!bitmap_subset(pbundle->method_elm->attr_mandatory,
				    pbundle->bundle.attr_present,
				    pbundle->method_elm->key_bitmap_len)))
		return -EINVAL;

	if (pbundle->method_elm->has_udata)
		uverbs_fill_udata(bundle, &pbundle->bundle.driver_udata,
				  UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT);
	else
		pbundle->bundle.driver_udata = (struct ib_udata){};

	/*
	 * A destroy_bkey other than UVERBS_API_ATTR_BKEY_LEN names the
	 * attribute whose object is destroyed before the handler runs. Its
	 * finalize bit is cleared, so bundle_destroy() skips that attribute;
	 * the reference is dropped here with uobj_put_destroy().
	 */
	if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {
		struct uverbs_obj_attr *destroy_attr =
			&bundle->attrs[destroy_bkey].obj_attr;

		ret = uobj_destroy(destroy_attr->uobject, bundle);
		if (ret)
			return ret;
		__clear_bit(destroy_bkey, pbundle->uobj_finalize);

		ret = handler(bundle);
		uobj_put_destroy(destroy_attr->uobject);
	} else {
		ret = handler(bundle);
	}

	/*
	 * Until the drivers are revised to use the bundle directly we have to
	 * assume that the driver wrote to its UHW_OUT and flag userspace
	 * appropriately.
	 */
	if (!ret && pbundle->method_elm->has_udata) {
		const struct uverbs_attr *attr =
			uverbs_attr_get(bundle, UVERBS_ATTR_UHW_OUT);

		if (!IS_ERR(attr))
			ret = uverbs_set_output(bundle, attr);
	}

	/*
	 * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can
	 * not invoke the method because the request is not supported.  No
	 * other cases should return this code.
	 */
	if (WARN_ON_ONCE(ret == -EPROTONOSUPPORT))
		return -EINVAL;

	return ret;
}

/*
 * Release everything a method invocation acquired: finalize single uobjects,
 * finalize IDRs arrays and free all memory chained on allocated_mem.
 * @commit is passed through to the finalize helpers.
 */
static void bundle_destroy(struct bundle_priv *pbundle, bool commit)
{
	unsigned int key_bitmap_len = pbundle->method_elm->key_bitmap_len;
	struct uverbs_attr_bundle *bundle =
		container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr);
	struct bundle_alloc_head *memblock;
	unsigned int i;

	/* fast path for simple uobjects */
	i = -1;
	while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len,
				  i + 1)) < key_bitmap_len) {
		struct uverbs_attr *attr = &bundle->attrs[i];

		uverbs_finalize_object(
			attr->obj_attr.uobject,
			attr->obj_attr.attr_elm->spec.u.obj.access,
			test_bit(i, pbundle->uobj_hw_obj_valid),
			commit, bundle);
	}

	/* Attributes that need type specific cleanup */
	i = -1;
	while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len,
				  i + 1)) < key_bitmap_len) {
		struct uverbs_attr *attr = &bundle->attrs[i];
		const struct uverbs_api_attr *attr_uapi;
		void __rcu **slot;

		/*
		 * NOTE(review): the key passed here already has method_key
		 * OR'ed in, while uverbs_set_attr() passes the bare attribute
		 * key. Presumably this always takes the helper's slow path -
		 * confirm.
		 */
		slot = uapi_get_attr_for_method(
			pbundle,
			pbundle->method_key | uapi_bkey_to_key_attr(i));
		if (WARN_ON(!slot))
			continue;

		attr_uapi = rcu_dereference_protected(*slot, true);

		if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {
			uverbs_free_idrs_array(attr_uapi, &attr->objs_arr_attr,
					       commit, bundle);
		}
	}

	/*
	 * Free the allocation chain. For a heap allocated bundle the last
	 * element is the bundle_priv itself (see ib_uverbs_cmd_verbs()), so
	 * pbundle must not be touched after this loop.
	 */
	for (memblock = pbundle->allocated_mem; memblock;) {
		struct bundle_alloc_head *tmp = memblock;

		memblock = memblock->next;
		kvfree(tmp);
	}
}

/*
 * Locate the method named by @hdr, set up a bundle_priv for it (on the stack
 * when the method's precomputed size allows, from the heap otherwise), run
 * the method and tear the bundle down again.
 */
static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
			       struct ib_uverbs_ioctl_hdr *hdr,
			       struct ib_uverbs_attr __user *user_attrs)
{
	const struct uverbs_api_ioctl_method *method_elm;
	struct uverbs_api *uapi = ufile->device->uapi;
	struct radix_tree_iter attrs_iter;
	struct bundle_priv *pbundle;
	struct bundle_priv onstack;
	void __rcu **slot;
	int ret;

	if (unlikely(hdr->driver_id != uapi->driver_id))
		return -EINVAL;

	slot = radix_tree_iter_lookup(
		&uapi->radix, &attrs_iter,
		uapi_key_obj(hdr->object_id) |
			uapi_key_ioctl_method(hdr->method_id));
	if (unlikely(!slot))
		return -EPROTONOSUPPORT;
	method_elm = rcu_dereference_protected(*slot, true);

	if (!method_elm->use_stack) {
		pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL);
		if (!pbundle)
			return -ENOMEM;
		pbundle->internal_avail =
			method_elm->bundle_size -
			offsetof(struct bundle_priv, internal_buffer);
		/*
		 * Put the bundle itself on the allocation chain so that
		 * bundle_destroy() frees it together with everything else.
		 */
		pbundle->alloc_head.next = NULL;
		pbundle->allocated_mem = container_of(&pbundle->alloc_head,
						struct bundle_alloc_head, hdr);
	} else {
		pbundle = &onstack;
		pbundle->internal_avail = sizeof(pbundle->internal_buffer);
		pbundle->allocated_mem = NULL;
	}

	/* Space for the pbundle->bundle.attrs flex array */
	pbundle->method_elm = method_elm;
	pbundle->method_key = attrs_iter.index;
	pbundle->bundle.ufile = ufile;
	pbundle->bundle.context = NULL; /* only valid if bundle has uobject */
	pbundle->radix = &uapi->radix;
	pbundle->radix_slots = slot;
	pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter);
	pbundle->user_attrs = user_attrs;

	/* The attrs[] array occupies the start of internal_buffer */
	pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len *
					       sizeof(*container_of(&pbundle->bundle,
							struct uverbs_attr_bundle, hdr)->attrs),
				       sizeof(*pbundle->internal_buffer));
	memset(pbundle->bundle.attr_present, 0,
	       sizeof(pbundle->bundle.attr_present));
	memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize));
	memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize));
	memset(pbundle->uobj_hw_obj_valid, 0,
	       sizeof(pbundle->uobj_hw_obj_valid));

	ret = ib_uverbs_run_method(pbundle, hdr->num_attrs);
	/* Commit only when the method succeeded */
	bundle_destroy(pbundle, ret == 0);
	return ret;
}

/*
 * ioctl entry point: copy in and sanity check the header, then run the
 * requested method inside an SRCU read-side section on disassociate_srcu.
 */
long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct ib_uverbs_file *file = filp->private_data;
	struct ib_uverbs_ioctl_hdr __user *user_hdr =
		(struct ib_uverbs_ioctl_hdr __user *)arg;
	struct ib_uverbs_ioctl_hdr hdr;
	int srcu_key;
	int err;

	if (unlikely(cmd != RDMA_VERBS_IOCTL))
		return -ENOIOCTLCMD;

	err = copy_from_user(&hdr, user_hdr, sizeof(hdr));
	if (err)
		return -EFAULT;

	/* length must describe exactly the header plus num_attrs attributes */
	if (hdr.length > PAGE_SIZE ||
	    hdr.length != struct_size(&hdr, attrs, hdr.num_attrs))
		return -EINVAL;

	if (hdr.reserved1 || hdr.reserved2)
		return -EPROTONOSUPPORT;

	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
	err = ib_uverbs_cmd_verbs(file, &hdr, user_hdr->attrs);
	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
	return err;
}

/*
 * Read a flags attribute of 4 or 8 bytes into *to. A missing attribute
 * yields 0 flags; any bit outside @allowed_bits or any other length gives
 * -EINVAL.
 */
int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle,
		       size_t idx, u64 allowed_bits)
{
	const struct uverbs_attr *attr;
	u64 flags;

	attr = uverbs_attr_get(attrs_bundle, idx);
	/* Missing attribute means 0 flags */
	if (IS_ERR(attr)) {
		*to = 0;
		return 0;
	}

	/*
	 * New userspace code should use 8 bytes to pass flags, but we
	 * transparently support old userspaces that were using 4 bytes as
	 * well.
	 */
	if (attr->ptr_attr.len == 8)
		flags = attr->ptr_attr.data;
	else if (attr->ptr_attr.len == 4)
		flags = *(u32 *)&attr->ptr_attr.data;
	else
		return -EINVAL;

	if (flags & ~allowed_bits)
		return -EINVAL;

	*to = flags;
	return 0;
}
EXPORT_SYMBOL(uverbs_get_flags64);

/*
 * Same as uverbs_get_flags64() but additionally rejects values that do not
 * fit in 32 bits.
 */
int uverbs_get_flags32(u32 *to, const struct uverbs_attr_bundle *attrs_bundle,
		       size_t idx, u64 allowed_bits)
{
	u64 flags;
	int ret;

	ret = uverbs_get_flags64(&flags, attrs_bundle, idx, allowed_bits);
	if (ret)
		return ret;

	if (flags > U32_MAX)
		return -EINVAL;
	*to = flags;

	return 0;
}
EXPORT_SYMBOL(uverbs_get_flags32);

/*
 * Fill a ib_udata struct (core or uhw) using the given attribute IDs.
 * This is primarily used to convert the UVERBS_ATTR_UHW() into the
 * ib_udata format used by the drivers.
*/ void uverbs_fill_udata(struct uverbs_attr_bundle *bundle, struct ib_udata *udata, unsigned int attr_in, unsigned int attr_out) { struct bundle_priv *pbundle = container_of(&bundle->hdr, struct bundle_priv, bundle); struct uverbs_attr_bundle *bundle_aux = container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); const struct uverbs_attr *in = uverbs_attr_get(bundle_aux, attr_in); const struct uverbs_attr *out = uverbs_attr_get(bundle_aux, attr_out); if (!IS_ERR(in)) { udata->inlen = in->ptr_attr.len; if (uverbs_attr_ptr_is_inline(in)) udata->inbuf = &pbundle->user_attrs[in->ptr_attr.uattr_idx] .data; else udata->inbuf = u64_to_user_ptr(in->ptr_attr.data); } else { udata->inbuf = NULL; udata->inlen = 0; } if (!IS_ERR(out)) { udata->outbuf = u64_to_user_ptr(out->ptr_attr.data); udata->outlen = out->ptr_attr.len; } else { udata->outbuf = NULL; udata->outlen = 0; } } int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx, const void *from, size_t size) { const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); size_t min_size; if (IS_ERR(attr)) return PTR_ERR(attr); min_size = min_t(size_t, attr->ptr_attr.len, size); if (copy_to_user(u64_to_user_ptr(attr->ptr_attr.data), from, min_size)) return -EFAULT; return uverbs_set_output(bundle, attr); } EXPORT_SYMBOL(uverbs_copy_to); /* * This is only used if the caller has directly used copy_to_use to write the * data. It signals to user space that the buffer is filled in. 
*/ int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx) { const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); if (IS_ERR(attr)) return PTR_ERR(attr); return uverbs_set_output(bundle, attr); } int _uverbs_get_const_signed(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, size_t idx, s64 lower_bound, u64 upper_bound, s64 *def_val) { const struct uverbs_attr *attr; attr = uverbs_attr_get(attrs_bundle, idx); if (IS_ERR(attr)) { if ((PTR_ERR(attr) != -ENOENT) || !def_val) return PTR_ERR(attr); *to = *def_val; } else { *to = attr->ptr_attr.data; } if (*to < lower_bound || (*to > 0 && (u64)*to > upper_bound)) return -EINVAL; return 0; } EXPORT_SYMBOL(_uverbs_get_const_signed); int _uverbs_get_const_unsigned(u64 *to, const struct uverbs_attr_bundle *attrs_bundle, size_t idx, u64 upper_bound, u64 *def_val) { const struct uverbs_attr *attr; attr = uverbs_attr_get(attrs_bundle, idx); if (IS_ERR(attr)) { if ((PTR_ERR(attr) != -ENOENT) || !def_val) return PTR_ERR(attr); *to = *def_val; } else { *to = attr->ptr_attr.data; } if (*to > upper_bound) return -EINVAL; return 0; } EXPORT_SYMBOL(_uverbs_get_const_unsigned); int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, size_t idx, const void *from, size_t size) { const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); if (IS_ERR(attr)) return PTR_ERR(attr); if (size < attr->ptr_attr.len) { if (clear_user(u64_to_user_ptr(attr->ptr_attr.data) + size, attr->ptr_attr.len - size)) return -EFAULT; } return uverbs_copy_to(bundle, idx, from, size); } EXPORT_SYMBOL(uverbs_copy_to_struct_or_zero); /* Once called an abort will call through to the type's destroy_hw() */ void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *bundle, u16 idx) { struct bundle_priv *pbundle = container_of(&bundle->hdr, struct bundle_priv, bundle); __set_bit(uapi_bkey_attr(uapi_key_attr(idx)), pbundle->uobj_hw_obj_valid); } EXPORT_SYMBOL(uverbs_finalize_uobj_create);
6 6 5 6 6 6 6 6 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/dir.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/dir.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 directory handling functions * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * * Hash Tree Directory indexing (c) 2001 Daniel Phillips * */ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/filelock.h> #include <linux/slab.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "ext4.h" #include "xattr.h" static int ext4_dx_readdir(struct file *, struct dir_context *); /** * is_dx_dir() - check if a directory is using htree indexing * @inode: directory inode * * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which could potentially get converted to use htree * indexing). 
*
 * Return 1 if it is a dx dir, 0 if not
 */
static int is_dx_dir(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	/*
	 * Requires the dir_index feature plus one of: the INDEX inode flag,
	 * a directory of exactly one block, or inline data.
	 */
	if (ext4_has_feature_dir_index(inode->i_sb) &&
	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
	     ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
	     ext4_has_inline_data(inode)))
		return 1;

	return 0;
}

/*
 * Return true for entries that are treated as "fake" by
 * __ext4_check_dir_entry(): the "." / ".." style entries matched below and
 * checksum entries (file_type == EXT4_FT_DIR_CSUM).
 */
static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de)
{
	/* Check if . or .. , or skip if namelen is 0 */
	if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') &&
	    (de->name[1] == '.' || de->name[1] == '\0'))
		return true;
	/* Check if this is a csum entry */
	if (de->file_type == EXT4_FT_DIR_CSUM)
		return true;
	return false;
}

/*
 * Return 0 if the directory entry is OK, and 1 if there is a problem
 *
 * Note: this is the opposite of what ext2 and ext3 historically returned...
 *
 * bh passed here can be an inode block or a dir data block, depending
 * on the inode inline data flag.
 *
 * The checks run in order and the first failing one selects the message that
 * is reported through ext4_error_file() (when @filp is set) or
 * ext4_error_inode().
 */
int __ext4_check_dir_entry(const char *function, unsigned int line,
			   struct inode *dir, struct file *filp,
			   struct ext4_dir_entry_2 *de,
			   struct buffer_head *bh, char *buf, int size,
			   unsigned int offset)
{
	const char *error_msg = NULL;
	const int rlen = ext4_rec_len_from_disk(de->rec_len,
						dir->i_sb->s_blocksize);
	/* Offset within @buf of the first byte after this entry */
	const int next_offset = ((char *) de - buf) + rlen;
	bool fake = is_fake_dir_entry(de);
	bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb);

	if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
		error_msg = "rec_len is smaller than minimal";
	else if (unlikely(rlen % 4 != 0))
		error_msg = "rec_len % 4 != 0";
	else if (unlikely(rlen < ext4_dir_rec_len(de->name_len,
						  fake ? NULL : dir)))
		error_msg = "rec_len is too small for name_len";
	else if (unlikely(next_offset > size))
		error_msg = "directory entry overrun";
	else if (unlikely(next_offset > size - ext4_dir_rec_len(1,
						  has_csum ? NULL : dir) &&
			  next_offset != size))
		error_msg = "directory entry too close to block end";
	else if (unlikely(le32_to_cpu(de->inode) >
			le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
		error_msg = "inode out of bounds";
	else if (unlikely(next_offset == size && de->name_len == 1 &&
			  de->name[0] == '.'))
		error_msg = "'.' directory cannot be the last in data block";
	else
		return 0;

	if (filp)
		ext4_error_file(filp, function, line, bh->b_blocknr,
				"bad entry in directory: %s - offset=%u, "
				"inode=%u, rec_len=%d, size=%d fake=%d",
				error_msg, offset, le32_to_cpu(de->inode),
				rlen, size, fake);
	else
		ext4_error_inode(dir, function, line, bh->b_blocknr,
				"bad entry in directory: %s - offset=%u, "
				"inode=%u, rec_len=%d, size=%d fake=%d",
				 error_msg, offset, le32_to_cpu(de->inode),
				 rlen, size, fake);

	return 1;
}

/*
 * Directory iteration entry point. Indexed directories are handed to
 * ext4_dx_readdir() and inline directories to ext4_read_inline_dir();
 * otherwise the directory is walked block by block and each valid entry is
 * passed to dir_emit(). ctx->pos is the byte offset within the directory.
 */
static int ext4_readdir(struct file *file, struct dir_context *ctx)
{
	unsigned int offset;
	int i;
	struct ext4_dir_entry_2 *de;
	int err;
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	struct buffer_head *bh = NULL;
	struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
	struct dir_private_info *info = file->private_data;

	err = fscrypt_prepare_readdir(inode);
	if (err)
		return err;

	if (is_dx_dir(inode)) {
		err = ext4_dx_readdir(file, ctx);
		/* Any result other than ERR_BAD_DX_DIR is final */
		if (err != ERR_BAD_DX_DIR)
			return err;

		/* Can we just clear INDEX flag to ignore htree information? */
		if (!ext4_has_feature_metadata_csum(sb)) {
			/*
			 * We don't set the inode dirty flag since it's not
			 * critical that it gets flushed back to the disk.
			 */
			ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
		}
	}

	if (ext4_has_inline_data(inode)) {
		int has_inline_data = 1;
		err = ext4_read_inline_dir(file, ctx,
					   &has_inline_data);
		/* The helper clears the flag when there is no inline data */
		if (has_inline_data)
			return err;
	}

	/* Scratch buffer for presenting encrypted names to user-space */
	if (IS_ENCRYPTED(inode)) {
		err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr);
		if (err < 0)
			return err;
	}

	while (ctx->pos < inode->i_size) {
		struct ext4_map_blocks map;

		if (fatal_signal_pending(current)) {
			err = -ERESTARTSYS;
			goto errout;
		}
		cond_resched();
		offset = ctx->pos & (sb->s_blocksize - 1);
		map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
		map.m_len = 1;
		err = ext4_map_blocks(NULL, inode, &map, 0);
		if (err == 0) {
			/* m_len should never be zero but let's avoid
			 * an infinite loop if it somehow is */
			if (map.m_len == 0)
				map.m_len = 1;
			ctx->pos += map.m_len * sb->s_blocksize;
			continue;
		}
		if (err > 0) {
			pgoff_t index = map.m_pblk << inode->i_blkbits >>
					PAGE_SHIFT;
			if (!ra_has_index(&file->f_ra, index))
				page_cache_sync_readahead(
					sb->s_bdev->bd_mapping,
					&file->f_ra, file, index,
					1 << EXT4_SB(sb)->s_min_folio_order);
			file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
			bh = ext4_bread(NULL, inode, map.m_lblk, 0);
			if (IS_ERR(bh)) {
				err = PTR_ERR(bh);
				bh = NULL;
				goto errout;
			}
		}

		/*
		 * Also reached with err < 0 from ext4_map_blocks(): bh is
		 * still NULL then and the block is skipped.
		 */
		if (!bh) {
			/* corrupt size?  Maybe no more blocks to read */
			if (ctx->pos > inode->i_blocks << 9)
				break;
			ctx->pos += sb->s_blocksize - offset;
			continue;
		}

		/* Check the checksum */
		if (!buffer_verified(bh) &&
		    !ext4_dirblock_csum_verify(inode, bh)) {
			EXT4_ERROR_FILE(file, 0, "directory fails checksum "
					"at offset %llu",
					(unsigned long long)ctx->pos);
			ctx->pos += sb->s_blocksize - offset;
			brelse(bh);
			bh = NULL;
			continue;
		}
		set_buffer_verified(bh);

		/* If the dir block has changed since the last call to
		 * readdir(2), then we might be pointing to an invalid
		 * dirent right now.  Scan from the start of the block
		 * to make sure. */
		if (!inode_eq_iversion(inode, info->cookie)) {
			for (i = 0; i < sb->s_blocksize && i < offset; ) {
				de = (struct ext4_dir_entry_2 *)
					(bh->b_data + i);
				/* It's too expensive to do a full
				 * dirent test each time round this
				 * loop, but we do have to test at
				 * least that it is non-zero.  A
				 * failure will be detected in the
				 * dirent test below. */
				if (ext4_rec_len_from_disk(de->rec_len,
					sb->s_blocksize) < ext4_dir_rec_len(1,
									inode))
					break;
				i += ext4_rec_len_from_disk(de->rec_len,
							    sb->s_blocksize);
			}
			/* Resume at the entry boundary found by the scan */
			offset = i;
			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
				| offset;
			info->cookie = inode_query_iversion(inode);
		}

		while (ctx->pos < inode->i_size
		       && offset < sb->s_blocksize) {
			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
			if (ext4_check_dir_entry(inode, file, de, bh,
						 bh->b_data, bh->b_size,
						 offset)) {
				/*
				 * On error, skip to the next block
				 */
				ctx->pos = (ctx->pos |
						(sb->s_blocksize - 1)) + 1;
				break;
			}
			offset += ext4_rec_len_from_disk(de->rec_len,
					sb->s_blocksize);
			/* Entries with inode 0 are not emitted */
			if (le32_to_cpu(de->inode)) {
				if (!IS_ENCRYPTED(inode)) {
					if (!dir_emit(ctx, de->name,
					    de->name_len,
					    le32_to_cpu(de->inode),
					    get_dtype(sb, de->file_type)))
						goto done;
				} else {
					int save_len = fstr.len;
					struct fscrypt_str de_name =
							FSTR_INIT(de->name,
								de->name_len);
					u32 hash;
					u32 minor_hash;

					if (IS_CASEFOLDED(inode)) {
						hash = EXT4_DIRENT_HASH(de);
						minor_hash = EXT4_DIRENT_MINOR_HASH(de);
					} else {
						hash = 0;
						minor_hash = 0;
					}

					/* Directory is encrypted */
					err = fscrypt_fname_disk_to_usr(inode,
						hash, minor_hash, &de_name, &fstr);
					/*
					 * Emit from the converted name, then
					 * restore the scratch buffer length
					 * for the next entry.
					 */
					de_name = fstr;
					fstr.len = save_len;
					if (err)
						goto errout;
					if (!dir_emit(ctx,
					    de_name.name, de_name.len,
					    le32_to_cpu(de->inode),
					    get_dtype(sb, de->file_type)))
						goto done;
				}
			}
			ctx->pos += ext4_rec_len_from_disk(de->rec_len,
						sb->s_blocksize);
		}
		if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode))
			goto done;
		brelse(bh);
		bh = NULL;
	}
done:
	err = 0;
errout:
	fscrypt_fname_free_buffer(&fstr);
	brelse(bh);
	return err;
}

/*
 * Non-zero when hash positions must fit a 32-bit API: decided by
 * in_compat_syscall() with CONFIG_COMPAT, by BITS_PER_LONG otherwise.
 */
static inline int is_32bit_api(void)
{
#ifdef CONFIG_COMPAT
	return in_compat_syscall();
#else
	return (BITS_PER_LONG == 32);
#endif
}

/*
 * These functions convert from the major/minor hash to an f_pos
 * value for dx directories
 *
 * Upper layer (for example NFS) should specify FMODE_32BITHASH or
 * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
 * directly on both 32-bit and 64-bit nodes, under such case, neither
 * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
 */
static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
{
	if ((filp->f_mode & FMODE_32BITHASH) ||
	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
		return major >> 1;
	else
		return ((__u64)(major >> 1) << 32) | (__u64)minor;
}

/* Inverse of hash2pos() for the major hash */
static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
{
	if ((filp->f_mode & FMODE_32BITHASH) ||
	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
		return (pos << 1) & 0xffffffff;
	else
		return ((pos >> 32) << 1) & 0xffffffff;
}

/* Inverse of hash2pos() for the minor hash; always 0 in 32-bit mode */
static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
{
	if ((filp->f_mode & FMODE_32BITHASH) ||
	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
		return 0;
	else
		return pos & 0xffffffff;
}

/*
 * Return 32- or 64-bit end-of-file for dx directories
 */
static inline loff_t ext4_get_htree_eof(struct file *filp)
{
	if ((filp->f_mode & FMODE_32BITHASH) ||
	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
		return EXT4_HTREE_EOF_32BIT;
	else
		return EXT4_HTREE_EOF_64BIT;
}


/*
 * ext4_dir_llseek() calls generic_file_llseek_size to handle htree
 * directories, where the "offset" is in terms of the filename hash
 * value instead of the byte offset.
 *
 * Because we may return a 64-bit hash that is well beyond offset limits,
 * we need to pass the max hash as the maximum allowable offset in
 * the htree directory case.
 *
 * For non-htree, ext4_llseek already chooses the proper max offset.
*/ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; struct dir_private_info *info = file->private_data; int dx_dir = is_dx_dir(inode); loff_t ret, htree_max = ext4_get_htree_eof(file); if (likely(dx_dir)) ret = generic_file_llseek_size(file, offset, whence, htree_max, htree_max); else ret = ext4_llseek(file, offset, whence); info->cookie = inode_peek_iversion(inode) - 1; return ret; } /* * This structure holds the nodes of the red-black tree used to store * the directory entry in hash order. */ struct fname { __u32 hash; __u32 minor_hash; struct rb_node rb_hash; struct fname *next; __u32 inode; __u8 name_len; __u8 file_type; char name[] __counted_by(name_len); }; /* * This function implements a non-recursive way of freeing all of the * nodes in the red-black tree. */ static void free_rb_tree_fname(struct rb_root *root) { struct fname *fname, *next; rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; fname = fname->next; kfree(old); } *root = RB_ROOT; } static void ext4_htree_init_dir_info(struct file *filp, loff_t pos) { struct dir_private_info *p = filp->private_data; if (is_dx_dir(file_inode(filp)) && !p->initialized) { p->curr_hash = pos2maj_hash(filp, pos); p->curr_minor_hash = pos2min_hash(filp, pos); p->initialized = true; } } void ext4_htree_free_dir_info(struct dir_private_info *p) { free_rb_tree_fname(&p->root); kfree(p); } /* * Given a directory entry, enter it into the fname rb tree. * * When filename encryption is enabled, the dirent will hold the * encrypted filename, while the htree will hold decrypted filename. * The decrypted filename is passed in via ent_name. parameter. 
*/
/*
 * Returns 0 on success or -ENOMEM if the new entry cannot be allocated.
 * The entry's inode number and file type are taken from @dirent, while
 * its name bytes are copied from @ent_name.
 */
int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
			     __u32 minor_hash,
			    struct ext4_dir_entry_2 *dirent,
			    struct fscrypt_str *ent_name)
{
	struct dir_private_info *info = dir_file->private_data;
	struct rb_node **link = &info->root.rb_node;
	struct rb_node *parent = NULL;
	struct fname *entry;

	/* Create and allocate the fname structure */
	entry = kzalloc_flex(*entry, name, ent_name->len + 1);
	if (!entry)
		return -ENOMEM;

	entry->hash = hash;
	entry->minor_hash = minor_hash;
	entry->inode = le32_to_cpu(dirent->inode);
	/* name_len is the __counted_by() bound, so set it before the copy */
	entry->name_len = ent_name->len;
	entry->file_type = dirent->file_type;
	memcpy(entry->name, ent_name->name, ent_name->len);

	/* Descend to the insertion point, ordering by hash then minor hash */
	while (*link) {
		struct fname *cur;

		parent = *link;
		cur = rb_entry(parent, struct fname, rb_hash);

		/*
		 * A full collision on both hashes does not get a tree node
		 * of its own; the new entry is chained right behind the
		 * existing node instead.  This rarely happens...
		 */
		if (entry->hash == cur->hash &&
		    entry->minor_hash == cur->minor_hash) {
			entry->next = cur->next;
			cur->next = entry;
			return 0;
		}

		if (entry->hash != cur->hash)
			link = (entry->hash < cur->hash) ?
				&parent->rb_left : &parent->rb_right;
		else
			link = (entry->minor_hash < cur->minor_hash) ?
				&parent->rb_left : &parent->rb_right;
	}

	rb_link_node(&entry->rb_hash, parent, link);
	rb_insert_color(&entry->rb_hash, &info->root);
	return 0;
}

/*
 * This is a helper function for ext4_dx_readdir.  It calls filldir
 * for all entries on the fname linked list.  (Normally there is only
 * one entry on the linked list, unless there are 62 bit hash collisions.)
*/
/*
 * Returns 1 when dir_emit() refuses an entry; that entry is remembered in
 * info->extra_fname.  Returns 0 when the whole chain was emitted, and
 * also when called with a NULL @fname (after logging an error).
 */
static int call_filldir(struct file *file, struct dir_context *ctx,
			struct fname *fname)
{
	struct dir_private_info *info = file->private_data;
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;

	if (!fname) {
		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
			 "called with null fname?!?", __func__, __LINE__,
			 inode->i_ino, current->comm);
		return 0;
	}
	/* Every entry on the chain shares the same hashes, hence one pos */
	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
	while (fname) {
		if (!dir_emit(ctx, fname->name,
				fname->name_len,
				fname->inode,
				get_dtype(sb, fname->file_type))) {
			/* Remember where to resume on the next call */
			info->extra_fname = fname;
			return 1;
		}
		fname = fname->next;
	}
	return 0;
}

/*
 * readdir for hashed (dx) directories.
 *
 * Entries are emitted from the per-file rb tree of cached names, which is
 * (re)filled through ext4_htree_fill_tree() whenever it runs dry or the
 * inode version no longer matches info->cookie.
 *
 * Returns a negative error from ext4_htree_fill_tree(), otherwise 0.
 * info->last_pos records ctx->pos on every path through "finished".
 */
static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
{
	struct dir_private_info *info = file->private_data;
	struct inode *inode = file_inode(file);
	struct fname *fname;
	int ret = 0;

	ext4_htree_init_dir_info(file, ctx->pos);

	if (ctx->pos == ext4_get_htree_eof(file))
		return 0;	/* EOF */

	/* Some one has messed with f_pos; reset the world */
	if (info->last_pos != ctx->pos) {
		free_rb_tree_fname(&info->root);
		info->curr_node = NULL;
		info->extra_fname = NULL;
		info->curr_hash = pos2maj_hash(file, ctx->pos);
		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
	}

	/*
	 * If there are any leftover names on the hash collision
	 * chain, return them first.
	 */
	if (info->extra_fname) {
		if (call_filldir(file, ctx, info->extra_fname))
			goto finished;
		info->extra_fname = NULL;
		/* Jumps into the loop body below, past the refill check */
		goto next_node;
	} else if (!info->curr_node)
		info->curr_node = rb_first(&info->root);

	while (1) {
		/*
		 * Fill the rbtree if we have no more entries,
		 * or the inode has changed since we last read in the
		 * cached entries.
		 */
		if ((!info->curr_node) ||
		    !inode_eq_iversion(inode, info->cookie)) {
			info->curr_node = NULL;
			free_rb_tree_fname(&info->root);
			info->cookie = inode_query_iversion(inode);
			ret = ext4_htree_fill_tree(file, info->curr_hash,
						   info->curr_minor_hash,
						   &info->next_hash);
			if (ret < 0)
				goto finished;
			/* A zero return is treated as end of directory */
			if (ret == 0) {
				ctx->pos = ext4_get_htree_eof(file);
				break;
			}
			info->curr_node = rb_first(&info->root);
		}

		fname = rb_entry(info->curr_node, struct fname, rb_hash);
		info->curr_hash = fname->hash;
		info->curr_minor_hash = fname->minor_hash;
		if (call_filldir(file, ctx, fname))
			break;
	next_node:
		info->curr_node = rb_next(info->curr_node);
		if (info->curr_node) {
			fname = rb_entry(info->curr_node, struct fname,
					 rb_hash);
			info->curr_hash = fname->hash;
			info->curr_minor_hash = fname->minor_hash;
		} else {
			/* Tree exhausted: a next_hash of ~0 is treated as EOF */
			if (info->next_hash == ~0) {
				ctx->pos = ext4_get_htree_eof(file);
				break;
			}
			info->curr_hash = info->next_hash;
			info->curr_minor_hash = 0;
		}
	}
finished:
	info->last_pos = ctx->pos;
	return ret < 0 ? ret : 0;
}

/* Release hook: free the private info set up by ext4_dir_open(), if any. */
static int ext4_release_dir(struct inode *inode, struct file *filp)
{
	if (filp->private_data)
		ext4_htree_free_dir_info(filp->private_data);

	return 0;
}

/*
 * Run ext4_check_dir_entry() over every entry in @buf.
 *
 * Returns 0 if every entry passes and the entries end exactly at
 * @buf + @buf_size, -EFSCORRUPTED otherwise.
 */
int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
		      int buf_size)
{
	struct ext4_dir_entry_2 *de;
	int rlen;
	unsigned int offset = 0;
	char *top;

	de = buf;
	top = buf + buf_size;
	while ((char *) de < top) {
		if (ext4_check_dir_entry(dir, NULL, de, bh,
					 buf, buf_size, offset))
			return -EFSCORRUPTED;
		rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
		de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
		offset += rlen;
	}
	/* The last record must not run past the end of the buffer */
	if ((char *) de > top)
		return -EFSCORRUPTED;

	return 0;
}

/*
 * Open hook: allocate the zeroed per-open dir_private_info and attach it
 * to the file.  Returns -ENOMEM if the allocation fails.
 */
static int ext4_dir_open(struct inode *inode, struct file *file)
{
	struct dir_private_info *info;

	info = kzalloc_obj(*info);
	if (!info)
		return -ENOMEM;
	file->private_data = info;
	return 0;
}

/* File operations table for ext4 directories. */
const struct file_operations ext4_dir_operations = {
	.open		= ext4_dir_open,
	.llseek		= ext4_dir_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= ext4_readdir,
	.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif
	.fsync		= ext4_sync_file,
	.release	= ext4_release_dir,
	.setlease	= generic_setlease,
};
22 12 16 3 2 7 22 11 17 14 16 9 5 2 14 14 16 19 19 15 3 16 18 1 9 4 2 2 8 4 4 16 16 9 5 10 3 10 7 2 7 61 59 59 45 13 45 45 14 14 1 9 4 11 2 11 2 10 3 5 8 10 3 12 12 2 149 174 2 1 1 1 172 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 
481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: IP/IP protocol decoder modified to support * virtual tunnel interface * * Authors: * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012 */ /* This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c For comments look at net/ipv4/ip_gre.c --ANK */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> #include <linux/icmpv6.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> static struct rtnl_link_ops 
vti_link_ops __read_mostly;
static unsigned int vti_net_id __read_mostly;
static int vti_tunnel_init(struct net_device *dev);

/*
 * Common receive path: look up the VTI tunnel matching the outer IPv4
 * source/destination and the receiving ifindex, then hand the packet to
 * xfrm_input().
 *
 * Returns -EINVAL when no tunnel matches (the skb is not freed here),
 * 0 after dropping the skb on a failed policy check, otherwise the
 * result of xfrm_input().  @update_skb_dev selects whether skb->dev is
 * switched to the tunnel device before xfrm_input() runs.
 */
static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
		     int encap_type, bool update_skb_dev)
{
	struct ip_tunnel *tunnel;
	const struct iphdr *iph = ip_hdr(skb);
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
	IP_TUNNEL_DECLARE_FLAGS(flags) = { };

	__set_bit(IP_TUNNEL_NO_KEY_BIT, flags);

	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
				  iph->saddr, iph->daddr, 0);
	if (tunnel) {
		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
			goto drop;

		/* Stash the tunnel so vti_rcv_cb() can find it later */
		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;

		if (update_skb_dev)
			skb->dev = tunnel->dev;

		return xfrm_input(skb, nexthdr, spi, encap_type);
	}

	return -EINVAL;
drop:
	kfree_skb(skb);
	return 0;
}

/* xfrm4 input_handler hook: vti_input() without switching skb->dev. */
static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
			   int encap_type)
{
	return vti_input(skb, nexthdr, spi, encap_type, false);
}

/*
 * Fill in the XFRM SPI control block for IPv4 and feed the packet to
 * vti_input() using the protocol from the IP header and encap_type 0.
 */
static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev)
{
	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);

	return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev);
}

/* xfrm4 handler hook: vti_rcv() with SPI 0, leaving skb->dev unchanged. */
static int vti_rcv_proto(struct sk_buff *skb)
{
	return vti_rcv(skb, 0, false);
}

/*
 * Callback run with the result (@err) of xfrm processing for a packet
 * that vti_input() tagged with a tunnel.
 *
 * Returns 1 if the skb carries no tunnel, 0 after accounting an error
 * or after successfully re-homing the skb on the tunnel device,
 * -EINVAL if no inner mode can be resolved, and -EPERM if the policy
 * check fails.
 */
static int vti_rcv_cb(struct sk_buff *skb, int err)
{
	unsigned short family;
	struct net_device *dev;
	struct xfrm_state *x;
	const struct xfrm_mode *inner_mode;
	struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
	u32 orig_mark = skb->mark;
	int ret;

	if (!tunnel)
		return 1;

	dev = tunnel->dev;

	if (err) {
		DEV_STATS_INC(dev, rx_errors);
		DEV_STATS_INC(dev, rx_dropped);

		return 0;
	}

	x = xfrm_input_state(skb);

	inner_mode = &x->inner_mode;

	/* Selector family unspecified: derive the mode from the packet */
	if (x->sel.family == AF_UNSPEC) {
		inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
		if (inner_mode == NULL) {
			XFRM_INC_STATS(dev_net(skb->dev),
				       LINUX_MIB_XFRMINSTATEMODEERROR);
			return -EINVAL;
		}
	}

	family = inner_mode->family;

	/* The policy check runs with the tunnel's input key as the mark */
	skb->mark = be32_to_cpu(tunnel->parms.i_key);
	ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
	skb->mark = orig_mark;

	if (!ret)
		return -EPERM;

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
	skb->dev = dev;
	dev_sw_netstats_rx_add(dev, skb->len);

	return 0;
}

/*
 * Check that xfrm state @x is usable for a tunnel with endpoints
 * @dst / @src: it must exist, be in tunnel mode and be AF_INET.  With a
 * zero @dst only the source address is compared; otherwise both
 * addresses go through xfrm_state_addr_check().
 */
static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
{
	xfrm_address_t *daddr = (xfrm_address_t *)&dst;
	xfrm_address_t *saddr = (xfrm_address_t *)&src;

	/* if there is no transform then this tunnel is not functional.
	 * Or if the xfrm is not mode tunnel.
	 */
	if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
	    x->props.family != AF_INET)
		return false;

	if (!dst)
		return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET);

	if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET))
		return false;

	return true;
}

/*
 * Transmit @skb through the tunnel using flow @fl.
 *
 * If the skb has no dst yet, one is routed for IPv4 or (when enabled)
 * IPv6.  The dst is then passed through xfrm_lookup_route(), checked
 * against the tunnel endpoints, and the packet is either sent with
 * dst_output() or rejected with an ICMP "too big" / link-failure
 * notification.  Always returns NETDEV_TX_OK; failures are reflected in
 * the device statistics and the skb is freed.
 */
static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
			    struct flowi *fl)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_parm_kern *parms = &tunnel->parms;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *tdev;	/* Device to other host */
	int pkt_len = skb->len;
	int err;
	int mtu;

	if (!dst) {
		switch (skb->protocol) {
		case htons(ETH_P_IP): {
			struct rtable *rt;

			fl->u.ip4.flowi4_oif = dev->ifindex;
			fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
			rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
			if (IS_ERR(rt)) {
				DEV_STATS_INC(dev, tx_carrier_errors);
				goto tx_error_icmp;
			}
			dst = &rt->dst;
			skb_dst_set(skb, dst);
			break;
		}
#if IS_ENABLED(CONFIG_IPV6)
		case htons(ETH_P_IPV6):
			fl->u.ip6.flowi6_oif = dev->ifindex;
			fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
			dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
			if (dst->error) {
				dst_release(dst);
				dst = NULL;
				DEV_STATS_INC(dev, tx_carrier_errors);
				goto tx_error_icmp;
			}
			skb_dst_set(skb, dst);
			break;
#endif
		default:
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error_icmp;
		}
	}

	/*
	 * Take an extra reference before the lookup.
	 * NOTE(review): presumably xfrm_lookup_route() consumes it --
	 * confirm against the xfrm API.
	 */
	dst_hold(dst);
	dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0);
	if (IS_ERR(dst)) {
		DEV_STATS_INC(dev, tx_carrier_errors);
		goto tx_error_icmp;
	}

	/* DST_XFRM_QUEUE dsts skip the state and MTU checks below */
	if (dst->flags & DST_XFRM_QUEUE)
		goto xmit;

	if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
		DEV_STATS_INC(dev, tx_carrier_errors);
		dst_release(dst);
		goto tx_error_icmp;
	}

	tdev = dst_dev(dst);

	/* The route points back at this tunnel device: refuse to loop */
	if (tdev == dev) {
		dst_release(dst);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	mtu = dst_mtu(dst);
	if (skb->len > mtu) {
		skb_dst_update_pmtu_no_confirm(skb, mtu);
		if (skb->protocol == htons(ETH_P_IP)) {
			/* Without DF the oversized IPv4 packet is still sent */
			if (!(ip_hdr(skb)->frag_off & htons(IP_DF)))
				goto xmit;
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				      htonl(mtu));
		} else {
			if (mtu < IPV6_MIN_MTU)
				mtu = IPV6_MIN_MTU;

			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		}

		dst_release(dst);
		goto tx_error;
	}

xmit:
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
	skb_dst_set(skb, dst);
	skb->dev = skb_dst_dev(skb);

	err = dst_output(tunnel->net, skb->sk, skb);
	/* On success report the original packet length to the stats helper */
	if (net_xmit_eval(err) == 0)
		err = pkt_len;
	iptunnel_xmit_stats(dev, err);
	return NETDEV_TX_OK;

tx_error_icmp:
	dst_link_failure(skb);
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}

/* This function assumes it is being called from dev_queue_xmit()
 * and that skb is filled properly by that function.
*/
/*
 * ndo_start_xmit hook: decode the flow from the inner IPv4/IPv6 packet,
 * override its mark with the tunnel output key and pass it to vti_xmit().
 * Packets that cannot be pulled or carry another protocol are counted as
 * tx_errors and freed.
 */
static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct flowi fl;

	if (!pskb_inet_may_pull(skb))
		goto tx_err;

	memset(&fl, 0, sizeof(fl));

	switch (skb->protocol) {
	case htons(ETH_P_IP):
		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
		xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET);
		break;
	case htons(ETH_P_IPV6):
		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
		xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6);
		break;
	default:
		goto tx_err;
	}

	/* override mark with tunnel output key */
	fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key);

	return vti_xmit(skb, dev, &fl);

tx_err:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}

/*
 * ICMP error handler for ESP/AH/IPCOMP packets sent through a VTI
 * tunnel.  skb->data is read as the IPv4 header of the offending packet.
 *
 * Returns -1 if no tunnel matches that header, 0 in every other case.
 * Only ICMP_DEST_UNREACH with code ICMP_FRAG_NEEDED and ICMP_REDIRECT
 * are acted upon, and only when a matching xfrm state exists.
 */
static int vti4_err(struct sk_buff *skb, u32 info)
{
	__be32 spi;
	__u32 mark;
	struct xfrm_state *x;
	struct ip_tunnel *tunnel;
	struct ip_esp_hdr *esph;
	struct ip_auth_hdr *ah ;
	struct ip_comp_hdr *ipch;
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	int protocol = iph->protocol;
	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
	IP_TUNNEL_DECLARE_FLAGS(flags) = { };

	__set_bit(IP_TUNNEL_NO_KEY_BIT, flags);

	/* Addresses are swapped relative to the receive-side lookup */
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
				  iph->daddr, iph->saddr, 0);
	if (!tunnel)
		return -1;

	mark = be32_to_cpu(tunnel->parms.o_key);

	/* Extract the SPI from whichever IPsec header follows the IP header */
	switch (protocol) {
	case IPPROTO_ESP:
		esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
		spi = esph->spi;
		break;
	case IPPROTO_AH:
		ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
		spi = ah->spi;
		break;
	case IPPROTO_COMP:
		ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
		spi = htonl(ntohs(ipch->cpi));
		break;
	default:
		return 0;
	}

	switch (icmp_hdr(skb)->type) {
	case ICMP_DEST_UNREACH:
		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
			return 0;
		break;
	case ICMP_REDIRECT:
		break;
	default:
		return 0;
	}

	x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
			      spi, protocol, AF_INET);
	if (!x)
		return 0;

	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
		ipv4_update_pmtu(skb, net, info, 0, protocol);
	else
		ipv4_redirect(skb, net, 0, protocol);
	xfrm_state_put(x);

	return 0;
}

/*
 * ndo_tunnel_ctl hook.  Validates the requested parameters, forces
 * i_flags to the VTI flag only, delegates to ip_tunnel_ctl() and, for
 * anything but SIOCDELTUNNEL, ORs GRE_KEY into both flag sets of @p
 * afterwards.
 *
 * Returns -EINVAL for an add/change whose IP header template is not
 * IPv4 / IPPROTO_IPIP / ihl 5, -EOVERFLOW if either flag set is not
 * be16-compatible, or the result of ip_tunnel_ctl().
 */
static int
vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
{
	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
	int err = 0;

	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP ||
		    p->iph.ihl != 5)
			return -EINVAL;
	}

	if (!ip_tunnel_flags_is_be16_compat(p->i_flags) ||
	    !ip_tunnel_flags_is_be16_compat(p->o_flags))
		return -EOVERFLOW;

	/* A key is only honoured when the matching GRE_KEY flag is set */
	if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY))
		p->i_key = 0;
	if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY))
		p->o_key = 0;

	__set_bit(IP_TUNNEL_VTI_BIT, flags);
	ip_tunnel_flags_copy(p->i_flags, flags);

	err = ip_tunnel_ctl(dev, p, cmd);
	if (err)
		return err;

	if (cmd != SIOCDELTUNNEL) {
		ip_tunnel_flags_from_be16(flags, GRE_KEY);
		ip_tunnel_flags_or(p->i_flags, p->i_flags, flags);
		ip_tunnel_flags_or(p->o_flags, p->o_flags, flags);
	}

	return 0;
}

/* net_device callbacks for VTI tunnel devices. */
static const struct net_device_ops vti_netdev_ops = {
	.ndo_init	= vti_tunnel_init,
	.ndo_uninit	= ip_tunnel_uninit,
	.ndo_start_xmit	= vti_tunnel_xmit,
	.ndo_siocdevprivate = ip_tunnel_siocdevprivate,
	.ndo_change_mtu	= ip_tunnel_change_mtu,
	.ndo_get_stats64 = dev_get_tstats64,
	.ndo_get_iflink = ip_tunnel_get_iflink,
	.ndo_tunnel_ctl	= vti_tunnel_ctl,
};

/* rtnl_link_ops setup hook: install the VTI ops and the device type. */
static void vti_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &vti_netdev_ops;
	dev->header_ops		= &ip_tunnel_header_ops;
	dev->type		= ARPHRD_TUNNEL;
	ip_tunnel_setup(dev, vti_net_id);
}

/*
 * ndo_init hook: use the tunnel's local address as the device address
 * and its remote address as the broadcast address (4 bytes each), mark
 * the device IFF_NOARP and lltx, then finish with ip_tunnel_init().
 */
static int vti_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	__dev_addr_set(dev, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->flags		= IFF_NOARP;
	dev->addr_len		= 4;
	dev->lltx		= true;
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}

/* Give the fallback tunnel device a basic IPv4 / IPIP header template. */
static void __net_init vti_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	iph->version		= 4;
	iph->protocol		= IPPROTO_IPIP;
	iph->ihl		= 5;
}

/* xfrm4 hooks registered for IPPROTO_ESP in vti_init(). */
static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
	.handler	=	vti_rcv_proto,
	.input_handler	=	vti_input_proto,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	100,
};

/* xfrm4 hooks registered for IPPROTO_AH in vti_init(). */
static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
	.handler	=	vti_rcv_proto,
	.input_handler	=	vti_input_proto,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	100,
};

/* xfrm4 hooks registered for IPPROTO_COMP in vti_init(). */
static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
	.handler	=	vti_rcv_proto,
	.input_handler	=	vti_input_proto,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	100,
};

#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
/*
 * Receive handler for tunnel packets: like vti_rcv(), but always passes
 * IPPROTO_IPIP as the next header and uses the outer source address in
 * place of an SPI.
 */
static int vti_rcv_tunnel(struct sk_buff *skb)
{
	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);

	return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false);
}

/* Tunnel handler registered for AF_INET in vti_init(). */
static struct xfrm_tunnel vti_ipip_handler __read_mostly = {
	.handler	=	vti_rcv_tunnel,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	0,
};

#if IS_ENABLED(CONFIG_IPV6)
/* Tunnel handler registered for AF_INET6 in vti_init(). */
static struct xfrm_tunnel vti_ipip6_handler __read_mostly = {
	.handler	=	vti_rcv_tunnel,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	0,
};
#endif
#endif

/*
 * Per-netns init: set up the tunnel net state (fallback device name
 * "ip_vti0") and, if a fallback device was created, initialise its
 * header template.
 */
static int __net_init vti_init_net(struct net *net)
{
	int err;
	struct ip_tunnel_net *itn;

	err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
	if (err)
		return err;
	itn = net_generic(net, vti_net_id);
	if (itn->fb_tunnel_dev)
		vti_fb_tunnel_init(itn->fb_tunnel_dev);
	return 0;
}

/* Per-netns exit under RTNL: delegate to ip_tunnel_delete_net(). */
static void __net_exit vti_exit_rtnl(struct net *net,
				     struct list_head *dev_to_kill)
{
	ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill);
}

/* Per-network-namespace operations for VTI. */
static struct pernet_operations vti_net_ops = {
	.init = vti_init_net,
	.exit_rtnl = vti_exit_rtnl,
	.id   = &vti_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

/* Netlink validate hook: accepts every request. */
static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
			       struct netlink_ext_ack *extack)
{
	return 0;
}

/*
 * Translate IFLA_VTI_* attributes into @parms and @fwmark.
 *
 * @parms is zeroed and its protocol set to IPPROTO_IPIP first.  With no
 * @data nothing else is touched, so *@fwmark keeps the caller's value
 * and the VTI flag is not set.
 */
static void vti_netlink_parms(struct nlattr *data[],
			      struct ip_tunnel_parm_kern *parms,
			      __u32 *fwmark)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_IPIP;

	if (!data)
		return;

	__set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags);

	if (data[IFLA_VTI_LINK])
		parms->link = nla_get_u32(data[IFLA_VTI_LINK]);

	if (data[IFLA_VTI_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);

	if (data[IFLA_VTI_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);

	if (data[IFLA_VTI_LOCAL])
		parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]);

	if (data[IFLA_VTI_REMOTE])
		parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]);

	if (data[IFLA_VTI_FWMARK])
		*fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]);
}

/*
 * newlink hook: parse the attributes (fwmark defaults to 0) and create
 * the tunnel in params->link_net if set, else in the device's netns.
 */
static int vti_newlink(struct net_device *dev,
		       struct rtnl_newlink_params *params,
		       struct netlink_ext_ack *extack)
{
	struct nlattr **data = params->data;
	struct ip_tunnel_parm_kern parms;
	struct nlattr **tb = params->tb;
	__u32 fwmark = 0;

	vti_netlink_parms(data, &parms, &fwmark);
	return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb,
				 &parms, fwmark);
}

/*
 * changelink hook: parse the attributes, starting from the tunnel's
 * current fwmark, and apply them with ip_tunnel_changelink().
 */
static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
			  struct nlattr *data[],
			  struct netlink_ext_ack *extack)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm_kern p;
	__u32 fwmark = t->fwmark;

	vti_netlink_parms(data, &p, &fwmark);
	return ip_tunnel_changelink(dev, tb, &p, fwmark);
}

/* Space needed by vti_fill_info(): six 4-byte attributes. */
static size_t vti_get_size(const struct net_device *dev)
{
	return
		/* IFLA_VTI_LINK */
		nla_total_size(4) +
		/* IFLA_VTI_IKEY */
		nla_total_size(4) +
		/* IFLA_VTI_OKEY */
		nla_total_size(4) +
		/* IFLA_VTI_LOCAL */
		nla_total_size(4) +
		/* IFLA_VTI_REMOTE */
		nla_total_size(4) +
		/* IFLA_VTI_FWMARK */
		nla_total_size(4) +
		0;
}

/*
 * Dump the tunnel parameters as IFLA_VTI_* attributes into @skb.
 * Returns -EMSGSIZE if any attribute cannot be added.
 */
static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm_kern *p = &t->parms;

	if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) ||
	    nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key) ||
	    nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr) ||
	    nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr) ||
	    nla_put_u32(skb, IFLA_VTI_FWMARK, t->fwmark))
		return -EMSGSIZE;

	return 0;
}

/* Netlink attribute policy for IFLA_VTI_*. */
static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
	[IFLA_VTI_LINK]		= { .type = NLA_U32 },
	[IFLA_VTI_IKEY]		= { .type = NLA_U32 },
	[IFLA_VTI_OKEY]		= { .type = NLA_U32 },
	[IFLA_VTI_LOCAL]	= { .len = sizeof_field(struct iphdr, saddr) },
	[IFLA_VTI_REMOTE]	= { .len = sizeof_field(struct iphdr, daddr) },
	[IFLA_VTI_FWMARK]	= { .type = NLA_U32 },
};

/* rtnetlink link type "vti". */
static struct rtnl_link_ops vti_link_ops __read_mostly = {
	.kind		= "vti",
	.maxtype	= IFLA_VTI_MAX,
	.policy		= vti_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= vti_tunnel_setup,
	.validate	= vti_tunnel_validate,
	.newlink	= vti_newlink,
	.changelink	= vti_changelink,
	.dellink        = ip_tunnel_dellink,
	.get_size	= vti_get_size,
	.fill_info	= vti_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

/*
 * Module init: register the pernet device, the three xfrm4 protocol
 * handlers, the optional tunnel handlers and finally the rtnl link ops.
 * On failure everything registered so far is undone in reverse order and
 * an error naming the failed stage (tracked in @msg) is logged.
 */
static int __init vti_init(void)
{
	const char *msg;
	int err;

	pr_info("IPv4 over IPsec tunneling driver\n");

	msg = "tunnel device";
	err = register_pernet_device(&vti_net_ops);
	if (err < 0)
		goto pernet_dev_failed;

	msg = "tunnel protocols";
	err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
	if (err < 0)
		goto xfrm_proto_esp_failed;
	err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
	if (err < 0)
		goto xfrm_proto_ah_failed;
	err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
	if (err < 0)
		goto xfrm_proto_comp_failed;

#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
	msg = "ipip tunnel";
	err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET);
	if (err < 0)
		goto xfrm_tunnel_ipip_failed;
#if IS_ENABLED(CONFIG_IPV6)
	err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6);
	if (err < 0)
		goto xfrm_tunnel_ipip6_failed;
#endif
#endif

	msg = "netlink interface";
	err = rtnl_link_register(&vti_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	return err;

	/* Unwind labels fall through from the latest stage to the earliest */
rtnl_link_failed:
#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
#if IS_ENABLED(CONFIG_IPV6)
	xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6);
xfrm_tunnel_ipip6_failed:
#endif
	xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET);
xfrm_tunnel_ipip_failed:
#endif
	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
xfrm_proto_comp_failed:
	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
xfrm_proto_ah_failed:
	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
xfrm_proto_esp_failed:
	unregister_pernet_device(&vti_net_ops);
pernet_dev_failed:
	pr_err("vti init: failed to register %s\n", msg);
	return err;
}

/* Module exit: undo vti_init() in reverse registration order. */
static void __exit vti_fini(void)
{
	rtnl_link_unregister(&vti_link_ops);
#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
#if IS_ENABLED(CONFIG_IPV6)
	xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6);
#endif
	xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET);
#endif
	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
	unregister_pernet_device(&vti_net_ops);
}

module_init(vti_init);
module_exit(vti_fini);
MODULE_DESCRIPTION("Virtual (secure) IP tunneling library");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("vti");
MODULE_ALIAS_NETDEV("ip_vti0");
15 14 16 16 15 16 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/security.h> #include <linux/completion.h> #include <linux/list.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> #include "core_priv.h" #include "mad_priv.h" static LIST_HEAD(mad_agent_list); /* Lock to protect mad_agent_list */ static DEFINE_SPINLOCK(mad_agent_list_lock); static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp) { struct pkey_index_qp_list *pkey = NULL; struct pkey_index_qp_list *tmp_pkey; struct ib_device *dev = pp->sec->dev; spin_lock(&dev->port_data[pp->port_num].pkey_list_lock); list_for_each_entry (tmp_pkey, &dev->port_data[pp->port_num].pkey_list, pkey_index_list) { if (tmp_pkey->pkey_index == pp->pkey_index) { pkey = tmp_pkey; break; } } spin_unlock(&dev->port_data[pp->port_num].pkey_list_lock); return pkey; } static int get_pkey_and_subnet_prefix(struct ib_port_pkey *pp, u16 *pkey, u64 *subnet_prefix) { struct ib_device *dev = pp->sec->dev; int ret; ret = ib_get_cached_pkey(dev, pp->port_num, pp->pkey_index, pkey); if (ret) return ret; ib_get_cached_subnet_prefix(dev, pp->port_num, subnet_prefix); return ret; } static int enforce_qp_pkey_security(u16 pkey, u64 subnet_prefix, struct ib_qp_security *qp_sec) { struct ib_qp_security *shared_qp_sec; int ret; ret = security_ib_pkey_access(qp_sec->security, subnet_prefix, pkey); if (ret) return ret; list_for_each_entry(shared_qp_sec, &qp_sec->shared_qp_list, shared_qp_list) { ret = security_ib_pkey_access(shared_qp_sec->security, subnet_prefix, pkey); if (ret) return ret; } return 0; } /* The caller of this function must hold the QP security * mutex of the QP of the security structure in *pps. 
*
 * It takes separate ports_pkeys and security structure
 * because in some cases the pps will be for a new settings
 * or the pps will be for the real QP and security structure
 * will be for a shared QP.
 *
 * Returns 0 when @pps is NULL or every checked setting is allowed,
 * otherwise the first non-zero result from the PKey lookup or from
 * enforce_qp_pkey_security().
 */
static int check_qp_port_pkey_settings(struct ib_ports_pkeys *pps,
				       struct ib_qp_security *sec)
{
	u64 subnet_prefix;
	u16 pkey;
	int ret = 0;

	/* No port/pkey settings recorded: nothing to enforce. */
	if (!pps)
		return 0;

	/* Settings in the NOT_VALID state are skipped. */
	if (pps->main.state != IB_PORT_PKEY_NOT_VALID) {
		ret = get_pkey_and_subnet_prefix(&pps->main,
						 &pkey,
						 &subnet_prefix);
		if (ret)
			return ret;

		ret = enforce_qp_pkey_security(pkey,
					       subnet_prefix,
					       sec);
		if (ret)
			return ret;
	}

	if (pps->alt.state != IB_PORT_PKEY_NOT_VALID) {
		ret = get_pkey_and_subnet_prefix(&pps->alt,
						 &pkey,
						 &subnet_prefix);
		if (ret)
			return ret;

		ret = enforce_qp_pkey_security(pkey,
					       subnet_prefix,
					       sec);
	}

	return ret;
}

/* The caller of this function must hold the QP security
 * mutex.
 *
 * Requests the IB_QPS_ERR state for sec->qp and delivers an
 * IB_EVENT_QP_FATAL event to the event handler of that QP and of
 * every QP on its shared_qp_list (when a handler and context are set).
 */
static void qp_to_error(struct ib_qp_security *sec)
{
	struct ib_qp_security *shared_qp_sec;
	struct ib_qp_attr attr = {
		.qp_state = IB_QPS_ERR
	};
	struct ib_event event = {
		.event = IB_EVENT_QP_FATAL
	};

	/* If the QP is in the process of being destroyed
	 * the qp pointer in the security structure is
	 * undefined.  It cannot be modified now.
	 */
	if (sec->destroying)
		return;

	/* The return value of ib_modify_qp() is not checked here. */
	ib_modify_qp(sec->qp,
		     &attr,
		     IB_QP_STATE);

	if (sec->qp->event_handler && sec->qp->qp_context) {
		event.element.qp = sec->qp;
		sec->qp->event_handler(&event,
				       sec->qp->qp_context);
	}

	list_for_each_entry(shared_qp_sec,
			    &sec->shared_qp_list,
			    shared_qp_list) {
		struct ib_qp *qp = shared_qp_sec->qp;

		if (qp->event_handler && qp->qp_context) {
			event.element.qp = qp;
			/* NOTE(review): event.device is only assigned on
			 * this shared-QP path, not for sec->qp above.
			 */
			event.device = qp->device;
			qp->event_handler(&event,
					  qp->qp_context);
		}
	}
}

/* Re-check access for every QP listed under @pkey.
 *
 * Phase one walks pkey->qp_list under qp_list_lock and collects each
 * setting that fails enforce_qp_pkey_security() on a local list,
 * bumping its error_list_count.  Settings whose security structure
 * already has a non-zero error_list_count are skipped.  If the cached
 * PKey value cannot be read, phase one is skipped entirely.
 *
 * Phase two runs after the spinlock is dropped: each collected QP is
 * sent to error under its own security mutex, and error_complete is
 * signalled when the QP was marked as being destroyed.
 */
static inline void check_pkey_qps(struct pkey_index_qp_list *pkey,
				  struct ib_device *device,
				  u32 port_num,
				  u64 subnet_prefix)
{
	struct ib_port_pkey *pp, *tmp_pp;
	bool comp;
	LIST_HEAD(to_error_list);
	u16 pkey_val;

	if (!ib_get_cached_pkey(device,
				port_num,
				pkey->pkey_index,
				&pkey_val)) {
		spin_lock(&pkey->qp_list_lock);
		list_for_each_entry(pp, &pkey->qp_list, qp_list) {
			if (atomic_read(&pp->sec->error_list_count))
				continue;

			if (enforce_qp_pkey_security(pkey_val,
						     subnet_prefix,
						     pp->sec)) {
				atomic_inc(&pp->sec->error_list_count);
				list_add(&pp->to_error_list,
					 &to_error_list);
			}
		}
		spin_unlock(&pkey->qp_list_lock);
	}

	list_for_each_entry_safe(pp,
				 tmp_pp,
				 &to_error_list,
				 to_error_list) {
		mutex_lock(&pp->sec->mutex);
		qp_to_error(pp->sec);
		list_del(&pp->to_error_list);
		atomic_dec(&pp->sec->error_list_count);
		/* Sample the flag while the mutex is still held. */
		comp = pp->sec->destroying;
		mutex_unlock(&pp->sec->mutex);

		if (comp)
			complete(&pp->sec->error_complete);
	}
}

/* The caller of this function must hold the QP security
 * mutex.
 *
 * Adds @pp to the qp_list of the per-port entry for pp->pkey_index,
 * creating that entry if it does not exist yet, and moves @pp to the
 * IB_PORT_PKEY_LISTED state.  Settings that are not in the
 * IB_PORT_PKEY_VALID state are left alone.
 *
 * Returns 0 on success (including the no-op case) or -ENOMEM when a
 * new per-port entry could not be allocated.
 */
static int port_pkey_list_insert(struct ib_port_pkey *pp)
{
	struct pkey_index_qp_list *tmp_pkey;
	struct pkey_index_qp_list *pkey;
	struct ib_device *dev;
	u32 port_num = pp->port_num;
	int ret = 0;

	if (pp->state != IB_PORT_PKEY_VALID)
		return 0;

	dev = pp->sec->dev;

	pkey = get_pkey_idx_qp_list(pp);

	if (!pkey) {
		bool found = false;

		/* Allocate before taking the spinlock below. */
		pkey = kzalloc_obj(*pkey);
		if (!pkey)
			return -ENOMEM;

		spin_lock(&dev->port_data[port_num].pkey_list_lock);
		/* Check for the PKey again.  A racing process may
		 * have created it.
		 */
		list_for_each_entry(tmp_pkey,
				    &dev->port_data[port_num].pkey_list,
				    pkey_index_list) {
			if (tmp_pkey->pkey_index == pp->pkey_index) {
				/* Lost the race: drop ours, use theirs. */
				kfree(pkey);
				pkey = tmp_pkey;
				found = true;
				break;
			}
		}

		if (!found) {
			pkey->pkey_index = pp->pkey_index;
			spin_lock_init(&pkey->qp_list_lock);
			INIT_LIST_HEAD(&pkey->qp_list);
			list_add(&pkey->pkey_index_list,
				 &dev->port_data[port_num].pkey_list);
		}
		spin_unlock(&dev->port_data[port_num].pkey_list_lock);
	}

	spin_lock(&pkey->qp_list_lock);
	list_add(&pp->qp_list, &pkey->qp_list);
	spin_unlock(&pkey->qp_list_lock);

	pp->state = IB_PORT_PKEY_LISTED;

	return ret;
}

/* The caller of this function must hold the QP security
 * mutex.
 *
 * Unlinks @pp from its per-port qp_list and returns it to the
 * IB_PORT_PKEY_VALID state.  Does nothing unless @pp is currently in
 * the IB_PORT_PKEY_LISTED state.
 */
static void port_pkey_list_remove(struct ib_port_pkey *pp)
{
	struct pkey_index_qp_list *pkey;

	if (pp->state != IB_PORT_PKEY_LISTED)
		return;

	/* NOTE(review): the lookup result is used without a NULL check;
	 * presumably a LISTED setting always has its entry - confirm.
	 */
	pkey = get_pkey_idx_qp_list(pp);

	spin_lock(&pkey->qp_list_lock);
	list_del(&pp->qp_list);
	spin_unlock(&pkey->qp_list_lock);

	/* The setting may still be valid, i.e. after
	 * a destroy has failed for example.
	 */
	pp->state = IB_PORT_PKEY_VALID;
}

/* Release sec->security through security_ib_free_security(), then free
 * the port/pkey settings and the security structure itself.
 */
static void destroy_qp_security(struct ib_qp_security *sec)
{
	security_ib_free_security(sec->security);
	kfree(sec->ports_pkeys);
	kfree(sec);
}

/* The caller of this function must hold the QP security
 * mutex.
*/
/* Build a freshly allocated ib_ports_pkeys describing what the QP's
 * port/pkey settings will be once @qp_attr / @qp_attr_mask are applied.
 * Fields not named in the mask are inherited from the QP's current
 * settings when it has any.  Returns NULL on allocation failure.
 */
static struct ib_ports_pkeys *get_new_pps(const struct ib_qp *qp,
					  const struct ib_qp_attr *qp_attr,
					  int qp_attr_mask)
{
	struct ib_ports_pkeys *new_pps;
	struct ib_ports_pkeys *qp_pps = qp->qp_sec->ports_pkeys;

	new_pps = kzalloc_obj(*new_pps);
	if (!new_pps)
		return NULL;

	if (qp_attr_mask & IB_QP_PORT)
		new_pps->main.port_num = qp_attr->port_num;
	else if (qp_pps)
		new_pps->main.port_num = qp_pps->main.port_num;

	if (qp_attr_mask & IB_QP_PKEY_INDEX)
		new_pps->main.pkey_index = qp_attr->pkey_index;
	else if (qp_pps)
		new_pps->main.pkey_index = qp_pps->main.pkey_index;

	/* The main setting is valid when both port and pkey index are
	 * supplied now, or when the existing main setting was already
	 * in a state other than NOT_VALID.
	 */
	if (((qp_attr_mask & IB_QP_PKEY_INDEX) &&
	     (qp_attr_mask & IB_QP_PORT)) ||
	    (qp_pps && qp_pps->main.state != IB_PORT_PKEY_NOT_VALID))
		new_pps->main.state = IB_PORT_PKEY_VALID;

	if (qp_attr_mask & IB_QP_ALT_PATH) {
		new_pps->alt.port_num = qp_attr->alt_port_num;
		new_pps->alt.pkey_index = qp_attr->alt_pkey_index;
		new_pps->alt.state = IB_PORT_PKEY_VALID;
	} else if (qp_pps) {
		new_pps->alt.port_num = qp_pps->alt.port_num;
		new_pps->alt.pkey_index = qp_pps->alt.pkey_index;
		if (qp_pps->alt.state != IB_PORT_PKEY_NOT_VALID)
			new_pps->alt.state = IB_PORT_PKEY_VALID;
	}

	new_pps->main.sec = qp->qp_sec;
	new_pps->alt.sec = qp->qp_sec;
	return new_pps;
}

/* Create the security structure for a shared QP handle and, under the
 * real QP's security mutex, verify that the new handle may use the real
 * QP's current port/pkey settings.  On success a handle that is not the
 * real QP itself is linked on the real QP's shared_qp_list.
 *
 * Returns 0 on success or when no security structure was created,
 * otherwise the error from creation or from the permission check (in
 * the latter case the new security structure is destroyed).
 */
int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev)
{
	struct ib_qp *real_qp = qp->real_qp;
	int ret;

	ret = ib_create_qp_security(qp, dev);

	if (ret)
		return ret;

	/* ib_create_qp_security() leaves qp_sec unset for non-IB devices. */
	if (!qp->qp_sec)
		return 0;

	mutex_lock(&real_qp->qp_sec->mutex);
	ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys,
					  qp->qp_sec);

	if (ret)
		goto ret;

	if (qp != real_qp)
		list_add(&qp->qp_sec->shared_qp_list,
			 &real_qp->qp_sec->shared_qp_list);
ret:
	mutex_unlock(&real_qp->qp_sec->mutex);
	if (ret)
		destroy_qp_security(qp->qp_sec);

	return ret;
}

/* Unlink a shared QP's security structure from the real QP's
 * shared_qp_list (under the real QP's security mutex) and destroy it.
 */
void ib_close_shared_qp_security(struct ib_qp_security *sec)
{
	struct ib_qp *real_qp = sec->qp->real_qp;

	mutex_lock(&real_qp->qp_sec->mutex);
	list_del(&sec->shared_qp_list);
	mutex_unlock(&real_qp->qp_sec->mutex);

	destroy_qp_security(sec);
}

/* Allocate and initialise qp->qp_sec when @dev has at least one port
 * for which rdma_protocol_ib() is true.
 *
 * Returns 0 on success or when the device has no such port (qp->qp_sec
 * is not assigned in that case), -ENOMEM on allocation failure, or the
 * error from security_ib_alloc_security() (qp->qp_sec is then freed and
 * reset to NULL).
 */
int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev)
{
	unsigned int i;
	bool is_ib = false;
	int ret;

	rdma_for_each_port (dev, i) {
		is_ib = rdma_protocol_ib(dev, i);
		if (is_ib)
			break;
	}

	/* If this isn't an IB device don't create the security context */
	if (!is_ib)
		return 0;

	qp->qp_sec = kzalloc_obj(*qp->qp_sec);
	if (!qp->qp_sec)
		return -ENOMEM;

	qp->qp_sec->qp = qp;
	qp->qp_sec->dev = dev;
	mutex_init(&qp->qp_sec->mutex);
	INIT_LIST_HEAD(&qp->qp_sec->shared_qp_list);
	atomic_set(&qp->qp_sec->error_list_count, 0);
	init_completion(&qp->qp_sec->error_complete);
	ret = security_ib_alloc_security(&qp->qp_sec->security);
	if (ret) {
		kfree(qp->qp_sec);
		qp->qp_sec = NULL;
	}

	return ret;
}
EXPORT_SYMBOL(ib_create_qp_security);

/* First step of QP destruction: unlist the QP's port/pkey settings,
 * mark the security structure as destroying, and record how many
 * error-flow completions are outstanding.  A NULL @sec is a no-op.
 */
void ib_destroy_qp_security_begin(struct ib_qp_security *sec)
{
	/* Return if not IB */
	if (!sec)
		return;

	mutex_lock(&sec->mutex);

	/* Remove the QP from the lists so it won't get added to
	 * a to_error_list during the destroy process.
	 */
	if (sec->ports_pkeys) {
		port_pkey_list_remove(&sec->ports_pkeys->main);
		port_pkey_list_remove(&sec->ports_pkeys->alt);
	}

	/* If the QP is already in one or more of those lists
	 * the destroying flag will ensure the to error flow
	 * doesn't operate on an undefined QP.
	 */
	sec->destroying = true;

	/* Record the error list count to know how many completions
	 * to wait for.
	 */
	sec->error_comps_pending = atomic_read(&sec->error_list_count);

	mutex_unlock(&sec->mutex);
}

/* Undo ib_destroy_qp_security_begin() after a failed destroy: wait for
 * the recorded completions, clear the destroying flag, re-list the
 * port/pkey settings and re-check access, sending the QP to error if
 * access is no longer allowed.  A NULL @sec is a no-op.
 */
void ib_destroy_qp_security_abort(struct ib_qp_security *sec)
{
	int ret;
	int i;

	/* Return if not IB */
	if (!sec)
		return;

	/* If a concurrent cache update is in progress this
	 * QP security could be marked for an error state
	 * transition.  Wait for this to complete.
	 */
	for (i = 0; i < sec->error_comps_pending; i++)
		wait_for_completion(&sec->error_complete);

	mutex_lock(&sec->mutex);
	sec->destroying = false;

	/* Restore the position in the lists and verify
	 * access is still allowed in case a cache update
	 * occurred while attempting to destroy.
	 *
	 * Because these setting were listed already
	 * and removed during ib_destroy_qp_security_begin
	 * we know the pkey_index_qp_list for the PKey
	 * already exists so port_pkey_list_insert won't fail.
	 */
	if (sec->ports_pkeys) {
		port_pkey_list_insert(&sec->ports_pkeys->main);
		port_pkey_list_insert(&sec->ports_pkeys->alt);
	}

	ret = check_qp_port_pkey_settings(sec->ports_pkeys, sec);
	if (ret)
		qp_to_error(sec);

	mutex_unlock(&sec->mutex);
}

/* Final step of QP destruction: wait for the completions recorded by
 * ib_destroy_qp_security_begin() and free the security structure.
 * A NULL @sec is a no-op.
 */
void ib_destroy_qp_security_end(struct ib_qp_security *sec)
{
	int i;

	/* Return if not IB */
	if (!sec)
		return;

	/* If a concurrent cache update is occurring we must
	 * wait until this QP security structure is processed
	 * in the QP to error flow before destroying it because
	 * the to_error_list is in use.
	 */
	for (i = 0; i < sec->error_comps_pending; i++)
		wait_for_completion(&sec->error_complete);

	destroy_qp_security(sec);
}

/* Run check_pkey_qps() on every entry of the port's pkey_list.
 *
 * NOTE(review): the list is walked here without taking pkey_list_lock;
 * confirm what keeps entries stable during the walk.
 */
void ib_security_cache_change(struct ib_device *device,
			      u32 port_num,
			      u64 subnet_prefix)
{
	struct pkey_index_qp_list *pkey;

	list_for_each_entry (pkey, &device->port_data[port_num].pkey_list,
			     pkey_index_list) {
		check_pkey_qps(pkey,
			       device,
			       port_num,
			       subnet_prefix);
	}
}

/* Unlink and free every pkey_list entry on every port of @device. */
void ib_security_release_port_pkey_list(struct ib_device *device)
{
	struct pkey_index_qp_list *pkey, *tmp_pkey;
	unsigned int i;

	rdma_for_each_port (device, i) {
		list_for_each_entry_safe(pkey,
					 tmp_pkey,
					 &device->port_data[i].pkey_list,
					 pkey_index_list) {
			list_del(&pkey->pkey_index_list);
			kfree(pkey);
		}
	}
}

/* Modify a QP, enforcing PKey access first when the change touches the
 * port, the pkey index or the alternate path of a QP that is not a
 * special QP and has a security structure.
 *
 * Returns -ENOMEM if the new settings cannot be allocated, the error
 * from listing or from the permission check, or otherwise the result of
 * the device's modify_qp op.  On success the new settings replace the
 * old ones; on failure the new settings are unlisted and freed.
 */
int ib_security_modify_qp(struct ib_qp *qp,
			  struct ib_qp_attr *qp_attr,
			  int qp_attr_mask,
			  struct ib_udata *udata)
{
	int ret = 0;
	struct ib_ports_pkeys *tmp_pps;
	struct ib_ports_pkeys *new_pps = NULL;
	struct ib_qp *real_qp = qp->real_qp;
	bool special_qp = (real_qp->qp_type == IB_QPT_SMI ||
			   real_qp->qp_type == IB_QPT_GSI ||
			   real_qp->qp_type >= IB_QPT_RESERVED1);
	bool pps_change = ((qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) ||
			   (qp_attr_mask & IB_QP_ALT_PATH));

	WARN_ONCE((qp_attr_mask & IB_QP_PORT &&
		   rdma_protocol_ib(real_qp->device, qp_attr->port_num) &&
		   !real_qp->qp_sec),
		   "%s: QP security is not initialized for IB QP: %u\n",
		   __func__, real_qp->qp_num);

	/* The port/pkey settings are maintained only for the real QP. Open
	 * handles on the real QP will be in the shared_qp_list. When
	 * enforcing security on the real QP all the shared QPs will be
	 * checked as well.
	 */

	if (pps_change && !special_qp && real_qp->qp_sec) {
		/* The mutex stays held until the cleanup at the end. */
		mutex_lock(&real_qp->qp_sec->mutex);
		new_pps = get_new_pps(real_qp,
				      qp_attr,
				      qp_attr_mask);
		if (!new_pps) {
			mutex_unlock(&real_qp->qp_sec->mutex);
			return -ENOMEM;
		}
		/* Add this QP to the lists for the new port
		 * and pkey settings before checking for permission
		 * in case there is a concurrent cache update
		 * occurring.  Walking the list for a cache change
		 * doesn't acquire the security mutex unless it's
		 * sending the QP to error.
		 */
		ret = port_pkey_list_insert(&new_pps->main);

		if (!ret)
			ret = port_pkey_list_insert(&new_pps->alt);

		if (!ret)
			ret = check_qp_port_pkey_settings(new_pps,
							  real_qp->qp_sec);
	}

	if (!ret)
		ret = real_qp->device->ops.modify_qp(real_qp,
						     qp_attr,
						     qp_attr_mask,
						     udata);

	if (new_pps) {
		/* Clean up the lists and free the appropriate
		 * ports_pkeys structure.
		 */
		if (ret) {
			/* Failure: discard the settings built above. */
			tmp_pps = new_pps;
		} else {
			/* Success: install new settings, discard old. */
			tmp_pps = real_qp->qp_sec->ports_pkeys;
			real_qp->qp_sec->ports_pkeys = new_pps;
		}

		if (tmp_pps) {
			port_pkey_list_remove(&tmp_pps->main);
			port_pkey_list_remove(&tmp_pps->alt);
		}
		kfree(tmp_pps);
		mutex_unlock(&real_qp->qp_sec->mutex);
	}
	return ret;
}

/* Check whether security context @sec may use the PKey cached at
 * @pkey_index on @port_num.  Returns 0 for ports where
 * rdma_protocol_ib() is false, the PKey lookup error, or the result of
 * security_ib_pkey_access().
 */
static int ib_security_pkey_access(struct ib_device *dev,
				   u32 port_num,
				   u16 pkey_index,
				   void *sec)
{
	u64 subnet_prefix;
	u16 pkey;
	int ret;

	if (!rdma_protocol_ib(dev, port_num))
		return 0;

	ret = ib_get_cached_pkey(dev, port_num, pkey_index, &pkey);
	if (ret)
		return ret;

	ib_get_cached_subnet_prefix(dev, port_num, &subnet_prefix);

	return security_ib_pkey_access(sec, subnet_prefix, pkey);
}

/* Recompute smp_allowed for every agent on mad_agent_list, under
 * mad_agent_list_lock, from security_ib_endport_manage_subnet().
 */
void ib_mad_agent_security_change(void)
{
	struct ib_mad_agent *ag;

	spin_lock(&mad_agent_list_lock);
	list_for_each_entry(ag,
			    &mad_agent_list,
			    mad_agent_sec_list)
		WRITE_ONCE(ag->smp_allowed,
			   !security_ib_endport_manage_subnet(ag->security,
				dev_name(&ag->device->dev), ag->port_num));
	spin_unlock(&mad_agent_list_lock);
}

/* Allocate the security context for a MAD agent.  For IB_QPT_SMI
 * agents, additionally check the manage-subnet permission and, when
 * granted, set smp_allowed and add the agent to mad_agent_list.
 *
 * Returns 0 on success or for ports where rdma_protocol_ib() is false;
 * otherwise the allocation error or the permission-check error (the
 * security context is freed in the latter case).
 */
int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
				enum ib_qp_type qp_type)
{
	int ret;

	if (!rdma_protocol_ib(agent->device, agent->port_num))
		return 0;

	INIT_LIST_HEAD(&agent->mad_agent_sec_list);

	ret = security_ib_alloc_security(&agent->security);
	if (ret)
		return ret;

	/* Only SMI agents are tracked on mad_agent_list. */
	if (qp_type != IB_QPT_SMI)
		return 0;

	spin_lock(&mad_agent_list_lock);
	ret = security_ib_endport_manage_subnet(agent->security,
						dev_name(&agent->device->dev),
						agent->port_num);
	if (ret)
		goto free_security;

	WRITE_ONCE(agent->smp_allowed, true);
	list_add(&agent->mad_agent_sec_list, &mad_agent_list);
	spin_unlock(&mad_agent_list_lock);
	return 0;

free_security:
	spin_unlock(&mad_agent_list_lock);
	security_ib_free_security(agent->security);
	return ret;
}

/* Undo ib_mad_agent_security_setup(): remove an SMI agent from
 * mad_agent_list and free the agent's security context.  Does nothing
 * for ports where rdma_protocol_ib() is false.
 */
void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent)
{
	if (!rdma_protocol_ib(agent->device, agent->port_num))
		return;

	if (agent->qp->qp_type == IB_QPT_SMI) {
		spin_lock(&mad_agent_list_lock);
		list_del(&agent->mad_agent_sec_list);
		spin_unlock(&mad_agent_list_lock);
	}

	security_ib_free_security(agent->security);
}

/* Decide whether a MAD agent may proceed.  SMI agents are governed by
 * their smp_allowed flag (-EACCES when clear); all other agents are
 * checked against the PKey at @pkey_index.  Returns 0 for ports where
 * rdma_protocol_ib() is false.
 */
int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index)
{
	if (!rdma_protocol_ib(map->agent.device, map->agent.port_num))
		return 0;

	if (map->agent.qp->qp_type == IB_QPT_SMI) {
		if (!READ_ONCE(map->agent.smp_allowed))
			return -EACCES;
		return 0;
	}

	return ib_security_pkey_access(map->agent.device,
				       map->agent.port_num,
				       pkey_index,
				       map->agent.security);
}
27 27 26 27 27 35 37 23 10 10 3 2 15 10 10 27 27 27 11 16 4 1 17 9 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845
// SPDX-License-Identifier: GPL-2.0
/*
 * xfrm_input.c
 *
 * Changes:
 * 	YOSHIFUJI Hideaki @USAGI
 * 		Split up af-specific portion
 *
 */

#include <linux/bottom_half.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/percpu.h>
#include <net/dst.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/ip_tunnels.h>
#include <net/ip6_tunnel.h>
#include <net/dst_metadata.h>
#include <net/hotdata.h>

#include "xfrm_inout.h"

/* Per-CPU deferred-reinjection state: a work item plus a locked skb
 * queue that the work handler drains.
 */
struct xfrm_trans_tasklet {
	struct work_struct work;
	spinlock_t queue_lock;
	struct sk_buff_head queue;
};

/* Layout of skb->cb while an skb sits on a xfrm_trans_tasklet queue:
 * room for the IPv4/IPv6 control block, followed by the callback and
 * the netns to pass to it.
 */
struct xfrm_trans_cb {
	union {
		struct inet_skb_parm	h4;
#if IS_ENABLED(CONFIG_IPV6)
		struct inet6_skb_parm	h6;
#endif
	} header;
	int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb);
	struct net *net;
};

#define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0]))

/* Serialises writers of xfrm_input_afinfo[][]; readers use RCU. */
static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
/* Indexed by [is_ipip][family]. */
static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[2][AF_INET6 + 1];

static struct gro_cells gro_cells;
static struct net_device *xfrm_napi_dev;

static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet);

/* Publish @afinfo in the slot selected by its is_ipip and family.
 *
 * Returns 0 on success, -EAFNOSUPPORT (with a warning) when the family
 * is above AF_INET6, or -EEXIST when the slot is already occupied.
 */
int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo)
{
	int err = 0;

	if (WARN_ON(afinfo->family > AF_INET6))
		return -EAFNOSUPPORT;

	spin_lock_bh(&xfrm_input_afinfo_lock);
	if (unlikely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family]))
		err = -EEXIST;
	else
		rcu_assign_pointer(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], afinfo);
	spin_unlock_bh(&xfrm_input_afinfo_lock);
	return err;
}
EXPORT_SYMBOL(xfrm_input_register_afinfo);

/* Clear the slot for @afinfo if it currently holds @afinfo, then wait
 * for an RCU grace period.
 *
 * Returns 0 when the slot was cleared or was already empty, -EINVAL
 * when the slot holds a different afinfo.
 *
 * NOTE(review): unlike the register path, afinfo->family is not range
 * checked here before indexing the table.
 */
int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo)
{
	int err = 0;

	spin_lock_bh(&xfrm_input_afinfo_lock);
	if (likely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family])) {
		if (unlikely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family] != afinfo))
			err = -EINVAL;
		else
			RCU_INIT_POINTER(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], NULL);
	}
	spin_unlock_bh(&xfrm_input_afinfo_lock);
	synchronize_rcu();
	return err;
}
EXPORT_SYMBOL(xfrm_input_unregister_afinfo);

/* Fetch the registered afinfo for (@family, @is_ipip).
 *
 * On success the function returns with rcu_read_lock() still held; the
 * caller must call rcu_read_unlock() when done with the pointer.  When
 * NULL is returned no RCU read lock is held.
 */
static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(u8 family, bool is_ipip)
{
	const struct xfrm_input_afinfo *afinfo;

	if (WARN_ON_ONCE(family > AF_INET6))
		return NULL;

	rcu_read_lock();
	afinfo = rcu_dereference(xfrm_input_afinfo[is_ipip][family]);
	if (unlikely(!afinfo))
		rcu_read_unlock();
	return afinfo;
}

/* Invoke the registered per-family callback for @skb.
 *
 * @protocol selects the IPIP/IPv6-in-IP table when it is IPPROTO_IPIP
 * or IPPROTO_IPV6.  Returns -EAFNOSUPPORT when nothing is registered,
 * otherwise the callback's return value.
 */
static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol,
		       int err)
{
	bool is_ipip = (protocol == IPPROTO_IPIP || protocol == IPPROTO_IPV6);
	const struct xfrm_input_afinfo *afinfo;
	int ret;

	afinfo = xfrm_input_get_afinfo(family, is_ipip);
	if (!afinfo)
		return -EAFNOSUPPORT;

	ret = afinfo->callback(skb, protocol, err);
	/* Drops the read lock taken inside xfrm_input_get_afinfo(). */
	rcu_read_unlock();

	return ret;
}

/* Attach a SKB_EXT_SEC_PATH extension to @skb via skb_ext_add().
 *
 * If the skb already had one (checked beforehand with skb_ext_find()),
 * the returned extension is handed back without being reset; otherwise
 * its ovec, olen, len and verified_cnt fields are zeroed.  Returns NULL
 * when skb_ext_add() fails.
 */
struct sec_path *secpath_set(struct sk_buff *skb)
{
	struct sec_path *sp, *tmp = skb_ext_find(skb, SKB_EXT_SEC_PATH);

	sp = skb_ext_add(skb, SKB_EXT_SEC_PATH);
	if (!sp)
		return NULL;

	if (tmp) /* reused existing one (was COW'd if needed) */
		return sp;

	/* allocated new secpath */
	memset(sp->ovec, 0, sizeof(sp->ovec));
	sp->olen = 0;
	sp->len = 0;
	sp->verified_cnt = 0;

	return sp;
}
EXPORT_SYMBOL(secpath_set);

/* Fetch spi and seq from ipsec header
 *
 * Returns 0 on success, -EINVAL when the header cannot be pulled, and
 * 1 when @nexthdr is none of IPPROTO_AH, IPPROTO_ESP or IPPROTO_COMP.
 * For IPPROTO_COMP the SPI is derived from the 16-bit field at offset 2
 * of the transport header and *seq is set to 0.
 */

int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
{
	int offset, offset_seq;
	int hlen;

	switch (nexthdr) {
	case IPPROTO_AH:
		hlen = sizeof(struct ip_auth_hdr);
		offset = offsetof(struct ip_auth_hdr, spi);
		offset_seq = offsetof(struct ip_auth_hdr, seq_no);
		break;
	case IPPROTO_ESP:
		hlen = sizeof(struct ip_esp_hdr);
		offset = offsetof(struct ip_esp_hdr, spi);
		offset_seq = offsetof(struct ip_esp_hdr, seq_no);
		break;
	case IPPROTO_COMP:
		if (!pskb_may_pull(skb, sizeof(struct ip_comp_hdr)))
			return -EINVAL;
		*spi = htonl(ntohs(*(__be16 *)(skb_transport_header(skb) + 2)));
		*seq = 0;
		return 0;
	default:
		return 1;
	}

	if (!pskb_may_pull(skb, hlen))
		return -EINVAL;

	*spi = *(__be32 *)(skb_transport_header(skb) + offset);
	*seq = *(__be32 *)(skb_transport_header(skb) + offset_seq);
	return 0;
}
EXPORT_SYMBOL(xfrm_parse_spi);

/* Rebuild an IPv4 header in front of a BEET-mode payload.
 *
 * When the recorded protocol is IPPROTO_BEETPH the pseudo header is
 * validated and pulled first, and its option length is added to the new
 * header's ihl.  Addresses come from the state's selector.  Returns 0
 * on success or -EINVAL when the pseudo header is missing or malformed.
 */
static int xfrm4_remove_beet_encap(struct xfrm_state *x, struct sk_buff *skb)
{
	struct iphdr *iph;
	int optlen = 0;
	int err = -EINVAL;

	skb->protocol = htons(ETH_P_IP);

	if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) {
		struct ip_beet_phdr *ph;
		int phlen;

		if (!pskb_may_pull(skb, sizeof(*ph)))
			goto out;

		ph = (struct ip_beet_phdr *)skb->data;

		phlen = sizeof(*ph) + ph->padlen;
		optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen);
		/* Reject negative, non-4-byte-multiple or oversized options. */
		if (optlen < 0 || optlen & 3 || optlen > 250)
			goto out;

		XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr;

		if (!pskb_may_pull(skb, phlen))
			goto out;
		__skb_pull(skb, phlen);
	}

	skb_push(skb, sizeof(*iph));
	skb_reset_network_header(skb);
	skb_mac_header_rebuild(skb);

	xfrm4_beet_make_header(skb);

	iph = ip_hdr(skb);

	iph->ihl += optlen / 4;
	iph->tot_len = htons(skb->len);
	iph->daddr = x->sel.daddr.a4;
	iph->saddr = x->sel.saddr.a4;
	/* Checksum is computed with the check field zeroed. */
	iph->check = 0;
	iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
	err = 0;
out:
	return err;
}

/* Set CE on the inner IPv4 header when the saved tos carries CE. */
static void ipip_ecn_decapsulate(struct sk_buff *skb)
{
	struct iphdr *inner_iph = ipip_hdr(skb);

	if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
		IP_ECN_set_ce(inner_iph);
}

/* Expose the inner IPv4 packet of a tunnel-mode skb.
 *
 * Optionally copies the saved DSCP into the inner header
 * (XFRM_STATE_DECAP_DSCP) and propagates ECN CE unless
 * XFRM_STATE_NOECN is set, then resets the network header to the
 * current data pointer.  Returns 0 on success, -EINVAL when the inner
 * header cannot be pulled, or the skb_unclone() error.
 */
static int xfrm4_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb)
{
	int err = -EINVAL;

	skb->protocol = htons(ETH_P_IP);

	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto out;

	/* The inner header is written below, so the skb must not be shared. */
	err = skb_unclone(skb, GFP_ATOMIC);
	if (err)
		goto out;

	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
		ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb));
	if (!(x->props.flags & XFRM_STATE_NOECN))
		ipip_ecn_decapsulate(skb);

	skb_reset_network_header(skb);
	skb_mac_header_rebuild(skb);
	if (skb->mac_len)
		eth_hdr(skb)->h_proto = skb->protocol;

	err = 0;

out:
	return err;
}

/* Set CE on the inner IPv6 header when the saved tos carries CE. */
static void ipip6_ecn_decapsulate(struct sk_buff *skb)
{
	struct ipv6hdr *inner_iph = ipipv6_hdr(skb);

	if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
		IP6_ECN_set_ce(skb, inner_iph);
}

/* IPv6 counterpart of xfrm4_remove_tunnel_encap(): same steps, using
 * the IPv6 header size, ETH_P_IPV6 and the IPv6 DSCP/ECN helpers.
 */
static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb)
{
	int err = -EINVAL;

	skb->protocol = htons(ETH_P_IPV6);

	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
		goto out;

	err = skb_unclone(skb, GFP_ATOMIC);
	if (err)
		goto out;

	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
		ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipipv6_hdr(skb));
	if (!(x->props.flags & XFRM_STATE_NOECN))
		ipip6_ecn_decapsulate(skb);

	skb_reset_network_header(skb);
	skb_mac_header_rebuild(skb);
	if (skb->mac_len)
		eth_hdr(skb)->h_proto = skb->protocol;

	err = 0;

out:
	return err;
}

/* Rebuild an IPv6 header in front of a BEET-mode payload, taking the
 * addresses from the state's selector.  Returns 0 on success or the
 * skb_cow_head() error.
 */
static int xfrm6_remove_beet_encap(struct xfrm_state *x, struct sk_buff *skb)
{
	struct ipv6hdr *ip6h;
	int size = sizeof(struct ipv6hdr);
	int err;

	skb->protocol = htons(ETH_P_IPV6);

	/* Ensure writable headroom for the header plus the MAC header. */
	err = skb_cow_head(skb, size + skb->mac_len);
	if (err)
		goto out;

	__skb_push(skb, size);
	skb_reset_network_header(skb);
	skb_mac_header_rebuild(skb);

	xfrm6_beet_make_header(skb);

	ip6h = ipv6_hdr(skb);
	ip6h->payload_len = htons(skb->len - size);
	ip6h->daddr = x->sel.daddr.in6;
	ip6h->saddr = x->sel.saddr.in6;
	err = 0;
out:
	return err;
}

/* Remove encapsulation header.
 *
 * The IP header will be moved over the top of the encapsulation
 * header.
 *
 * On entry, the transport header shall point to where the IP header
 * should be and the network header shall be set to where the IP
 * header currently is.  skb->data shall point to the start of the
 * payload.
 *
 * Dispatches on the state's mode: BEET selects by selector family,
 * TUNNEL selects by the recorded inner protocol (-EINVAL if it is
 * neither IPIP nor IPv6).  Anything else warns once and returns
 * -EOPNOTSUPP.
 */
static int
xfrm_inner_mode_encap_remove(struct xfrm_state *x,
			     struct sk_buff *skb)
{
	switch (x->props.mode) {
	case XFRM_MODE_BEET:
		switch (x->sel.family) {
		case AF_INET:
			return xfrm4_remove_beet_encap(x, skb);
		case AF_INET6:
			return xfrm6_remove_beet_encap(x, skb);
		}
		break;
	case XFRM_MODE_TUNNEL:
		switch (XFRM_MODE_SKB_CB(skb)->protocol) {
		case IPPROTO_IPIP:
			return xfrm4_remove_tunnel_encap(x, skb);
		case IPPROTO_IPV6:
			return xfrm6_remove_tunnel_encap(x, skb);
			/* NOTE(review): this break follows a return and
			 * cannot be reached.
			 */
			break;
		}
		return -EINVAL;
	}

	WARN_ON_ONCE(1);
	return -EOPNOTSUPP;
}

/* Extract the outer header for the state's family, then remove the
 * encapsulation.  Returns -EAFNOSUPPORT (with a one-time warning) for
 * a family other than AF_INET/AF_INET6, otherwise the result of
 * xfrm_inner_mode_encap_remove().
 */
static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
{
	switch (x->props.family) {
	case AF_INET:
		xfrm4_extract_header(skb);
		break;
	case AF_INET6:
		xfrm6_extract_header(skb);
		break;
	default:
		WARN_ON_ONCE(1);
		return -EAFNOSUPPORT;
	}

	return xfrm_inner_mode_encap_remove(x, skb);
}

/* Remove encapsulation header.
 *
 * The IP header will be moved over the top of the encapsulation header.
 *
 * On entry, skb_transport_header() shall point to where the IP header
 * should be and skb_network_header() shall be set to where the IP header
 * currently is.  skb->data shall point to the start of the payload.
*/ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); int ihl = skb->data - skb_transport_header(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), skb_network_header(skb), ihl); if (xo) xo->orig_mac_len = skb_mac_header_was_set(skb) ? skb_mac_header_len(skb) : 0; skb->network_header = skb->transport_header; } ip_hdr(skb)->tot_len = htons(skb->len + ihl); skb_reset_transport_header(skb); return 0; } static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) struct xfrm_offload *xo = xfrm_offload(skb); int ihl = skb->data - skb_transport_header(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), skb_network_header(skb), ihl); if (xo) xo->orig_mac_len = skb_mac_header_was_set(skb) ? skb_mac_header_len(skb) : 0; skb->network_header = skb->transport_header; } ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - sizeof(struct ipv6hdr)); skb_reset_transport_header(skb); return 0; #else WARN_ON_ONCE(1); return -EAFNOSUPPORT; #endif } static int xfrm_inner_mode_input(struct xfrm_state *x, struct sk_buff *skb) { switch (x->props.mode) { case XFRM_MODE_BEET: case XFRM_MODE_TUNNEL: return xfrm_prepare_input(x, skb); case XFRM_MODE_TRANSPORT: if (x->props.family == AF_INET) return xfrm4_transport_input(x, skb); if (x->props.family == AF_INET6) return xfrm6_transport_input(x, skb); break; case XFRM_MODE_ROUTEOPTIMIZATION: WARN_ON_ONCE(1); break; default: if (x->mode_cbs && x->mode_cbs->input) return x->mode_cbs->input(x, skb); WARN_ON_ONCE(1); break; } return -EOPNOTSUPP; } /* NOTE: encap_type - In addition to the normal (non-negative) values for * encap_type, a negative value of -1 or -2 can be used to resume/restart this * function after a previous invocation early terminated for async operation. 
*/ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { const struct xfrm_state_afinfo *afinfo; struct net *net = dev_net(skb->dev); int err; __be32 seq; __be32 seq_hi; struct xfrm_state *x = NULL; xfrm_address_t *daddr; u32 mark = skb->mark; unsigned int family = AF_UNSPEC; int decaps = 0; int async = 0; bool xfrm_gro = false; bool crypto_done = false; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; if (encap_type < 0 || (xo && (xo->flags & XFRM_GRO || encap_type == 0 || encap_type == UDP_ENCAP_ESPINUDP))) { x = xfrm_input_state(skb); if (unlikely(x->km.state != XFRM_STATE_VALID)) { if (x->km.state == XFRM_STATE_ACQ) XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); else XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); if (encap_type == -1) dev_put(skb->dev); goto drop; } family = x->props.family; /* An encap_type of -2 indicates reconstructed inner packet */ if (encap_type == -2) goto resume_decapped; /* An encap_type of -1 indicates async resumption. 
*/ if (encap_type == -1) { async = 1; dev_put(skb->dev); seq = XFRM_SKB_CB(skb)->seq.input.low; spin_lock(&x->lock); goto resume; } /* GRO call */ seq = XFRM_SPI_SKB_CB(skb)->seq; if (xo && (xo->flags & CRYPTO_DONE)) { crypto_done = true; family = XFRM_SPI_SKB_CB(skb)->family; if (!(xo->status & CRYPTO_SUCCESS)) { if (xo->status & (CRYPTO_TRANSPORT_AH_AUTH_FAILED | CRYPTO_TRANSPORT_ESP_AUTH_FAILED | CRYPTO_TUNNEL_AH_AUTH_FAILED | CRYPTO_TUNNEL_ESP_AUTH_FAILED)) { xfrm_audit_state_icvfail(x, skb, x->type->proto); x->stats.integrity_failed++; XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); goto drop; } if (xo->status & CRYPTO_INVALID_PROTOCOL) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); goto drop; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } if (xfrm_parse_spi(skb, nexthdr, &spi, &seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } nexthdr = x->type_offload->input_tail(x, skb); } goto process; } family = XFRM_SPI_SKB_CB(skb)->family; /* if tunnel is present override skb->mark value with tunnel i_key */ switch (family) { case AF_INET: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key); break; case AF_INET6: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key); break; } sp = secpath_set(skb); if (!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } seq = 0; if (!spi && xfrm_parse_spi(skb, nexthdr, &spi, &seq)) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } daddr = (xfrm_address_t *)(skb_network_header(skb) + XFRM_SPI_SKB_CB(skb)->daddroff); do { sp = skb_sec_path(skb); if (sp->len == XFRM_MAX_DEPTH) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } x = xfrm_input_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); 
xfrm_audit_state_notfound(skb, family, spi, seq); goto drop; } if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR); xfrm_audit_state_notfound(skb, family, spi, seq); xfrm_state_put(x); x = NULL; goto drop; } skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; skb_dst_force(skb); if (!skb_dst(skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } process: seq_hi = htonl(xfrm_replay_seqhi(x, seq)); XFRM_SKB_CB(skb)->seq.input.low = seq; XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; spin_lock(&x->lock); if (unlikely(x->km.state != XFRM_STATE_VALID)) { if (x->km.state == XFRM_STATE_ACQ) XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); else XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); goto drop_unlock; } if ((x->encap ? x->encap->encap_type : 0) != encap_type) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); goto drop_unlock; } if (xfrm_replay_check(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } if (xfrm_state_check_expire(x)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEEXPIRED); goto drop_unlock; } if (xfrm_tunnel_check(skb, x, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop_unlock; } if (!crypto_done) { spin_unlock(&x->lock); dev_hold(skb->dev); nexthdr = x->type->input(x, skb); if (nexthdr == -EINPROGRESS) return 0; dev_put(skb->dev); spin_lock(&x->lock); } resume: if (nexthdr < 0) { if (nexthdr == -EBADMSG) { xfrm_audit_state_icvfail(x, skb, x->type->proto); x->stats.integrity_failed++; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); goto drop_unlock; } /* only the first xfrm gets the encap type */ encap_type = 0; if (!crypto_done && xfrm_replay_recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } xfrm_replay_advance(x, seq); x->curlft.bytes += skb->len; x->curlft.packets++; x->lastused = ktime_get_real_seconds(); spin_unlock(&x->lock); 
XFRM_MODE_SKB_CB(skb)->protocol = nexthdr; err = xfrm_inner_mode_input(x, skb); if (err == -EINPROGRESS) return 0; else if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop; } resume_decapped: if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) { decaps = 1; break; } /* * We need the inner address. However, we only get here for * transport mode so the outer address is identical. */ daddr = &x->id.daddr; family = x->props.family; err = xfrm_parse_spi(skb, nexthdr, &spi, &seq); if (err < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } crypto_done = false; } while (!err); err = xfrm_rcv_cb(skb, family, x->type->proto, 0); if (err) goto drop; nf_reset_ct(skb); if (decaps) { sp = skb_sec_path(skb); if (sp) sp->olen = 0; if (skb_valid_dst(skb)) skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return 0; } else { xo = xfrm_offload(skb); if (xo) xfrm_gro = xo->flags & XFRM_GRO; err = -EAFNOSUPPORT; rcu_read_lock(); afinfo = xfrm_state_afinfo_get_rcu(x->props.family); if (likely(afinfo)) err = afinfo->transport_finish(skb, xfrm_gro || async); rcu_read_unlock(); if (xfrm_gro) { sp = skb_sec_path(skb); if (sp) sp->olen = 0; if (skb_valid_dst(skb)) skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return err; } return err; } drop_unlock: spin_unlock(&x->lock); drop: xfrm_rcv_cb(skb, family, x && x->type ? 
x->type->proto : nexthdr, -1); kfree_skb(skb); return 0; } EXPORT_SYMBOL(xfrm_input);

/*
 * xfrm_input_resume - re-enter xfrm_input() for @skb.
 * @skb: packet to continue processing
 * @nexthdr: value forwarded unchanged as xfrm_input()'s nexthdr argument
 *
 * Calls xfrm_input() with spi 0 and encap_type -1.  xfrm_input() tests
 * encap_type == -1 to mark the call as asynchronous and to jump straight
 * to its "resume" label.
 *
 * Return: whatever xfrm_input() returns.
 */
int xfrm_input_resume(struct sk_buff *skb, int nexthdr)
{
	return xfrm_input(skb, nexthdr, 0, -1);
}
EXPORT_SYMBOL(xfrm_input_resume);

/*
 * xfrm_trans_reinject - work handler that drains one xfrm_trans_tasklet queue.
 * @work: the work_struct embedded in a struct xfrm_trans_tasklet
 *
 * All skbs currently queued are moved to a private list while holding
 * queue_lock, so the lock is not held while the callbacks run.  Each skb's
 * stored finish() callback is then invoked with the stored net, a NULL
 * socket and the skb, between local_bh_disable() and local_bh_enable().
 */
static void xfrm_trans_reinject(struct work_struct *work)
{
	struct xfrm_trans_tasklet *trans = container_of(work, struct xfrm_trans_tasklet, work);
	struct sk_buff_head queue;
	struct sk_buff *skb;

	__skb_queue_head_init(&queue);
	/* Detach everything queued so far; later arrivals wait for the next run. */
	spin_lock_bh(&trans->queue_lock);
	skb_queue_splice_init(&trans->queue, &queue);
	spin_unlock_bh(&trans->queue_lock);

	local_bh_disable();
	while ((skb = __skb_dequeue(&queue)))
		XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net,
					       NULL, skb);
	local_bh_enable();
}

/*
 * xfrm_trans_queue_net - defer @finish(@net, NULL, @skb) to the reinject work.
 * @net: network namespace later handed to @finish
 * @skb: packet to queue
 * @finish: callback run from xfrm_trans_reinject()
 *
 * The skb is appended to the xfrm_trans_tasklet obtained with this_cpu_ptr()
 * and that structure's work item is scheduled.  @finish and @net are stored
 * through XFRM_TRANS_SKB_CB(skb); the BUILD_BUG_ON checks that
 * struct xfrm_trans_cb fits into skb->cb.
 *
 * Return: 0 on success, -ENOBUFS when the queue already holds at least
 * net_hotdata.max_backlog packets (the skb is not queued in that case).
 */
int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
			 int (*finish)(struct net *, struct sock *,
				       struct sk_buff *))
{
	struct xfrm_trans_tasklet *trans;

	trans = this_cpu_ptr(&xfrm_trans_tasklet);

	/* Queue length is read here without taking queue_lock. */
	if (skb_queue_len(&trans->queue) >= READ_ONCE(net_hotdata.max_backlog))
		return -ENOBUFS;

	BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb));

	XFRM_TRANS_SKB_CB(skb)->finish = finish;
	XFRM_TRANS_SKB_CB(skb)->net = net;
	spin_lock_bh(&trans->queue_lock);
	__skb_queue_tail(&trans->queue, skb);
	spin_unlock_bh(&trans->queue_lock);
	schedule_work(&trans->work);
	return 0;
}
EXPORT_SYMBOL(xfrm_trans_queue_net);

/*
 * xfrm_trans_queue - xfrm_trans_queue_net() using the namespace of skb->dev.
 * @skb: packet to queue; skb->dev is dereferenced to find the namespace
 * @finish: callback run from xfrm_trans_reinject()
 *
 * Return: the result of xfrm_trans_queue_net().
 */
int xfrm_trans_queue(struct sk_buff *skb,
		     int (*finish)(struct net *, struct sock *,
				   struct sk_buff *))
{
	return xfrm_trans_queue_net(dev_net(skb->dev), skb, finish);
}
EXPORT_SYMBOL(xfrm_trans_queue);

/*
 * xfrm_input_init - boot-time setup of the xfrm input path.
 *
 * Allocates the dummy netdev stored in xfrm_napi_dev (panics if that fails),
 * initialises gro_cells on it, and prepares the lock, queue and work item of
 * the xfrm_trans_tasklet instance of every possible CPU.
 */
void __init xfrm_input_init(void)
{
	int err;
	int i;

	xfrm_napi_dev = alloc_netdev_dummy(0);
	if (!xfrm_napi_dev)
		panic("Failed to allocate XFRM dummy netdev\n");

	err = gro_cells_init(&gro_cells, xfrm_napi_dev);
	/* A gro_cells_init() failure is not fatal: cells is just cleared. */
	if (err)
		gro_cells.cells = NULL;

	for_each_possible_cpu(i) {
		struct xfrm_trans_tasklet *trans;

		trans = &per_cpu(xfrm_trans_tasklet, i);
		spin_lock_init(&trans->queue_lock);
		__skb_queue_head_init(&trans->queue);
		INIT_WORK(&trans->work, xfrm_trans_reinject);
	}
}
402 401 402 402 403 5 399 100 68 92 72 42 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/ipv6.h> #include <linux/in6.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmp.h> #include <linux/rcupdate.h> #include <linux/sysctl.h> #include <net/ipv6_frag.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #endif #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> static DEFINE_MUTEX(defrag6_mutex); static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, struct sk_buff *skb) { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (skb_nfct(skb)) { enum ip_conntrack_info ctinfo; const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); } #endif if (nf_bridge_in_prerouting(skb)) return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; if (hooknum == 
NF_INET_PRE_ROUTING) return IP6_DEFRAG_CONNTRACK_IN + zone_id; else return IP6_DEFRAG_CONNTRACK_OUT + zone_id; } static unsigned int ipv6_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int err; #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Previously seen (loopback)? */ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; if (skb->_nfct == IP_CT_UNTRACKED) return NF_ACCEPT; #endif err = nf_ct_frag6_gather(state->net, skb, nf_ct6_defrag_user(state->hook, skb)); /* queued */ if (err == -EINPROGRESS) return NF_STOLEN; return err == 0 ? NF_ACCEPT : NF_DROP; } static const struct nf_hook_ops ipv6_defrag_ops[] = { { .hook = ipv6_defrag, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv6_defrag, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, }, }; static void __net_exit defrag6_net_exit(struct net *net) { if (net->nf.defrag_ipv6_users) { nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); net->nf.defrag_ipv6_users = 0; } } static const struct nf_defrag_hook defrag_hook = { .owner = THIS_MODULE, .enable = nf_defrag_ipv6_enable, .disable = nf_defrag_ipv6_disable, }; static struct pernet_operations defrag6_net_ops = { .exit = defrag6_net_exit, }; static int __init nf_defrag_init(void) { int ret = 0; ret = nf_ct_frag6_init(); if (ret < 0) { pr_err("nf_defrag_ipv6: can't initialize frag6.\n"); return ret; } ret = register_pernet_subsys(&defrag6_net_ops); if (ret < 0) { pr_err("nf_defrag_ipv6: can't register pernet ops\n"); goto cleanup_frag6; } rcu_assign_pointer(nf_defrag_v6_hook, &defrag_hook); return ret; cleanup_frag6: nf_ct_frag6_cleanup(); return ret; } static void __exit nf_defrag_fini(void) { rcu_assign_pointer(nf_defrag_v6_hook, NULL); unregister_pernet_subsys(&defrag6_net_ops); nf_ct_frag6_cleanup(); } int nf_defrag_ipv6_enable(struct net *net) { int err = 0; 
mutex_lock(&defrag6_mutex); if (net->nf.defrag_ipv6_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } if (net->nf.defrag_ipv6_users) { net->nf.defrag_ipv6_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); if (err == 0) net->nf.defrag_ipv6_users = 1; out_unlock: mutex_unlock(&defrag6_mutex); return err; } EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); void nf_defrag_ipv6_disable(struct net *net) { mutex_lock(&defrag6_mutex); if (net->nf.defrag_ipv6_users) { net->nf.defrag_ipv6_users--; if (net->nf.defrag_ipv6_users == 0) nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); } mutex_unlock(&defrag6_mutex); } EXPORT_SYMBOL_GPL(nf_defrag_ipv6_disable); module_init(nf_defrag_init); module_exit(nf_defrag_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv6 defragmentation support");
35 57 13 6 216 4 57 28 28 6 22 1 10 1 1 9 1 1 2 1 3 9 12 1 1 20 107 5 107 25 48 1 1 2 6 1 3 1 1 106 106 2 27 2 1 119 4 1 19 1 8 10 9 3 1 15 9 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 
491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 
991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 
1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 
1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 
2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 
2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 
2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 
3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 
3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 
4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH * Copyright (C) 2018, 2020-2025 Intel Corporation */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cfg80211 #if !defined(__RDEV_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ) #define __RDEV_OPS_TRACE #include <linux/tracepoint.h> #include <linux/rtnetlink.h> #include <linux/etherdevice.h> #include <net/cfg80211.h> #include "core.h" #define MAC_ENTRY(entry_mac) __array(u8, entry_mac, ETH_ALEN) #define MAC_ASSIGN(entry_mac, given_mac) do { \ if (given_mac) \ memcpy(__entry->entry_mac, given_mac, ETH_ALEN); \ else \ eth_zero_addr(__entry->entry_mac); \ } while (0) #define MAXNAME 32 #define WIPHY_ENTRY __array(char, wiphy_name, 32) #define WIPHY_ASSIGN strscpy(__entry->wiphy_name, wiphy_name(wiphy), MAXNAME) #define WIPHY_PR_FMT "%s" #define WIPHY_PR_ARG __entry->wiphy_name #define WDEV_ENTRY __field(u32, id) #define WDEV_ASSIGN (__entry->id) = (!IS_ERR_OR_NULL(wdev) \ ? 
wdev->identifier : 0) #define WDEV_PR_FMT "wdev(%u)" #define WDEV_PR_ARG (__entry->id) #define NETDEV_ENTRY __array(char, name, IFNAMSIZ) \ __field(int, ifindex) #define NETDEV_ASSIGN \ do { \ memcpy(__entry->name, netdev->name, IFNAMSIZ); \ (__entry->ifindex) = (netdev->ifindex); \ } while (0) #define NETDEV_PR_FMT "netdev:%s(%d)" #define NETDEV_PR_ARG __entry->name, __entry->ifindex #define MESH_CFG_ENTRY __field(u16, dot11MeshRetryTimeout) \ __field(u16, dot11MeshConfirmTimeout) \ __field(u16, dot11MeshHoldingTimeout) \ __field(u16, dot11MeshMaxPeerLinks) \ __field(u8, dot11MeshMaxRetries) \ __field(u8, dot11MeshTTL) \ __field(u8, element_ttl) \ __field(bool, auto_open_plinks) \ __field(u32, dot11MeshNbrOffsetMaxNeighbor) \ __field(u8, dot11MeshHWMPmaxPREQretries) \ __field(u32, path_refresh_time) \ __field(u32, dot11MeshHWMPactivePathTimeout) \ __field(u16, min_discovery_timeout) \ __field(u16, dot11MeshHWMPpreqMinInterval) \ __field(u16, dot11MeshHWMPperrMinInterval) \ __field(u16, dot11MeshHWMPnetDiameterTraversalTime) \ __field(u8, dot11MeshHWMPRootMode) \ __field(u16, dot11MeshHWMPRannInterval) \ __field(bool, dot11MeshGateAnnouncementProtocol) \ __field(bool, dot11MeshForwarding) \ __field(s32, rssi_threshold) \ __field(u16, ht_opmode) \ __field(u32, dot11MeshHWMPactivePathToRootTimeout) \ __field(u16, dot11MeshHWMProotInterval) \ __field(u16, dot11MeshHWMPconfirmationInterval) \ __field(bool, dot11MeshNolearn) #define MESH_CFG_ASSIGN \ do { \ __entry->dot11MeshRetryTimeout = conf->dot11MeshRetryTimeout; \ __entry->dot11MeshConfirmTimeout = \ conf->dot11MeshConfirmTimeout; \ __entry->dot11MeshHoldingTimeout = \ conf->dot11MeshHoldingTimeout; \ __entry->dot11MeshMaxPeerLinks = conf->dot11MeshMaxPeerLinks; \ __entry->dot11MeshMaxRetries = conf->dot11MeshMaxRetries; \ __entry->dot11MeshTTL = conf->dot11MeshTTL; \ __entry->element_ttl = conf->element_ttl; \ __entry->auto_open_plinks = conf->auto_open_plinks; \ __entry->dot11MeshNbrOffsetMaxNeighbor = \ 
conf->dot11MeshNbrOffsetMaxNeighbor; \ __entry->dot11MeshHWMPmaxPREQretries = \ conf->dot11MeshHWMPmaxPREQretries; \ __entry->path_refresh_time = conf->path_refresh_time; \ __entry->dot11MeshHWMPactivePathTimeout = \ conf->dot11MeshHWMPactivePathTimeout; \ __entry->min_discovery_timeout = conf->min_discovery_timeout; \ __entry->dot11MeshHWMPpreqMinInterval = \ conf->dot11MeshHWMPpreqMinInterval; \ __entry->dot11MeshHWMPperrMinInterval = \ conf->dot11MeshHWMPperrMinInterval; \ __entry->dot11MeshHWMPnetDiameterTraversalTime = \ conf->dot11MeshHWMPnetDiameterTraversalTime; \ __entry->dot11MeshHWMPRootMode = conf->dot11MeshHWMPRootMode; \ __entry->dot11MeshHWMPRannInterval = \ conf->dot11MeshHWMPRannInterval; \ __entry->dot11MeshGateAnnouncementProtocol = \ conf->dot11MeshGateAnnouncementProtocol; \ __entry->dot11MeshForwarding = conf->dot11MeshForwarding; \ __entry->rssi_threshold = conf->rssi_threshold; \ __entry->ht_opmode = conf->ht_opmode; \ __entry->dot11MeshHWMPactivePathToRootTimeout = \ conf->dot11MeshHWMPactivePathToRootTimeout; \ __entry->dot11MeshHWMProotInterval = \ conf->dot11MeshHWMProotInterval; \ __entry->dot11MeshHWMPconfirmationInterval = \ conf->dot11MeshHWMPconfirmationInterval; \ __entry->dot11MeshNolearn = conf->dot11MeshNolearn; \ } while (0) #define CHAN_ENTRY __field(enum nl80211_band, band) \ __field(u32, center_freq) \ __field(u16, freq_offset) #define CHAN_ASSIGN(chan) \ do { \ if (chan) { \ __entry->band = chan->band; \ __entry->center_freq = chan->center_freq; \ __entry->freq_offset = chan->freq_offset; \ } else { \ __entry->band = 0; \ __entry->center_freq = 0; \ __entry->freq_offset = 0; \ } \ } while (0) #define CHAN_PR_FMT "band: %d, freq: %u.%03u" #define CHAN_PR_ARG __entry->band, __entry->center_freq, __entry->freq_offset #define CHAN_DEF_ENTRY __field(enum nl80211_band, band) \ __field(u32, control_freq) \ __field(u32, freq_offset) \ __field(u32, width) \ __field(u32, center_freq1) \ __field(u32, freq1_offset) \ __field(u32, 
center_freq2) \ __field(u16, punctured)

/*
 * Copy a channel definition into the entry. When @chandef or its ->chan
 * pointer is NULL every field is stored as zero.
 */
#define CHAN_DEF_ASSIGN(chandef)					\
	do {								\
		if ((chandef) && (chandef)->chan) {			\
			__entry->band = (chandef)->chan->band;		\
			__entry->control_freq =				\
				(chandef)->chan->center_freq;		\
			__entry->freq_offset =				\
				(chandef)->chan->freq_offset;		\
			__entry->width = (chandef)->width;		\
			__entry->center_freq1 = (chandef)->center_freq1;\
			__entry->freq1_offset = (chandef)->freq1_offset;\
			__entry->center_freq2 = (chandef)->center_freq2;\
			__entry->punctured = (chandef)->punctured;	\
		} else {						\
			__entry->band = 0;				\
			__entry->control_freq = 0;			\
			__entry->freq_offset = 0;			\
			__entry->width = 0;				\
			__entry->center_freq1 = 0;			\
			__entry->freq1_offset = 0;			\
			__entry->center_freq2 = 0;			\
			__entry->punctured = 0;				\
		}							\
	} while (0)
#define CHAN_DEF_PR_FMT							\
	"band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u, punct: 0x%x"
#define CHAN_DEF_PR_ARG __entry->band, __entry->control_freq,		\
			__entry->freq_offset, __entry->width,		\
			__entry->center_freq1, __entry->freq1_offset,	\
			__entry->center_freq2, __entry->punctured

/*
 * Copy macaddr and kek_len from @fa, or zero both when @fa is NULL.
 * @fa is parenthesized at each dereference so the macro is safe for any
 * pointer expression, matching CHAN_DEF_ASSIGN above.
 */
#define FILS_AAD_ASSIGN(fa)						\
	do {								\
		if (fa) {						\
			ether_addr_copy(__entry->macaddr, (fa)->macaddr); \
			__entry->kek_len = (fa)->kek_len;		\
		} else {						\
			eth_zero_addr(__entry->macaddr);		\
			__entry->kek_len = 0;				\
		}							\
	} while (0)
#define FILS_AAD_PR_FMT							\
	"macaddr: %pM, kek_len: %d"

/* Entry fields for the station_info members copied by SINFO_ASSIGN */
#define SINFO_ENTRY __field(int, generation)	    \
		    __field(u32, connected_time)    \
		    __field(u32, inactive_time)	    \
		    __field(u32, rx_bytes)	    \
		    __field(u32, tx_bytes)	    \
		    __field(u32, rx_packets)	    \
		    __field(u32, tx_packets)	    \
		    __field(u32, tx_retries)	    \
		    __field(u32, tx_failed)	    \
		    __field(u32, rx_dropped_misc)   \
		    __field(u32, beacon_loss_count) \
		    __field(u16, llid)		    \
		    __field(u16, plid)		    \
		    __field(u8, plink_state)
#define SINFO_ASSIGN \ do { \ __entry->generation = sinfo->generation; \ __entry->connected_time = sinfo->connected_time; \ __entry->inactive_time = sinfo->inactive_time; \ __entry->rx_bytes = sinfo->rx_bytes; \ __entry->tx_bytes = sinfo->tx_bytes; \ __entry->rx_packets = 
sinfo->rx_packets; \ __entry->tx_packets = sinfo->tx_packets; \ __entry->tx_retries = sinfo->tx_retries; \ __entry->tx_failed = sinfo->tx_failed; \ __entry->rx_dropped_misc = sinfo->rx_dropped_misc; \ __entry->beacon_loss_count = sinfo->beacon_loss_count; \ __entry->llid = sinfo->llid; \ __entry->plid = sinfo->plid; \ __entry->plink_state = sinfo->plink_state; \ } while (0)

/*
 * Map a truth value to the string "true" or "false".
 * The whole expansion is parenthesized so the conditional cannot bind to
 * operators surrounding the macro invocation.
 */
#define BOOL_TO_STR(bo) ((bo) ? "true" : "false")

/* Entry storage for a QoS map: exception count, DSCP exceptions, UP ranges */
#define QOS_MAP_ENTRY __field(u8, num_des)			\
		      __array(u8, dscp_exception,		\
			      2 * IEEE80211_QOS_MAP_MAX_EX)	\
		      __array(u8, up, IEEE80211_QOS_MAP_LEN_MIN)
/* Copy @qos_map into the entry, or zero every field when it is NULL */
#define QOS_MAP_ASSIGN(qos_map)					\
	do {							\
		if ((qos_map)) {				\
			__entry->num_des = (qos_map)->num_des;	\
			memcpy(__entry->dscp_exception,		\
			       &(qos_map)->dscp_exception,	\
			       2 * IEEE80211_QOS_MAP_MAX_EX);	\
			memcpy(__entry->up, &(qos_map)->up,	\
			       IEEE80211_QOS_MAP_LEN_MIN);	\
		} else {					\
			__entry->num_des = 0;			\
			memset(__entry->dscp_exception, 0,	\
			       2 * IEEE80211_QOS_MAP_MAX_EX);	\
			memset(__entry->up, 0,			\
			       IEEE80211_QOS_MAP_LEN_MIN);	\
		}						\
	} while (0)

/*************************************************************
 *			wiphy work traces		     *
 *************************************************************/

/* Event class recording a wiphy_work pointer and its callback (NULL-safe) */
DECLARE_EVENT_CLASS(wiphy_work_event,
	TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
	TP_ARGS(wiphy, work),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(void *, instance)
		__field(void *, func)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		__entry->instance = work;
		__entry->func = work ? 
work->func : NULL;
	),
	TP_printk(WIPHY_PR_FMT " instance=%p func=%pS",
		  WIPHY_PR_ARG, __entry->instance, __entry->func)
);

/* Instances of wiphy_work_event: queue, run, cancel and flush */
DEFINE_EVENT(wiphy_work_event, wiphy_work_queue,
	TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
	TP_ARGS(wiphy, work)
);

DEFINE_EVENT(wiphy_work_event, wiphy_work_run,
	TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
	TP_ARGS(wiphy, work)
);

DEFINE_EVENT(wiphy_work_event, wiphy_work_cancel,
	TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
	TP_ARGS(wiphy, work)
);

DEFINE_EVENT(wiphy_work_event, wiphy_work_flush,
	TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
	TP_ARGS(wiphy, work)
);

/*
 * Records a work item, its callback and the requested delay.
 * The delay field is unsigned long, so it is printed with %lu.
 */
TRACE_EVENT(wiphy_delayed_work_queue,
	TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work,
		 unsigned long delay),
	TP_ARGS(wiphy, work, delay),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(void *, instance)
		__field(void *, func)
		__field(unsigned long, delay)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		__entry->instance = work;
		__entry->func = work->func;
		__entry->delay = delay;
	),
	TP_printk(WIPHY_PR_FMT " instance=%p func=%pS delay=%lu",
		  WIPHY_PR_ARG, __entry->instance, __entry->func,
		  __entry->delay)
);

/*
 * Same as above with a ktime_t delay. ktime_t is a signed 64-bit type
 * (s64 in include/linux/ktime.h), hence %lld rather than %llu.
 */
TRACE_EVENT(wiphy_hrtimer_work_queue,
	TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work,
		 ktime_t delay),
	TP_ARGS(wiphy, work, delay),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(void *, instance)
		__field(void *, func)
		__field(ktime_t, delay)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		__entry->instance = work;
		__entry->func = work->func;
		__entry->delay = delay;
	),
	TP_printk(WIPHY_PR_FMT " instance=%p func=%pS delay=%lld",
		  WIPHY_PR_ARG, __entry->instance, __entry->func,
		  __entry->delay)
);

/* Records only the wiphy */
TRACE_EVENT(wiphy_work_worker_start,
	TP_PROTO(struct wiphy *wiphy),
	TP_ARGS(wiphy),
	TP_STRUCT__entry(
		WIPHY_ENTRY
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
	),
	TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
);

/*************************************************************
 *			rdev->ops traces		     *
 *************************************************************/

TRACE_EVENT(rdev_suspend,
	TP_PROTO(struct wiphy 
*wiphy, struct cfg80211_wowlan *wow), TP_ARGS(wiphy, wow), TP_STRUCT__entry( WIPHY_ENTRY __field(bool, any) __field(bool, disconnect) __field(bool, magic_pkt) __field(bool, gtk_rekey_failure) __field(bool, eap_identity_req) __field(bool, four_way_handshake) __field(bool, rfkill_release) __field(bool, valid_wow) ), TP_fast_assign( WIPHY_ASSIGN; if (wow) { __entry->any = wow->any; __entry->disconnect = wow->disconnect; __entry->magic_pkt = wow->magic_pkt; __entry->gtk_rekey_failure = wow->gtk_rekey_failure; __entry->eap_identity_req = wow->eap_identity_req; __entry->four_way_handshake = wow->four_way_handshake; __entry->rfkill_release = wow->rfkill_release; __entry->valid_wow = true; } else { __entry->valid_wow = false; } ), TP_printk(WIPHY_PR_FMT ", wow%s - any: %d, disconnect: %d, " "magic pkt: %d, gtk rekey failure: %d, eap identify req: %d, " "four way handshake: %d, rfkill release: %d.", WIPHY_PR_ARG, __entry->valid_wow ? "" : "(Not configured!)", __entry->any, __entry->disconnect, __entry->magic_pkt, __entry->gtk_rekey_failure, __entry->eap_identity_req, __entry->four_way_handshake, __entry->rfkill_release) ); TRACE_EVENT(rdev_return_int, TP_PROTO(struct wiphy *wiphy, int ret), TP_ARGS(wiphy, ret), TP_STRUCT__entry( WIPHY_ENTRY __field(int, ret) ), TP_fast_assign( WIPHY_ASSIGN; __entry->ret = ret; ), TP_printk(WIPHY_PR_FMT ", returned: %d", WIPHY_PR_ARG, __entry->ret) ); TRACE_EVENT(rdev_scan, TP_PROTO(struct wiphy *wiphy, struct cfg80211_scan_request_int *request), TP_ARGS(wiphy, request), TP_STRUCT__entry( WIPHY_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; ), TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG) ); DECLARE_EVENT_CLASS(wiphy_only_evt, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy), TP_STRUCT__entry( WIPHY_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; ), TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG) ); DEFINE_EVENT(wiphy_only_evt, rdev_resume, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy) ); DEFINE_EVENT(wiphy_only_evt, rdev_return_void, TP_PROTO(struct wiphy *wiphy), 
TP_ARGS(wiphy)
);

/* Records the wiphy and the radio index being queried */
TRACE_EVENT(rdev_get_antenna,
	TP_PROTO(struct wiphy *wiphy, int radio_idx),
	TP_ARGS(wiphy, radio_idx),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(int, radio_idx)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		__entry->radio_idx = radio_idx;
	),
	TP_printk(WIPHY_PR_FMT ", radio_idx: %d",
		  WIPHY_PR_ARG, __entry->radio_idx)
);

DEFINE_EVENT(wiphy_only_evt, rdev_rfkill_poll,
	TP_PROTO(struct wiphy *wiphy),
	TP_ARGS(wiphy)
);

/* Event class: wiphy plus a boolean, printed as "enabled" / "not enabled" */
DECLARE_EVENT_CLASS(wiphy_enabled_evt,
	TP_PROTO(struct wiphy *wiphy, bool enabled),
	TP_ARGS(wiphy, enabled),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(bool, enabled)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		__entry->enabled = enabled;
	),
	TP_printk(WIPHY_PR_FMT ", %senabled ",
		  WIPHY_PR_ARG, __entry->enabled ? "" : "not ")
);

DEFINE_EVENT(wiphy_enabled_evt, rdev_set_wakeup,
	TP_PROTO(struct wiphy *wiphy, bool enabled),
	TP_ARGS(wiphy, enabled)
);

/* Interface name falls back to "<noname>" when @name is NULL */
TRACE_EVENT(rdev_add_virtual_intf,
	TP_PROTO(struct wiphy *wiphy, char *name, enum nl80211_iftype type),
	TP_ARGS(wiphy, name, type),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__string(vir_intf_name, name ? 
name : "<noname>") __field(enum nl80211_iftype, type) ), TP_fast_assign( WIPHY_ASSIGN; __assign_str(vir_intf_name); __entry->type = type; ), TP_printk(WIPHY_PR_FMT ", virtual intf name: %s, type: %d", WIPHY_PR_ARG, __get_str(vir_intf_name), __entry->type) ); DECLARE_EVENT_CLASS(wiphy_wdev_evt, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), TP_ARGS(wiphy, wdev), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG) ); DECLARE_EVENT_CLASS(wiphy_wdev_cookie_evt, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie), TP_ARGS(wiphy, wdev, cookie), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY __field(u64, cookie) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; __entry->cookie = cookie; ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %lld", WIPHY_PR_ARG, WDEV_PR_ARG, (unsigned long long)__entry->cookie) ); DEFINE_EVENT(wiphy_wdev_evt, rdev_return_wdev, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), TP_ARGS(wiphy, wdev) ); DEFINE_EVENT(wiphy_wdev_evt, rdev_del_virtual_intf, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), TP_ARGS(wiphy, wdev) ); TRACE_EVENT(rdev_change_virtual_intf, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, enum nl80211_iftype type), TP_ARGS(wiphy, netdev, type), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(enum nl80211_iftype, type) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; __entry->type = type; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", type: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->type) ); DECLARE_EVENT_CLASS(key_handle, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr), TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(mac_addr) __field(int, link_id) __field(u8, key_index) __field(bool, pairwise) ), TP_fast_assign( WIPHY_ASSIGN; 
NETDEV_ASSIGN; MAC_ASSIGN(mac_addr, mac_addr); __entry->link_id = link_id; __entry->key_index = key_index; __entry->pairwise = pairwise; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " "key_index: %u, pairwise: %s, mac addr: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, __entry->key_index, BOOL_TO_STR(__entry->pairwise), __entry->mac_addr) ); DEFINE_EVENT(key_handle, rdev_get_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr), TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr) ); DEFINE_EVENT(key_handle, rdev_del_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr), TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr) ); TRACE_EVENT(rdev_add_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, u8 mode), TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr, mode), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(mac_addr) __field(int, link_id) __field(u8, key_index) __field(bool, pairwise) __field(u8, mode) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(mac_addr, mac_addr); __entry->link_id = link_id; __entry->key_index = key_index; __entry->pairwise = pairwise; __entry->mode = mode; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, " "key_index: %u, mode: %u, pairwise: %s, " "mac addr: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, __entry->key_index, __entry->mode, BOOL_TO_STR(__entry->pairwise), __entry->mac_addr) ); TRACE_EVENT(rdev_set_default_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast), TP_ARGS(wiphy, netdev, link_id, key_index, unicast, multicast), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(int, link_id) __field(u8, key_index) __field(bool, unicast) __field(bool, multicast) ), 
TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->link_id = link_id;
		__entry->key_index = key_index;
		__entry->unicast = unicast;
		__entry->multicast = multicast;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
		  "key index: %u, unicast: %s, multicast: %s",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id,
		  __entry->key_index, BOOL_TO_STR(__entry->unicast),
		  BOOL_TO_STR(__entry->multicast))
);

/* Records the link id and key index for the default management key */
TRACE_EVENT(rdev_set_default_mgmt_key,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
		 u8 key_index),
	TP_ARGS(wiphy, netdev, link_id, key_index),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(int, link_id)
		__field(u8, key_index)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->link_id = link_id;
		__entry->key_index = key_index;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
		  "key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG,
		  __entry->link_id, __entry->key_index)
);

/* Same layout as rdev_set_default_mgmt_key, for the default beacon key */
TRACE_EVENT(rdev_set_default_beacon_key,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
		 u8 key_index),
	TP_ARGS(wiphy, netdev, link_id, key_index),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(int, link_id)
		__field(u8, key_index)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->link_id = link_id;
		__entry->key_index = key_index;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
		  "key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG,
		  __entry->link_id, __entry->key_index)
);

/* AP start: channel definition, beacon timing, SSID and security settings */
TRACE_EVENT(rdev_start_ap,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_ap_settings *settings),
	TP_ARGS(wiphy, netdev, settings),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		CHAN_DEF_ENTRY
		__field(int, beacon_interval)
		__field(int, dtim_period)
		__array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
		__field(enum nl80211_hidden_ssid, hidden_ssid)
		__field(u32, wpa_ver)
		__field(bool, privacy)
		__field(enum nl80211_auth_type, auth_type)
		__field(int, inactivity_timeout)
		__field(unsigned int, link_id)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
NETDEV_ASSIGN; CHAN_DEF_ASSIGN(&settings->chandef); __entry->beacon_interval = settings->beacon_interval; __entry->dtim_period = settings->dtim_period; __entry->hidden_ssid = settings->hidden_ssid; __entry->wpa_ver = settings->crypto.wpa_versions; __entry->privacy = settings->privacy; __entry->auth_type = settings->auth_type; __entry->inactivity_timeout = settings->inactivity_timeout; memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1); memcpy(__entry->ssid, settings->ssid, settings->ssid_len); __entry->link_id = settings->beacon.link_id; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", AP settings - ssid: %s, " CHAN_DEF_PR_FMT ", beacon interval: %d, dtim period: %d, " "hidden ssid: %d, wpa versions: %u, privacy: %s, " "auth type: %d, inactivity timeout: %d, link_id: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ssid, CHAN_DEF_PR_ARG, __entry->beacon_interval, __entry->dtim_period, __entry->hidden_ssid, __entry->wpa_ver, BOOL_TO_STR(__entry->privacy), __entry->auth_type, __entry->inactivity_timeout, __entry->link_id) ); TRACE_EVENT(rdev_change_beacon, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_ap_update *info), TP_ARGS(wiphy, netdev, info), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(int, link_id) __dynamic_array(u8, head, info->beacon.head_len) __dynamic_array(u8, tail, info->beacon.tail_len) __dynamic_array(u8, beacon_ies, info->beacon.beacon_ies_len) __dynamic_array(u8, proberesp_ies, info->beacon.proberesp_ies_len) __dynamic_array(u8, assocresp_ies, info->beacon.assocresp_ies_len) __dynamic_array(u8, probe_resp, info->beacon.probe_resp_len) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; __entry->link_id = info->beacon.link_id; if (info->beacon.head) memcpy(__get_dynamic_array(head), info->beacon.head, info->beacon.head_len); if (info->beacon.tail) memcpy(__get_dynamic_array(tail), info->beacon.tail, info->beacon.tail_len); if (info->beacon.beacon_ies) memcpy(__get_dynamic_array(beacon_ies), info->beacon.beacon_ies, 
info->beacon.beacon_ies_len); if (info->beacon.proberesp_ies) memcpy(__get_dynamic_array(proberesp_ies), info->beacon.proberesp_ies, info->beacon.proberesp_ies_len); if (info->beacon.assocresp_ies) memcpy(__get_dynamic_array(assocresp_ies), info->beacon.assocresp_ies, info->beacon.assocresp_ies_len); if (info->beacon.probe_resp) memcpy(__get_dynamic_array(probe_resp), info->beacon.probe_resp, info->beacon.probe_resp_len); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id:%d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id) ); TRACE_EVENT(rdev_stop_ap, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, unsigned int link_id), TP_ARGS(wiphy, netdev, link_id), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(unsigned int, link_id) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; __entry->link_id = link_id; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id) ); DECLARE_EVENT_CLASS(wiphy_netdev_evt, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_set_rekey_data, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_get_mesh_config, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_mesh, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ibss, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ocb, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_flush_pmksa, TP_PROTO(struct wiphy *wiphy, struct net_device 
*netdev),
	TP_ARGS(wiphy, netdev)
);

/*
 * Records the link on which CAC is ended.
 * link_id is unsigned int, so it is printed with %u.
 */
TRACE_EVENT(rdev_end_cac,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 unsigned int link_id),
	TP_ARGS(wiphy, netdev, link_id),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(unsigned int, link_id)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->link_id = link_id;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %u",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id)
);

/*
 * Event class shared by station add/change: captures the station MAC,
 * the scalar station parameters, fixed-size HT/VHT capability blobs, the
 * VLAN name and several variable-length arrays sized from @params.
 */
DECLARE_EVENT_CLASS(station_add_change,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
		 struct station_parameters *params),
	TP_ARGS(wiphy, netdev, mac, params),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(sta_mac)
		__field(u32, sta_flags_mask)
		__field(u32, sta_flags_set)
		__field(u32, sta_modify_mask)
		__field(int, listen_interval)
		__field(u16, capability)
		__field(u16, aid)
		__field(u8, plink_action)
		__field(u8, plink_state)
		__field(u8, uapsd_queues)
		__field(u8, max_sp)
		__field(u8, opmode_notif)
		__field(bool, opmode_notif_used)
		__array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap))
		__array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap))
		__array(char, vlan, IFNAMSIZ)
		__dynamic_array(u8, supported_rates,
				params->link_sta_params.supported_rates_len)
		__dynamic_array(u8, ext_capab, params->ext_capab_len)
		__dynamic_array(u8, supported_channels,
				params->supported_channels_len)
		__dynamic_array(u8, supported_oper_classes,
				params->supported_oper_classes_len)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(sta_mac, mac);
		__entry->sta_flags_mask = params->sta_flags_mask;
		__entry->sta_flags_set = params->sta_flags_set;
		__entry->sta_modify_mask = params->sta_modify_mask;
		__entry->listen_interval = params->listen_interval;
		__entry->aid = params->aid;
		__entry->plink_action = params->plink_action;
		__entry->plink_state = params->plink_state;
		__entry->uapsd_queues = params->uapsd_queues;
		/* capability blobs default to all-zero when not supplied */
		memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap));
		if (params->link_sta_params.ht_capa)
			memcpy(__entry->ht_capa,
params->link_sta_params.ht_capa, sizeof(struct ieee80211_ht_cap)); memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap)); if (params->link_sta_params.vht_capa) memcpy(__entry->vht_capa, params->link_sta_params.vht_capa, sizeof(struct ieee80211_vht_cap)); memset(__entry->vlan, 0, sizeof(__entry->vlan)); if (params->vlan) memcpy(__entry->vlan, params->vlan->name, IFNAMSIZ); if (params->link_sta_params.supported_rates && params->link_sta_params.supported_rates_len) memcpy(__get_dynamic_array(supported_rates), params->link_sta_params.supported_rates, params->link_sta_params.supported_rates_len); if (params->ext_capab && params->ext_capab_len) memcpy(__get_dynamic_array(ext_capab), params->ext_capab, params->ext_capab_len); if (params->supported_channels && params->supported_channels_len) memcpy(__get_dynamic_array(supported_channels), params->supported_channels, params->supported_channels_len); if (params->supported_oper_classes && params->supported_oper_classes_len) memcpy(__get_dynamic_array(supported_oper_classes), params->supported_oper_classes, params->supported_oper_classes_len); __entry->max_sp = params->max_sp; __entry->capability = params->capability; __entry->opmode_notif = params->link_sta_params.opmode_notif; __entry->opmode_notif_used = params->link_sta_params.opmode_notif_used; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" ", station flags mask: 0x%x, station flags set: 0x%x, " "station modify mask: 0x%x, listen interval: %d, aid: %u, " "plink action: %u, plink state: %u, uapsd queues: %u, vlan:%s", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac, __entry->sta_flags_mask, __entry->sta_flags_set, __entry->sta_modify_mask, __entry->listen_interval, __entry->aid, __entry->plink_action, __entry->plink_state, __entry->uapsd_queues, __entry->vlan) ); DEFINE_EVENT(station_add_change, rdev_add_station, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac, struct station_parameters *params), TP_ARGS(wiphy, netdev, mac, 
params)
);

DEFINE_EVENT(station_add_change, rdev_change_station,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
		 struct station_parameters *params),
	TP_ARGS(wiphy, netdev, mac, params)
);

/* Event class: wiphy, netdev and one MAC address */
DECLARE_EVENT_CLASS(wiphy_netdev_mac_evt,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
	TP_ARGS(wiphy, netdev, mac),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(sta_mac)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(sta_mac, mac);
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac: %pM",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac)
);

/* Event class for station removal: MAC, subtype, reason code and link id */
DECLARE_EVENT_CLASS(station_del,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct station_del_parameters *params),
	TP_ARGS(wiphy, netdev, params),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(sta_mac)
		__field(u8, subtype)
		__field(u16, reason_code)
		__field(int, link_id)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(sta_mac, params->mac);
		__entry->subtype = params->subtype;
		__entry->reason_code = params->reason_code;
		__entry->link_id = params->link_id;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM"
		  ", subtype: %u, reason_code: %u, link_id: %d",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac,
		  __entry->subtype, __entry->reason_code,
		  __entry->link_id)
);

DEFINE_EVENT(station_del, rdev_del_station,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct station_del_parameters *params),
	TP_ARGS(wiphy, netdev, params)
);

DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_get_station,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
	TP_ARGS(wiphy, netdev, mac)
);

DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_del_mpath,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
	TP_ARGS(wiphy, netdev, mac)
);

/* Station dump step: records the index and the station MAC */
TRACE_EVENT(rdev_dump_station,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
		 u8 *mac),
	TP_ARGS(wiphy, netdev, _idx, mac),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(sta_mac)
__field(int, idx) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(sta_mac, mac); __entry->idx = _idx; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM, idx: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac, __entry->idx) ); TRACE_EVENT(rdev_return_int_station_info, TP_PROTO(struct wiphy *wiphy, int ret, struct station_info *sinfo), TP_ARGS(wiphy, ret, sinfo), TP_STRUCT__entry( WIPHY_ENTRY __field(int, ret) SINFO_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; __entry->ret = ret; SINFO_ASSIGN; ), TP_printk(WIPHY_PR_FMT ", returned %d" , WIPHY_PR_ARG, __entry->ret) ); DECLARE_EVENT_CLASS(mpath_evt, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst, u8 *next_hop), TP_ARGS(wiphy, netdev, dst, next_hop), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(dst) MAC_ENTRY(next_hop) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(dst, dst); MAC_ASSIGN(next_hop, next_hop); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: %pM, next hop: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->dst, __entry->next_hop) ); DEFINE_EVENT(mpath_evt, rdev_add_mpath, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst, u8 *next_hop), TP_ARGS(wiphy, netdev, dst, next_hop) ); DEFINE_EVENT(mpath_evt, rdev_change_mpath, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst, u8 *next_hop), TP_ARGS(wiphy, netdev, dst, next_hop) ); DEFINE_EVENT(mpath_evt, rdev_get_mpath, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst, u8 *next_hop), TP_ARGS(wiphy, netdev, dst, next_hop) ); TRACE_EVENT(rdev_dump_mpath, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx, u8 *dst, u8 *next_hop), TP_ARGS(wiphy, netdev, _idx, dst, next_hop), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(dst) MAC_ENTRY(next_hop) __field(int, idx) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(dst, dst); MAC_ASSIGN(next_hop, next_hop); __entry->idx = _idx; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: 
%d, destination: %pM, next hop: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, __entry->dst, __entry->next_hop) ); TRACE_EVENT(rdev_get_mpp, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst, u8 *mpp), TP_ARGS(wiphy, netdev, dst, mpp), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(dst) MAC_ENTRY(mpp) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(dst, dst); MAC_ASSIGN(mpp, mpp); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: %pM" ", mpp: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->dst, __entry->mpp) ); TRACE_EVENT(rdev_dump_mpp, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx, u8 *dst, u8 *mpp), TP_ARGS(wiphy, netdev, _idx, dst, mpp), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(dst) MAC_ENTRY(mpp) __field(int, idx) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(dst, dst); MAC_ASSIGN(mpp, mpp); __entry->idx = _idx; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: %pM, mpp: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, __entry->dst, __entry->mpp) ); TRACE_EVENT(rdev_return_int_mpath_info, TP_PROTO(struct wiphy *wiphy, int ret, struct mpath_info *pinfo), TP_ARGS(wiphy, ret, pinfo), TP_STRUCT__entry( WIPHY_ENTRY __field(int, ret) __field(int, generation) __field(u32, filled) __field(u32, frame_qlen) __field(u32, sn) __field(u32, metric) __field(u32, exptime) __field(u32, discovery_timeout) __field(u8, discovery_retries) __field(u8, flags) ), TP_fast_assign( WIPHY_ASSIGN; __entry->ret = ret; __entry->generation = pinfo->generation; __entry->filled = pinfo->filled; __entry->frame_qlen = pinfo->frame_qlen; __entry->sn = pinfo->sn; __entry->metric = pinfo->metric; __entry->exptime = pinfo->exptime; __entry->discovery_timeout = pinfo->discovery_timeout; __entry->discovery_retries = pinfo->discovery_retries; __entry->flags = pinfo->flags; ), TP_printk(WIPHY_PR_FMT ", returned %d. 
mpath info - generation: %d, " "filled: %u, frame qlen: %u, sn: %u, metric: %u, exptime: %u," " discovery timeout: %u, discovery retries: %u, flags: 0x%x", WIPHY_PR_ARG, __entry->ret, __entry->generation, __entry->filled, __entry->frame_qlen, __entry->sn, __entry->metric, __entry->exptime, __entry->discovery_timeout, __entry->discovery_retries, __entry->flags) ); TRACE_EVENT(rdev_return_int_mesh_config, TP_PROTO(struct wiphy *wiphy, int ret, struct mesh_config *conf), TP_ARGS(wiphy, ret, conf), TP_STRUCT__entry( WIPHY_ENTRY MESH_CFG_ENTRY __field(int, ret) ), TP_fast_assign( WIPHY_ASSIGN; MESH_CFG_ASSIGN; __entry->ret = ret; ), TP_printk(WIPHY_PR_FMT ", returned: %d", WIPHY_PR_ARG, __entry->ret) ); TRACE_EVENT(rdev_update_mesh_config, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u32 mask, const struct mesh_config *conf), TP_ARGS(wiphy, netdev, mask, conf), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MESH_CFG_ENTRY __field(u32, mask) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MESH_CFG_ASSIGN; __entry->mask = mask; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mask: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mask) ); TRACE_EVENT(rdev_join_mesh, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const struct mesh_config *conf, const struct mesh_setup *setup), TP_ARGS(wiphy, netdev, conf, setup), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MESH_CFG_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MESH_CFG_ASSIGN; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG) ); TRACE_EVENT(rdev_change_bss, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct bss_parameters *params), TP_ARGS(wiphy, netdev, params), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(int, use_cts_prot) __field(int, use_short_preamble) __field(int, use_short_slot_time) __field(int, ap_isolate) __field(int, ht_opmode) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; __entry->use_cts_prot = params->use_cts_prot; __entry->use_short_preamble = 
params->use_short_preamble; __entry->use_short_slot_time = params->use_short_slot_time; __entry->ap_isolate = params->ap_isolate; __entry->ht_opmode = params->ht_opmode; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", use cts prot: %d, " "use short preamble: %d, use short slot time: %d, " "ap isolate: %d, ht opmode: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->use_cts_prot, __entry->use_short_preamble, __entry->use_short_slot_time, __entry->ap_isolate, __entry->ht_opmode) ); TRACE_EVENT(rdev_inform_bss, TP_PROTO(struct wiphy *wiphy, struct cfg80211_bss *bss), TP_ARGS(wiphy, bss), TP_STRUCT__entry( WIPHY_ENTRY MAC_ENTRY(bssid) CHAN_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; MAC_ASSIGN(bssid, bss->bssid); CHAN_ASSIGN(bss->channel); ), TP_printk(WIPHY_PR_FMT ", %pM, " CHAN_PR_FMT, WIPHY_PR_ARG, __entry->bssid, CHAN_PR_ARG) ); TRACE_EVENT(rdev_set_txq_params, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct ieee80211_txq_params *params), TP_ARGS(wiphy, netdev, params), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(enum nl80211_ac, ac) __field(u16, txop) __field(u16, cwmin) __field(u16, cwmax) __field(u8, aifs) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; __entry->ac = params->ac; __entry->txop = params->txop; __entry->cwmin = params->cwmin; __entry->cwmax = params->cwmax; __entry->aifs = params->aifs; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", ac: %d, txop: %u, cwmin: %u, cwmax: %u, aifs: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ac, __entry->txop, __entry->cwmin, __entry->cwmax, __entry->aifs) ); TRACE_EVENT(rdev_libertas_set_mesh_channel, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct ieee80211_channel *chan), TP_ARGS(wiphy, netdev, chan), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY CHAN_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; CHAN_ASSIGN(chan); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_PR_ARG) ); TRACE_EVENT(rdev_set_monitor_channel, TP_PROTO(struct wiphy 
*wiphy, struct net_device *netdev,
		 struct cfg80211_chan_def *chandef),
	TP_ARGS(wiphy, netdev, chandef),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		CHAN_DEF_ENTRY
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		CHAN_DEF_ASSIGN(chandef);
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT,
		  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG)
);

/*
 * rdev_auth: records the wiphy, netdev, authentication type and the BSSID of
 * req->bss. When req->bss is NULL the recorded BSSID is zeroed.
 */
TRACE_EVENT(rdev_auth,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_auth_request *req),
	TP_ARGS(wiphy, netdev, req),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(bssid)
		__field(enum nl80211_auth_type, auth_type)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		if (req->bss)
			MAC_ASSIGN(bssid, req->bss->bssid);
		else
			eth_zero_addr(__entry->bssid);
		__entry->auth_type = req->auth_type;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", auth type: %d, bssid: %pM",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->auth_type,
		  __entry->bssid)
);

/*
 * rdev_assoc: records the wiphy, netdev, current and previous BSSID, MFP
 * usage, flags, the request's information elements, HT/VHT capabilities
 * and masks, the FILS KEK and nonces, and ext_mld_capa_ops. The dynamic
 * arrays are sized from the request (ie_len, fils_kek_len, and either
 * 2 * FILS_NONCE_LEN or 0 depending on whether fils_nonces is set).
 */
TRACE_EVENT(rdev_assoc,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_assoc_request *req),
	TP_ARGS(wiphy, netdev, req),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(bssid)
		MAC_ENTRY(prev_bssid)
		__field(bool, use_mfp)
		__field(u32, flags)
		__dynamic_array(u8, elements, req->ie_len)
		__array(u8, ht_capa, sizeof(struct ieee80211_ht_cap))
		__array(u8, ht_capa_mask, sizeof(struct ieee80211_ht_cap))
		__array(u8, vht_capa, sizeof(struct ieee80211_vht_cap))
		__array(u8, vht_capa_mask, sizeof(struct ieee80211_vht_cap))
		__dynamic_array(u8, fils_kek, req->fils_kek_len)
		__dynamic_array(u8, fils_nonces,
				req->fils_nonces ?
2 * FILS_NONCE_LEN : 0)
		__field(u16, ext_mld_capa_ops)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		/* No BSS in the request: record an all-zero BSSID. */
		if (req->bss)
			MAC_ASSIGN(bssid, req->bss->bssid);
		else
			eth_zero_addr(__entry->bssid);
		MAC_ASSIGN(prev_bssid, req->prev_bssid);
		__entry->use_mfp = req->use_mfp;
		__entry->flags = req->flags;
		/* Optional buffers are copied only when the pointer is set. */
		if (req->ie)
			memcpy(__get_dynamic_array(elements),
			       req->ie, req->ie_len);
		memcpy(__entry->ht_capa, &req->ht_capa, sizeof(req->ht_capa));
		memcpy(__entry->ht_capa_mask, &req->ht_capa_mask,
		       sizeof(req->ht_capa_mask));
		memcpy(__entry->vht_capa, &req->vht_capa, sizeof(req->vht_capa));
		memcpy(__entry->vht_capa_mask, &req->vht_capa_mask,
		       sizeof(req->vht_capa_mask));
		if (req->fils_kek)
			memcpy(__get_dynamic_array(fils_kek),
			       req->fils_kek, req->fils_kek_len);
		if (req->fils_nonces)
			memcpy(__get_dynamic_array(fils_nonces),
			       req->fils_nonces, 2 * FILS_NONCE_LEN);
		__entry->ext_mld_capa_ops = req->ext_mld_capa_ops;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM"
		  ", previous bssid: %pM, use mfp: %s, flags: 0x%x",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid,
		  __entry->prev_bssid, BOOL_TO_STR(__entry->use_mfp),
		  __entry->flags)
);

/*
 * rdev_deauth: records the wiphy, netdev, the BSSID from the request, the
 * reason code and the local_state_change flag (printed as an integer).
 */
TRACE_EVENT(rdev_deauth,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_deauth_request *req),
	TP_ARGS(wiphy, netdev, req),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(bssid)
		__field(u16, reason_code)
		__field(bool, local_state_change)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(bssid, req->bssid);
		__entry->reason_code = req->reason_code;
		__entry->local_state_change = req->local_state_change;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM, reason: %u, local_state_change:%d",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid,
		  __entry->reason_code, __entry->local_state_change)
);

/*
 * rdev_disassoc: records the wiphy, netdev, the AP address from the request
 * (stored in the "bssid" field), the reason code and the local_state_change
 * flag (printed through BOOL_TO_STR).
 */
TRACE_EVENT(rdev_disassoc,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_disassoc_request *req),
	TP_ARGS(wiphy, netdev, req),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(bssid)
		__field(u16,
reason_code)
		__field(bool, local_state_change)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(bssid, req->ap_addr);
		__entry->reason_code = req->reason_code;
		__entry->local_state_change = req->local_state_change;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM"
		  ", reason: %u, local state change: %s",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid,
		  __entry->reason_code,
		  BOOL_TO_STR(__entry->local_state_change))
);

/*
 * rdev_mgmt_tx_cancel_wait: records the wiphy, wireless device and the
 * 64-bit cookie passed by the caller.
 */
TRACE_EVENT(rdev_mgmt_tx_cancel_wait,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
		 u64 cookie),
	TP_ARGS(wiphy, wdev, cookie),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		WDEV_ENTRY
		__field(u64, cookie)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
		__entry->cookie = cookie;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu ",
		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
);

/*
 * rdev_set_power_mgmt: records the wiphy, netdev, the enabled flag and the
 * timeout. The flag is printed as "enabled" or "not enabled".
 */
TRACE_EVENT(rdev_set_power_mgmt,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 bool enabled, int timeout),
	TP_ARGS(wiphy, netdev, enabled, timeout),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(bool, enabled)
		__field(int, timeout)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->enabled = enabled;
		__entry->timeout = timeout;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %senabled, timeout: %d ",
		  WIPHY_PR_ARG, NETDEV_PR_ARG,
		  __entry->enabled ?
"" : "not ",
		  __entry->timeout)
);

/*
 * rdev_connect: records the wiphy, netdev, target BSSID, SSID, auth type,
 * privacy flag, WPA versions, flags and previous BSSID from "sme".
 */
TRACE_EVENT(rdev_connect,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_connect_params *sme),
	TP_ARGS(wiphy, netdev, sme),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(bssid)
		__array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
		__field(enum nl80211_auth_type, auth_type)
		__field(bool, privacy)
		__field(u32, wpa_versions)
		__field(u32, flags)
		MAC_ENTRY(prev_bssid)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(bssid, sme->bssid);
		/*
		 * Zero the whole buffer first: it is one byte larger than
		 * IEEE80211_MAX_SSID_LEN, so the copied SSID stays
		 * NUL-terminated for the %s in the format string (assuming
		 * ssid_len <= IEEE80211_MAX_SSID_LEN; not checked here).
		 */
		memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
		memcpy(__entry->ssid, sme->ssid, sme->ssid_len);
		__entry->auth_type = sme->auth_type;
		__entry->privacy = sme->privacy;
		__entry->wpa_versions = sme->crypto.wpa_versions;
		__entry->flags = sme->flags;
		MAC_ASSIGN(prev_bssid, sme->prev_bssid);
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM"
		  ", ssid: %s, auth type: %d, privacy: %s, wpa versions: %u, "
		  "flags: 0x%x, previous bssid: %pM",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, __entry->ssid,
		  __entry->auth_type, BOOL_TO_STR(__entry->privacy),
		  __entry->wpa_versions, __entry->flags, __entry->prev_bssid)
);

/*
 * rdev_update_connect_params: records the wiphy, netdev and the "changed"
 * value. Nothing from "sme" itself is stored.
 */
TRACE_EVENT(rdev_update_connect_params,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_connect_params *sme, u32 changed),
	TP_ARGS(wiphy, netdev, sme, changed),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(u32, changed)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->changed = changed;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", parameters changed: %u",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->changed)
);

/*
 * rdev_set_cqm_rssi_config: records the wiphy, netdev, the signed RSSI
 * threshold and the unsigned RSSI hysteresis.
 */
TRACE_EVENT(rdev_set_cqm_rssi_config,
	TP_PROTO(struct wiphy *wiphy,
		 struct net_device *netdev, s32 rssi_thold,
		 u32 rssi_hyst),
	TP_ARGS(wiphy, netdev, rssi_thold, rssi_hyst),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(s32, rssi_thold)
		__field(u32, rssi_hyst)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->rssi_thold = rssi_thold;
		__entry->rssi_hyst = rssi_hyst;
	),
	TP_printk(WIPHY_PR_FMT ", "
NETDEV_PR_FMT
		  ", rssi_thold: %d, rssi_hyst: %u ",
		  WIPHY_PR_ARG, NETDEV_PR_ARG,
		  __entry->rssi_thold, __entry->rssi_hyst)
);

/*
 * rdev_set_cqm_rssi_range_config: records the wiphy, netdev and the low and
 * high values passed by the caller.
 */
TRACE_EVENT(rdev_set_cqm_rssi_range_config,
	TP_PROTO(struct wiphy *wiphy,
		 struct net_device *netdev, s32 low, s32 high),
	TP_ARGS(wiphy, netdev, low, high),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(s32, rssi_low)
		__field(s32, rssi_high)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->rssi_low = low;
		__entry->rssi_high = high;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
		  ", range: %d - %d ",
		  WIPHY_PR_ARG, NETDEV_PR_ARG,
		  __entry->rssi_low, __entry->rssi_high)
);

/*
 * rdev_set_cqm_txe_config: records the wiphy, netdev and the rate, packet
 * count and interval values passed by the caller.
 */
TRACE_EVENT(rdev_set_cqm_txe_config,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u32 rate,
		 u32 pkts, u32 intvl),
	TP_ARGS(wiphy, netdev, rate, pkts, intvl),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(u32, rate)
		__field(u32, pkts)
		__field(u32, intvl)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->rate = rate;
		__entry->pkts = pkts;
		__entry->intvl = intvl;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", rate: %u, packets: %u, interval: %u",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->rate, __entry->pkts,
		  __entry->intvl)
);

/* rdev_disconnect: records the wiphy, netdev and the 16-bit reason code. */
TRACE_EVENT(rdev_disconnect,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 u16 reason_code),
	TP_ARGS(wiphy, netdev, reason_code),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(u16, reason_code)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->reason_code = reason_code;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", reason code: %u",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->reason_code)
);

/*
 * rdev_join_ibss: records the wiphy, netdev, BSSID and SSID from "params".
 * The SSID buffer is zeroed before the copy and is one byte larger than
 * IEEE80211_MAX_SSID_LEN.
 */
TRACE_EVENT(rdev_join_ibss,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_ibss_params *params),
	TP_ARGS(wiphy, netdev, params),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(bssid)
		__array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(bssid, params->bssid);
		memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
		memcpy(__entry->ssid,
params->ssid, params->ssid_len);
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM, ssid: %s",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, __entry->ssid)
);

/*
 * rdev_join_ocb: records the wiphy and netdev only. Nothing from "setup" is
 * stored.
 */
TRACE_EVENT(rdev_join_ocb,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 const struct ocb_setup *setup),
	TP_ARGS(wiphy, netdev, setup),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
		  WIPHY_PR_ARG, NETDEV_PR_ARG)
);

/*
 * rdev_set_wiphy_params: records the wiphy, the radio index and the
 * "changed" value passed by the caller.
 */
TRACE_EVENT(rdev_set_wiphy_params,
	TP_PROTO(struct wiphy *wiphy, int radio_idx, u32 changed),
	TP_ARGS(wiphy, radio_idx, changed),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(int, radio_idx)
		__field(u32, changed)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		__entry->radio_idx = radio_idx;
		__entry->changed = changed;
	),
	TP_printk(WIPHY_PR_FMT ", radio_idx: %d, changed: %u",
		  WIPHY_PR_ARG, __entry->radio_idx, __entry->changed)
);

/*
 * wiphy_wdev_link_evt: event class for events that take a wiphy, a wireless
 * device and an unsigned link ID.
 */
DECLARE_EVENT_CLASS(wiphy_wdev_link_evt,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
		 unsigned int link_id),
	TP_ARGS(wiphy, wdev, link_id),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		WDEV_ENTRY
		__field(unsigned int, link_id)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
		__entry->link_id = link_id;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %u",
		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id)
);

/*
 * rdev_get_tx_power: records the wiphy, wireless device, radio index and
 * link ID.
 */
TRACE_EVENT(rdev_get_tx_power,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
		 int radio_idx, unsigned int link_id),
	TP_ARGS(wiphy, wdev, radio_idx, link_id),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		WDEV_ENTRY
		__field(int, radio_idx)
		__field(unsigned int, link_id)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
		__entry->radio_idx = radio_idx;
		__entry->link_id = link_id;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
		  ", radio_idx: %d, link_id: %u",
		  WIPHY_PR_ARG, WDEV_PR_ARG,
		  __entry->radio_idx, __entry->link_id)
);

/*
 * rdev_set_tx_power: records the wiphy, wireless device, radio index, the
 * power setting type and the mbm value.
 */
TRACE_EVENT(rdev_set_tx_power,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
		 int radio_idx, enum nl80211_tx_power_setting type,
		 int mbm),
	TP_ARGS(wiphy, wdev, radio_idx,
type, mbm), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY __field(int, radio_idx) __field(enum nl80211_tx_power_setting, type) __field(int, mbm) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; __entry->radio_idx = radio_idx; __entry->type = type; __entry->mbm = mbm; ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", radio_idx: %d, type: %u, mbm: %d", WIPHY_PR_ARG, WDEV_PR_ARG, __entry->radio_idx, __entry->type, __entry->mbm) ); TRACE_EVENT(rdev_return_int_int, TP_PROTO(struct wiphy *wiphy, int func_ret, int func_fill), TP_ARGS(wiphy, func_ret, func_fill), TP_STRUCT__entry( WIPHY_ENTRY __field(int, func_ret) __field(int, func_fill) ), TP_fast_assign( WIPHY_ASSIGN; __entry->func_ret = func_ret; __entry->func_fill = func_fill; ), TP_printk(WIPHY_PR_FMT ", function returns: %d, function filled: %d", WIPHY_PR_ARG, __entry->func_ret, __entry->func_fill) ); #ifdef CONFIG_NL80211_TESTMODE TRACE_EVENT(rdev_testmode_cmd, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), TP_ARGS(wiphy, wdev), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; ), TP_printk(WIPHY_PR_FMT WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG) ); TRACE_EVENT(rdev_testmode_dump, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy), TP_STRUCT__entry( WIPHY_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; ), TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG) ); #endif /* CONFIG_NL80211_TESTMODE */ TRACE_EVENT(rdev_set_bitrate_mask, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, unsigned int link_id, const u8 *peer, const struct cfg80211_bitrate_mask *mask), TP_ARGS(wiphy, netdev, link_id, peer, mask), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(unsigned int, link_id) MAC_ENTRY(peer) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; __entry->link_id = link_id; MAC_ASSIGN(peer, peer); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, peer: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id, __entry->peer) ); TRACE_EVENT(rdev_update_mgmt_frame_registrations, TP_PROTO(struct wiphy *wiphy, 
struct wireless_dev *wdev,
		 struct mgmt_frame_regs *upd),
	TP_ARGS(wiphy, wdev, upd),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		WDEV_ENTRY
		__field(u16, global_stypes)
		__field(u16, interface_stypes)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
		__entry->global_stypes = upd->global_stypes;
		__entry->interface_stypes = upd->interface_stypes;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", global: 0x%.2x, intf: 0x%.2x",
		  WIPHY_PR_ARG, WDEV_PR_ARG,
		  __entry->global_stypes, __entry->interface_stypes)
);

/*
 * rdev_return_int_tx_rx: records the wiphy, an integer return value and the
 * tx and rx values passed by the caller.
 */
TRACE_EVENT(rdev_return_int_tx_rx,
	TP_PROTO(struct wiphy *wiphy, int ret, u32 tx, u32 rx),
	TP_ARGS(wiphy, ret, tx, rx),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(int, ret)
		__field(u32, tx)
		__field(u32, rx)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		__entry->ret = ret;
		__entry->tx = tx;
		__entry->rx = rx;
	),
	TP_printk(WIPHY_PR_FMT ", returned %d, tx: %u, rx: %u",
		  WIPHY_PR_ARG, __entry->ret, __entry->tx, __entry->rx)
);

/*
 * rdev_return_void_tx_rx: records the wiphy and the tx, tx_max, rx and
 * rx_max values passed by the caller.
 */
TRACE_EVENT(rdev_return_void_tx_rx,
	TP_PROTO(struct wiphy *wiphy, u32 tx, u32 tx_max,
		 u32 rx, u32 rx_max),
	TP_ARGS(wiphy, tx, tx_max, rx, rx_max),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(u32, tx)
		__field(u32, tx_max)
		__field(u32, rx)
		__field(u32, rx_max)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		__entry->tx = tx;
		__entry->tx_max = tx_max;
		__entry->rx = rx;
		__entry->rx_max = rx_max;
	),
	TP_printk(WIPHY_PR_FMT ", tx: %u, tx_max: %u, rx: %u, rx_max: %u ",
		  WIPHY_PR_ARG, __entry->tx, __entry->tx_max, __entry->rx,
		  __entry->rx_max)
);

/*
 * rdev_set_antenna: records the wiphy, radio index and the tx and rx values
 * passed by the caller.
 */
TRACE_EVENT(rdev_set_antenna,
	TP_PROTO(struct wiphy *wiphy, int radio_idx, u32 tx, u32 rx),
	TP_ARGS(wiphy, radio_idx, tx, rx),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(int, radio_idx)
		__field(u32, tx)
		__field(u32, rx)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		__entry->radio_idx = radio_idx;
		__entry->tx = tx;
		__entry->rx = rx;
	),
	TP_printk(WIPHY_PR_FMT ", radio_idx: %d, tx: %u, rx: %u ",
		  WIPHY_PR_ARG, __entry->radio_idx,
		  __entry->tx, __entry->rx)
);

/*
 * wiphy_netdev_id_evt: event class for events that take a wiphy, a netdev
 * and a 64-bit id.
 */
DECLARE_EVENT_CLASS(wiphy_netdev_id_evt,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id),
	TP_ARGS(wiphy, netdev, id),
TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(u64, id) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; __entry->id = id; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", id: %llu", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->id) ); DEFINE_EVENT(wiphy_netdev_id_evt, rdev_sched_scan_start, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id), TP_ARGS(wiphy, netdev, id) ); DEFINE_EVENT(wiphy_netdev_id_evt, rdev_sched_scan_stop, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id), TP_ARGS(wiphy, netdev, id) ); TRACE_EVENT(rdev_tdls_mgmt, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *peer, int link_id, u8 action_code, u8 dialog_token, u16 status_code, u32 peer_capability, bool initiator, const u8 *buf, size_t len), TP_ARGS(wiphy, netdev, peer, link_id, action_code, dialog_token, status_code, peer_capability, initiator, buf, len), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(peer) __field(int, link_id) __field(u8, action_code) __field(u8, dialog_token) __field(u16, status_code) __field(u32, peer_capability) __field(bool, initiator) __dynamic_array(u8, buf, len) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(peer, peer); __entry->link_id = link_id; __entry->action_code = action_code; __entry->dialog_token = dialog_token; __entry->status_code = status_code; __entry->peer_capability = peer_capability; __entry->initiator = initiator; memcpy(__get_dynamic_array(buf), buf, len); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM" ", link_id: %d, action_code: %u " "dialog_token: %u, status_code: %u, peer_capability: %u " "initiator: %s buf: %#.2x ", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->link_id, __entry->action_code, __entry->dialog_token, __entry->status_code, __entry->peer_capability, BOOL_TO_STR(__entry->initiator), ((u8 *)__get_dynamic_array(buf))[0]) ); TRACE_EVENT(rdev_dump_survey, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx), TP_ARGS(wiphy, netdev, _idx), TP_STRUCT__entry( 
WIPHY_ENTRY
		NETDEV_ENTRY
		__field(int, idx)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->idx = _idx;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx)
);

/*
 * rdev_return_int_survey_info: records the wiphy, an integer return value
 * and the channel, timing counters, "filled" mask and noise value copied
 * from "info".
 */
TRACE_EVENT(rdev_return_int_survey_info,
	TP_PROTO(struct wiphy *wiphy, int ret, struct survey_info *info),
	TP_ARGS(wiphy, ret, info),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		CHAN_ENTRY
		__field(int, ret)
		__field(u64, time)
		__field(u64, time_busy)
		__field(u64, time_ext_busy)
		__field(u64, time_rx)
		__field(u64, time_tx)
		__field(u64, time_scan)
		__field(u32, filled)
		__field(s8, noise)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		CHAN_ASSIGN(info->channel);
		__entry->ret = ret;
		__entry->time = info->time;
		__entry->time_busy = info->time_busy;
		__entry->time_ext_busy = info->time_ext_busy;
		__entry->time_rx = info->time_rx;
		__entry->time_tx = info->time_tx;
		__entry->time_scan = info->time_scan;
		__entry->filled = info->filled;
		__entry->noise = info->noise;
	),
	TP_printk(WIPHY_PR_FMT ", returned: %d, " CHAN_PR_FMT
		  ", channel time: %llu, channel time busy: %llu, "
		  "channel time extension busy: %llu, channel time rx: %llu, "
		  "channel time tx: %llu, scan time: %llu, filled: %u, noise: %d",
		  WIPHY_PR_ARG, __entry->ret, CHAN_PR_ARG,
		  __entry->time, __entry->time_busy,
		  __entry->time_ext_busy, __entry->time_rx,
		  __entry->time_tx, __entry->time_scan,
		  __entry->filled, __entry->noise)
);

/*
 * rdev_tdls_oper: records the wiphy, netdev, peer address and the TDLS
 * operation value.
 */
TRACE_EVENT(rdev_tdls_oper,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 u8 *peer, enum nl80211_tdls_operation oper),
	TP_ARGS(wiphy, netdev, peer, oper),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(peer)
		__field(enum nl80211_tdls_operation, oper)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(peer, peer);
		__entry->oper = oper;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, oper: %d",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->oper)
);

/*
 * rdev_pmksa: event class that records the wiphy, netdev and the BSSID from
 * the given cfg80211_pmksa. No other pmksa field is stored.
 */
DECLARE_EVENT_CLASS(rdev_pmksa,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_pmksa *pmksa),
	TP_ARGS(wiphy,
netdev, pmksa),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(bssid)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(bssid, pmksa->bssid);
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid)
);

/* rdev_probe_client: records the wiphy, netdev and peer address. */
TRACE_EVENT(rdev_probe_client,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 const u8 *peer),
	TP_ARGS(wiphy, netdev, peer),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(peer)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(peer, peer);
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer)
);

/* rdev_set_pmksa: rdev_pmksa instance. */
DEFINE_EVENT(rdev_pmksa, rdev_set_pmksa,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_pmksa *pmksa),
	TP_ARGS(wiphy, netdev, pmksa)
);

/* rdev_del_pmksa: rdev_pmksa instance. */
DEFINE_EVENT(rdev_pmksa, rdev_del_pmksa,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_pmksa *pmksa),
	TP_ARGS(wiphy, netdev, pmksa)
);

/*
 * rdev_remain_on_channel: records the wiphy, wireless device, channel and
 * the requested duration.
 */
TRACE_EVENT(rdev_remain_on_channel,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
		 struct ieee80211_channel *chan,
		 unsigned int duration),
	TP_ARGS(wiphy, wdev, chan, duration),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		WDEV_ENTRY
		CHAN_ENTRY
		__field(unsigned int, duration)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
		CHAN_ASSIGN(chan);
		__entry->duration = duration;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", " CHAN_PR_FMT ", duration: %u",
		  WIPHY_PR_ARG, WDEV_PR_ARG, CHAN_PR_ARG, __entry->duration)
);

/*
 * rdev_return_int_cookie: records the wiphy, an integer return value and a
 * 64-bit cookie.
 */
TRACE_EVENT(rdev_return_int_cookie,
	TP_PROTO(struct wiphy *wiphy, int ret, u64 cookie),
	TP_ARGS(wiphy, ret, cookie),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(int, ret)
		__field(u64, cookie)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		__entry->ret = ret;
		__entry->cookie = cookie;
	),
	TP_printk(WIPHY_PR_FMT ", returned %d, cookie: %llu",
		  WIPHY_PR_ARG, __entry->ret, __entry->cookie)
);

/*
 * rdev_cancel_remain_on_channel: records the wiphy, wireless device and the
 * 64-bit cookie.
 */
TRACE_EVENT(rdev_cancel_remain_on_channel,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
	TP_ARGS(wiphy, wdev, cookie),
	TP_STRUCT__entry(
WIPHY_ENTRY
		WDEV_ENTRY
		__field(u64, cookie)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
		__entry->cookie = cookie;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu",
		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
);

/*
 * rdev_mgmt_tx: records the wiphy, wireless device, channel, and the
 * offchan, wait, no_cck and dont_wait_for_ack fields of "params". The frame
 * contents are not stored.
 */
TRACE_EVENT(rdev_mgmt_tx,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
		 struct cfg80211_mgmt_tx_params *params),
	TP_ARGS(wiphy, wdev, params),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		WDEV_ENTRY
		CHAN_ENTRY
		__field(bool, offchan)
		__field(unsigned int, wait)
		__field(bool, no_cck)
		__field(bool, dont_wait_for_ack)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
		CHAN_ASSIGN(params->chan);
		__entry->offchan = params->offchan;
		__entry->wait = params->wait;
		__entry->no_cck = params->no_cck;
		__entry->dont_wait_for_ack = params->dont_wait_for_ack;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", " CHAN_PR_FMT ", offchan: %s,"
		  " wait: %u, no cck: %s, dont wait for ack: %s",
		  WIPHY_PR_ARG, WDEV_PR_ARG, CHAN_PR_ARG,
		  BOOL_TO_STR(__entry->offchan), __entry->wait,
		  BOOL_TO_STR(__entry->no_cck),
		  BOOL_TO_STR(__entry->dont_wait_for_ack))
);

/*
 * rdev_tx_control_port: records the wiphy, netdev, destination address,
 * protocol (stored big-endian, converted with be16_to_cpu() for printing),
 * the unencrypted flag and the link ID. "buf" and "len" are accepted but
 * not stored.
 */
TRACE_EVENT(rdev_tx_control_port,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 const u8 *buf, size_t len, const u8 *dest, __be16 proto,
		 bool unencrypted, int link_id),
	TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted, link_id),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(dest)
		__field(__be16, proto)
		__field(bool, unencrypted)
		__field(int, link_id)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(dest, dest);
		__entry->proto = proto;
		__entry->unencrypted = unencrypted;
		__entry->link_id = link_id;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM,"
		  " proto: 0x%x, unencrypted: %s, link: %d",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->dest,
		  be16_to_cpu(__entry->proto),
		  BOOL_TO_STR(__entry->unencrypted),
		  __entry->link_id)
);

/* rdev_set_noack_map: records the wiphy, netdev and the 16-bit noack map. */
TRACE_EVENT(rdev_set_noack_map,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 u16 noack_map),
	TP_ARGS(wiphy, netdev, noack_map),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
__field(u16, noack_map)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->noack_map = noack_map;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", noack_map: %u",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map)
);

/* rdev_get_channel: wiphy_wdev_link_evt instance. */
DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_channel,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
		 unsigned int link_id),
	TP_ARGS(wiphy, wdev, link_id)
);

/*
 * rdev_return_chandef: records the wiphy, an integer return value and a
 * channel definition. The caller's chandef is only used when ret == 0;
 * otherwise CHAN_DEF_ASSIGN is invoked with a NULL pointer.
 */
TRACE_EVENT(rdev_return_chandef,
	TP_PROTO(struct wiphy *wiphy, int ret,
		 struct cfg80211_chan_def *chandef),
	TP_ARGS(wiphy, ret, chandef),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(int, ret)
		CHAN_DEF_ENTRY
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		if (ret == 0)
			CHAN_DEF_ASSIGN(chandef);
		else
			CHAN_DEF_ASSIGN((struct cfg80211_chan_def *)NULL);
		__entry->ret = ret;
	),
	TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", ret: %d",
		  WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->ret)
);

/* rdev_start_p2p_device: wiphy_wdev_evt instance. */
DEFINE_EVENT(wiphy_wdev_evt, rdev_start_p2p_device,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
	TP_ARGS(wiphy, wdev)
);

/* rdev_stop_p2p_device: wiphy_wdev_evt instance. */
DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_p2p_device,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
	TP_ARGS(wiphy, wdev)
);

/*
 * rdev_start_nan: records the wiphy, wireless device and the master_pref
 * and bands fields of the NAN configuration.
 */
TRACE_EVENT(rdev_start_nan,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
		 struct cfg80211_nan_conf *conf),
	TP_ARGS(wiphy, wdev, conf),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		WDEV_ENTRY
		__field(u8, master_pref)
		__field(u8, bands)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
		__entry->master_pref = conf->master_pref;
		__entry->bands = conf->bands;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
		  ", master preference: %u, bands: 0x%0x",
		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
		  __entry->bands)
);

/*
 * rdev_nan_change_conf: records the wiphy, wireless device, the master_pref
 * and bands fields of the NAN configuration, and the "changes" value.
 */
TRACE_EVENT(rdev_nan_change_conf,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
		 struct cfg80211_nan_conf *conf, u32 changes),
	TP_ARGS(wiphy, wdev, conf, changes),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		WDEV_ENTRY
		__field(u8, master_pref)
		__field(u8, bands)
		__field(u32, changes)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
		__entry->master_pref = conf->master_pref;
		__entry->bands =
conf->bands;
		__entry->changes = changes;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
		  ", master preference: %u, bands: 0x%0x, changes: %x",
		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
		  __entry->bands, __entry->changes)
);

/* rdev_stop_nan: wiphy_wdev_evt instance. */
DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
	TP_ARGS(wiphy, wdev)
);

/*
 * rdev_add_nan_func: records the wiphy, wireless device and the type and
 * cookie of the NAN function. The final assignment is terminated with ';'
 * like every other statement in TP_fast_assign blocks in this file.
 */
TRACE_EVENT(rdev_add_nan_func,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
		 const struct cfg80211_nan_func *func),
	TP_ARGS(wiphy, wdev, func),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		WDEV_ENTRY
		__field(u8, func_type)
		__field(u64, cookie)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
		__entry->func_type = func->type;
		__entry->cookie = func->cookie;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type=%u, cookie=%llu",
		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->func_type,
		  __entry->cookie)
);

/*
 * rdev_del_nan_func: records the wiphy, wireless device and the 64-bit
 * cookie.
 */
TRACE_EVENT(rdev_del_nan_func,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
		 u64 cookie),
	TP_ARGS(wiphy, wdev, cookie),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		WDEV_ENTRY
		__field(u64, cookie)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
		__entry->cookie = cookie;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie=%llu",
		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
);

/*
 * rdev_set_mac_acl: records the wiphy, netdev and the ACL policy value from
 * "params". The field is a u32, so it is printed with %u.
 */
TRACE_EVENT(rdev_set_mac_acl,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_acl_data *params),
	TP_ARGS(wiphy, netdev, params),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(u32, acl_policy)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->acl_policy = params->acl_policy;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", acl policy: %u",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->acl_policy)
);

/*
 * rdev_update_ft_ies: records the wiphy, netdev, the "md" value and a copy
 * of the ie_len-byte information elements. Only "md" is printed.
 */
TRACE_EVENT(rdev_update_ft_ies,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_update_ft_ies_params *ftie),
	TP_ARGS(wiphy, netdev, ftie),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(u16, md)
		__dynamic_array(u8, ie, ftie->ie_len)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->md = ftie->md;
memcpy(__get_dynamic_array(ie), ftie->ie, ftie->ie_len);
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", md: 0x%x",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->md)
);

/*
 * rdev_crit_proto_start: records the wiphy, wireless device, the protocol
 * id (stored as u16) and the duration.
 */
TRACE_EVENT(rdev_crit_proto_start,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
		 enum nl80211_crit_proto_id protocol, u16 duration),
	TP_ARGS(wiphy, wdev, protocol, duration),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		WDEV_ENTRY
		__field(u16, proto)
		__field(u16, duration)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
		__entry->proto = protocol;
		__entry->duration = duration;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", proto=%x, duration=%u",
		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->proto, __entry->duration)
);

/* rdev_crit_proto_stop: records the wiphy and wireless device only. */
TRACE_EVENT(rdev_crit_proto_stop,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
	TP_ARGS(wiphy, wdev),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		WDEV_ENTRY
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		WDEV_ASSIGN;
	),
	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
		  WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * rdev_channel_switch: records the wiphy, netdev, target channel
 * definition, radar_required and block_tx flags, the count, copies of the
 * beacon and probe-response counter offset arrays, and the link ID. The
 * offset arrays are stored but not printed.
 */
TRACE_EVENT(rdev_channel_switch,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_csa_settings *params),
	TP_ARGS(wiphy, netdev, params),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		CHAN_DEF_ENTRY
		__field(bool, radar_required)
		__field(bool, block_tx)
		__field(u8, count)
		__dynamic_array(u16, bcn_ofs, params->n_counter_offsets_beacon)
		__dynamic_array(u16, pres_ofs, params->n_counter_offsets_presp)
		__field(u8, link_id)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		CHAN_DEF_ASSIGN(&params->chandef);
		__entry->radar_required = params->radar_required;
		__entry->block_tx = params->block_tx;
		__entry->count = params->count;
		memcpy(__get_dynamic_array(bcn_ofs),
		       params->counter_offsets_beacon,
		       params->n_counter_offsets_beacon * sizeof(u16));

		/* probe response offsets are optional */
		if (params->n_counter_offsets_presp)
			memcpy(__get_dynamic_array(pres_ofs),
			       params->counter_offsets_presp,
			       params->n_counter_offsets_presp * sizeof(u16));
		__entry->link_id = params->link_id;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", "
CHAN_DEF_PR_FMT
		  ", block_tx: %d, count: %u, radar_required: %d, link_id: %d",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
		  __entry->block_tx, __entry->count, __entry->radar_required,
		  __entry->link_id)
);

/*
 * rdev_set_qos_map: records the wiphy, netdev and the QoS map (through
 * QOS_MAP_ASSIGN). Only num_des is printed.
 */
TRACE_EVENT(rdev_set_qos_map,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_qos_map *qos_map),
	TP_ARGS(wiphy, netdev, qos_map),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		QOS_MAP_ENTRY
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		QOS_MAP_ASSIGN(qos_map);
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", num_des: %u",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->num_des)
);

/*
 * rdev_set_ap_chanwidth: records the wiphy, netdev, channel definition and
 * link ID. link_id is unsigned, so it is printed with %u.
 */
TRACE_EVENT(rdev_set_ap_chanwidth,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 unsigned int link_id,
		 struct cfg80211_chan_def *chandef),
	TP_ARGS(wiphy, netdev, link_id, chandef),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		CHAN_DEF_ENTRY
		__field(unsigned int, link_id)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		CHAN_DEF_ASSIGN(chandef);
		__entry->link_id = link_id;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
		  ", link:%u",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
		  __entry->link_id)
);

/*
 * rdev_add_tx_ts: records the wiphy, netdev, peer address, TSID, user
 * priority and admitted time.
 */
TRACE_EVENT(rdev_add_tx_ts,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time),
	TP_ARGS(wiphy, netdev, tsid, peer, user_prio, admitted_time),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(peer)
		__field(u8, tsid)
		__field(u8, user_prio)
		__field(u16, admitted_time)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(peer, peer);
		__entry->tsid = tsid;
		__entry->user_prio = user_prio;
		__entry->admitted_time = admitted_time;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, TSID %d, UP %d, time %d",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer,
		  __entry->tsid, __entry->user_prio, __entry->admitted_time)
);

/* rdev_del_tx_ts: records the wiphy, netdev, peer address and TSID. */
TRACE_EVENT(rdev_del_tx_ts,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 u8 tsid, const u8 *peer),
	TP_ARGS(wiphy, netdev, tsid, peer),
	TP_STRUCT__entry(
		WIPHY_ENTRY
NETDEV_ENTRY MAC_ENTRY(peer) __field(u8, tsid) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(peer, peer); __entry->tsid = tsid; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, TSID %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->tsid) ); TRACE_EVENT(rdev_tdls_channel_switch, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *addr, u8 oper_class, struct cfg80211_chan_def *chandef), TP_ARGS(wiphy, netdev, addr, oper_class, chandef), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(addr) __field(u8, oper_class) CHAN_DEF_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(addr, addr); CHAN_DEF_ASSIGN(chandef); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM" " oper class %d, " CHAN_DEF_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->addr, __entry->oper_class, CHAN_DEF_PR_ARG) ); TRACE_EVENT(rdev_tdls_cancel_channel_switch, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *addr), TP_ARGS(wiphy, netdev, addr), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(addr) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(addr, addr); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->addr) ); TRACE_EVENT(rdev_set_pmk, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_pmk_conf *pmk_conf), TP_ARGS(wiphy, netdev, pmk_conf), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(aa) __field(u8, pmk_len) __field(u8, pmk_r0_name_len) __dynamic_array(u8, pmk, pmk_conf->pmk_len) __dynamic_array(u8, pmk_r0_name, WLAN_PMK_NAME_LEN) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(aa, pmk_conf->aa); __entry->pmk_len = pmk_conf->pmk_len; __entry->pmk_r0_name_len = pmk_conf->pmk_r0_name ? WLAN_PMK_NAME_LEN : 0; memcpy(__get_dynamic_array(pmk), pmk_conf->pmk, pmk_conf->pmk_len); memcpy(__get_dynamic_array(pmk_r0_name), pmk_conf->pmk_r0_name, pmk_conf->pmk_r0_name ? 
WLAN_PMK_NAME_LEN : 0);
	),
	/* ", " precedes pmk_len so it does not run into the %pM address */
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM"
		  ", pmk_len=%u, pmk: %s pmk_r0_name: %s",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->aa, __entry->pmk_len,
		  __print_array(__get_dynamic_array(pmk),
				__get_dynamic_array_len(pmk), 1),
		  __entry->pmk_r0_name_len ?
		  __print_array(__get_dynamic_array(pmk_r0_name),
				__get_dynamic_array_len(pmk_r0_name), 1) : "")
);

/* del_pmk call: records and prints the authenticator address. */
TRACE_EVENT(rdev_del_pmk,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 const u8 *aa),
	TP_ARGS(wiphy, netdev, aa),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(aa)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(aa, aa);
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->aa)
);

/*
 * external_auth call: records bssid, SSID, status and MLD address.
 * The SSID buffer is zeroed first and is one byte larger than the maximum
 * SSID length, so the copy is always NUL-terminated for the %s below.
 */
TRACE_EVENT(rdev_external_auth,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_external_auth_params *params),
	TP_ARGS(wiphy, netdev, params),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(bssid)
		__array(u8, ssid, IEEE80211_MAX_SSID_LEN + 1)
		__field(u16, status)
		MAC_ENTRY(mld_addr)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(bssid, params->bssid);
		memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
		memcpy(__entry->ssid, params->ssid.ssid,
		       params->ssid.ssid_len);
		__entry->status = params->status;
		MAC_ASSIGN(mld_addr, params->mld_addr);
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM"
		  ", ssid: %s, status: %u, mld_addr: %pM",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid,
		  __entry->ssid, __entry->status, __entry->mld_addr)
);

TRACE_EVENT(rdev_start_radar_detection,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_chan_def *chandef,
		 u32 cac_time_ms, int link_id),
	TP_ARGS(wiphy, netdev, chandef, cac_time_ms, link_id),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		CHAN_DEF_ENTRY
		__field(u32, cac_time_ms)
		__field(int, link_id)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		CHAN_DEF_ASSIGN(chandef);
		__entry->cac_time_ms = cac_time_ms;
		__entry->link_id = link_id;
	),
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
		  ", cac_time_ms=%u, link_id=%d",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
		  __entry->cac_time_ms, __entry->link_id)
);

/*
 * set_mcast_rate call: copies the whole per-band rate array into the entry;
 * only the 2.4, 5, 6 and 60 GHz slots are printed.
 */
TRACE_EVENT(rdev_set_mcast_rate,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 int *mcast_rate),
	TP_ARGS(wiphy, netdev, mcast_rate),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__array(int, mcast_rate, NUM_NL80211_BANDS)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		memcpy(__entry->mcast_rate, mcast_rate,
		       sizeof(int) * NUM_NL80211_BANDS);
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", "
		  "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 6GHz=0x%x, 60GHz=0x%x]",
		  WIPHY_PR_ARG, NETDEV_PR_ARG,
		  __entry->mcast_rate[NL80211_BAND_2GHZ],
		  __entry->mcast_rate[NL80211_BAND_5GHZ],
		  __entry->mcast_rate[NL80211_BAND_6GHZ],
		  __entry->mcast_rate[NL80211_BAND_60GHZ])
);

TRACE_EVENT(rdev_set_coalesce,
	TP_PROTO(struct wiphy *wiphy, struct cfg80211_coalesce *coalesce),
	TP_ARGS(wiphy, coalesce),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(int, n_rules)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		__entry->n_rules = coalesce ?
coalesce->n_rules : 0;
	),
	TP_printk(WIPHY_PR_FMT ", n_rules=%d",
		  WIPHY_PR_ARG, __entry->n_rules)
);

/* abort_scan call: instance of the wiphy_wdev_evt class. */
DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
	TP_ARGS(wiphy, wdev)
);

/* set_multicast_to_unicast call: records and prints the enabled flag. */
TRACE_EVENT(rdev_set_multicast_to_unicast,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 const bool enabled),
	TP_ARGS(wiphy, netdev, enabled),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(bool, enabled)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->enabled = enabled;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s",
		  WIPHY_PR_ARG, NETDEV_PR_ARG,
		  BOOL_TO_STR(__entry->enabled))
);

/* get_txq_stats call: instance of the wiphy_wdev_evt class. */
DEFINE_EVENT(wiphy_wdev_evt, rdev_get_txq_stats,
	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
	TP_ARGS(wiphy, wdev)
);

/*
 * get_ftm_responder_stats call: records the counters from ftm_stats and
 * prints them after the wiphy and netdev.
 */
TRACE_EVENT(rdev_get_ftm_responder_stats,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 struct cfg80211_ftm_responder_stats *ftm_stats),
	TP_ARGS(wiphy, netdev, ftm_stats),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(u64, timestamp)
		__field(u32, success_num)
		__field(u32, partial_num)
		__field(u32, failed_num)
		__field(u32, asap_num)
		__field(u32, non_asap_num)
		__field(u64, duration)
		__field(u32, unknown_triggers)
		__field(u32, reschedule)
		__field(u32, out_of_window)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		/*
		 * Nothing from ftm_stats is copied into timestamp; store 0
		 * so the entry does not carry stale ring-buffer bytes. The
		 * field itself is kept to leave the event layout unchanged.
		 */
		__entry->timestamp = 0;
		__entry->success_num = ftm_stats->success_num;
		__entry->partial_num = ftm_stats->partial_num;
		__entry->failed_num = ftm_stats->failed_num;
		__entry->asap_num = ftm_stats->asap_num;
		__entry->non_asap_num = ftm_stats->non_asap_num;
		__entry->duration = ftm_stats->total_duration_ms;
		__entry->unknown_triggers = ftm_stats->unknown_triggers_num;
		__entry->reschedule = ftm_stats->reschedule_requests_num;
		__entry->out_of_window = ftm_stats->out_of_window_triggers_num;
	),
	/* separator after the wiphy name added; netdev is now printed too */
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
		", Ftm responder stats: success %u, partial %u, "
		"failed %u, asap %u, non asap %u, total duration %llu, unknown "
		"triggers %u, rescheduled %u, out of window %u",
		WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->success_num,
__entry->partial_num, __entry->failed_num, __entry->asap_num, __entry->non_asap_num, __entry->duration, __entry->unknown_triggers, __entry->reschedule, __entry->out_of_window) ); DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_start_pmsr, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie), TP_ARGS(wiphy, wdev, cookie) ); DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_abort_pmsr, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie), TP_ARGS(wiphy, wdev, cookie) ); TRACE_EVENT(rdev_set_fils_aad, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_fils_aad *fils_aad), TP_ARGS(wiphy, netdev, fils_aad), TP_STRUCT__entry(WIPHY_ENTRY NETDEV_ENTRY __array(u8, macaddr, ETH_ALEN) __field(u8, kek_len) ), TP_fast_assign(WIPHY_ASSIGN; NETDEV_ASSIGN; FILS_AAD_ASSIGN(fils_aad); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " FILS_AAD_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->macaddr, __entry->kek_len) ); TRACE_EVENT(rdev_update_owe_info, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_update_owe_info *owe_info), TP_ARGS(wiphy, netdev, owe_info), TP_STRUCT__entry(WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(peer) __field(u16, status) __dynamic_array(u8, ie, owe_info->ie_len)), TP_fast_assign(WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(peer, owe_info->peer); __entry->status = owe_info->status; memcpy(__get_dynamic_array(ie), owe_info->ie, owe_info->ie_len);), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM" " status %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->status) ); TRACE_EVENT(rdev_probe_mesh_link, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *dest, const u8 *buf, size_t len), TP_ARGS(wiphy, netdev, dest, buf, len), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(dest) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(dest, dest); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->dest) ); TRACE_EVENT(rdev_set_tid_config, 
TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_tid_config *tid_conf), TP_ARGS(wiphy, netdev, tid_conf), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(peer) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(peer, tid_conf->peer); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer) ); TRACE_EVENT(rdev_reset_tid_config, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *peer, u8 tids), TP_ARGS(wiphy, netdev, peer, tids), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(peer) __field(u8, tids) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(peer, peer); __entry->tids = tids; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM, tids: 0x%x", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->tids) ); TRACE_EVENT(rdev_set_sar_specs, TP_PROTO(struct wiphy *wiphy, struct cfg80211_sar_specs *sar), TP_ARGS(wiphy, sar), TP_STRUCT__entry( WIPHY_ENTRY __field(u16, type) __field(u16, num) ), TP_fast_assign( WIPHY_ASSIGN; __entry->type = sar->type; __entry->num = sar->num_sub_specs; ), TP_printk(WIPHY_PR_FMT ", Set type:%d, num_specs:%d", WIPHY_PR_ARG, __entry->type, __entry->num) ); TRACE_EVENT(rdev_color_change, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_color_change_settings *params), TP_ARGS(wiphy, netdev, params), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(u8, count) __field(u16, bcn_ofs) __field(u16, pres_ofs) __field(u8, link_id) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; __entry->count = params->count; __entry->bcn_ofs = params->counter_offset_beacon; __entry->pres_ofs = params->counter_offset_presp; __entry->link_id = params->link_id; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", count: %u, link_id: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->count, __entry->link_id) ); TRACE_EVENT(rdev_set_radar_background, TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef), TP_ARGS(wiphy, chandef), 
TP_STRUCT__entry( WIPHY_ENTRY CHAN_DEF_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; CHAN_DEF_ASSIGN(chandef) ), TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT, WIPHY_PR_ARG, CHAN_DEF_PR_ARG) ); DEFINE_EVENT(wiphy_wdev_link_evt, rdev_add_intf_link, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id), TP_ARGS(wiphy, wdev, link_id) ); DEFINE_EVENT(wiphy_wdev_link_evt, rdev_del_intf_link, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id), TP_ARGS(wiphy, wdev, link_id) ); TRACE_EVENT(rdev_del_link_station, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct link_station_del_parameters *params), TP_ARGS(wiphy, netdev, params), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __array(u8, mld_mac, 6) __field(u32, link_id) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; memset(__entry->mld_mac, 0, 6); if (params->mld_mac) memcpy(__entry->mld_mac, params->mld_mac, 6); __entry->link_id = params->link_id; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" ", link id: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mld_mac, __entry->link_id) ); TRACE_EVENT(rdev_set_hw_timestamp, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_set_hw_timestamp *hwts), TP_ARGS(wiphy, netdev, hwts), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(macaddr) __field(bool, enable) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(macaddr, hwts->macaddr); __entry->enable = hwts->enable; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac %pM, enable: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->macaddr, __entry->enable) ); TRACE_EVENT(rdev_set_ttlm, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_ttlm_params *params), TP_ARGS(wiphy, netdev, params), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __array(u8, dlink, sizeof(u16) * 8) __array(u8, ulink, sizeof(u16) * 8) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; memcpy(__entry->dlink, params->dlink, sizeof(params->dlink)); 
memcpy(__entry->ulink, params->ulink, sizeof(params->ulink));
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
		  WIPHY_PR_ARG, NETDEV_PR_ARG)
);

/* set_epcs call: records the boolean value, printed as "config=%u". */
TRACE_EVENT(rdev_set_epcs,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
		 bool val),
	TP_ARGS(wiphy, netdev, val),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		__field(bool, val)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		__entry->val = val;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", config=%u",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->val)
);

/*************************************************************
 *	     cfg80211 exported functions traces		     *
 *************************************************************/

/* Generic boolean return value, printed through BOOL_TO_STR. */
TRACE_EVENT(cfg80211_return_bool,
	TP_PROTO(bool ret),
	TP_ARGS(ret),
	TP_STRUCT__entry(
		__field(bool, ret)
	),
	TP_fast_assign(
		__entry->ret = ret;
	),
	TP_printk("returned %s", BOOL_TO_STR(__entry->ret))
);

/* Event class: a netdev plus one MAC address. */
DECLARE_EVENT_CLASS(cfg80211_netdev_mac_evt,
	TP_PROTO(struct net_device *netdev, const u8 *macaddr),
	TP_ARGS(netdev, macaddr),
	TP_STRUCT__entry(
		NETDEV_ENTRY
		MAC_ENTRY(macaddr)
	),
	TP_fast_assign(
		NETDEV_ASSIGN;
		MAC_ASSIGN(macaddr, macaddr);
	),
	TP_printk(NETDEV_PR_FMT ", mac: %pM",
		  NETDEV_PR_ARG, __entry->macaddr)
);

DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_notify_new_peer_candidate,
	TP_PROTO(struct net_device *netdev, const u8 *macaddr),
	TP_ARGS(netdev, macaddr)
);

/*
 * Association response received: the recorded address is the AP MLD
 * address when set, otherwise the BSSID of links[0].
 */
TRACE_EVENT(cfg80211_send_rx_assoc,
	TP_PROTO(struct net_device *netdev,
		 const struct cfg80211_rx_assoc_resp_data *data),
	TP_ARGS(netdev, data),
	TP_STRUCT__entry(
		NETDEV_ENTRY
		MAC_ENTRY(ap_addr)
	),
	TP_fast_assign(
		NETDEV_ASSIGN;
		MAC_ASSIGN(ap_addr,
			   data->ap_mld_addr ?: data->links[0].bss->bssid);
	),
	TP_printk(NETDEV_PR_FMT ", %pM",
		  NETDEV_PR_ARG, __entry->ap_addr)
);

DECLARE_EVENT_CLASS(netdev_frame_event,
	TP_PROTO(struct net_device *netdev, const u8 *buf, int len),
	TP_ARGS(netdev, buf, len),
	TP_STRUCT__entry(
		NETDEV_ENTRY
		__dynamic_array(u8, frame, len)
	),
	TP_fast_assign(
		NETDEV_ASSIGN;
		memcpy(__get_dynamic_array(frame), buf, len);
	),
TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x",
		  NETDEV_PR_ARG,
		  le16_to_cpup((__le16 *)__get_dynamic_array(frame)))
);

DEFINE_EVENT(netdev_frame_event, cfg80211_rx_unprot_mlme_mgmt,
	TP_PROTO(struct net_device *netdev, const u8 *buf, int len),
	TP_ARGS(netdev, buf, len)
);

DEFINE_EVENT(netdev_frame_event, cfg80211_rx_mlme_mgmt,
	TP_PROTO(struct net_device *netdev, const u8 *buf, int len),
	TP_ARGS(netdev, buf, len)
);

/*
 * MLME management frame transmitted: copies the frame and the reconnect
 * flag; prints the first 16 bits of the frame as ftype.
 */
TRACE_EVENT(cfg80211_tx_mlme_mgmt,
	TP_PROTO(struct net_device *netdev, const u8 *buf, int len,
		 bool reconnect),
	TP_ARGS(netdev, buf, len, reconnect),
	TP_STRUCT__entry(
		NETDEV_ENTRY
		__dynamic_array(u8, frame, len)
		__field(int, reconnect)
	),
	TP_fast_assign(
		NETDEV_ASSIGN;
		memcpy(__get_dynamic_array(frame), buf, len);
		__entry->reconnect = reconnect;
	),
	TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x reconnect:%d",
		  NETDEV_PR_ARG,
		  le16_to_cpup((__le16 *)__get_dynamic_array(frame)),
		  __entry->reconnect)
);

/* Event class: a netdev plus one MAC address (argument named "mac"). */
DECLARE_EVENT_CLASS(netdev_mac_evt,
	TP_PROTO(struct net_device *netdev, const u8 *mac),
	TP_ARGS(netdev, mac),
	TP_STRUCT__entry(
		NETDEV_ENTRY
		MAC_ENTRY(mac)
	),
	TP_fast_assign(
		NETDEV_ASSIGN;
		MAC_ASSIGN(mac, mac)
	),
	TP_printk(NETDEV_PR_FMT ", mac: %pM",
		  NETDEV_PR_ARG, __entry->mac)
);

DEFINE_EVENT(netdev_mac_evt, cfg80211_send_auth_timeout,
	TP_PROTO(struct net_device *netdev, const u8 *mac),
	TP_ARGS(netdev, mac)
);

/*
 * Association failure: the recorded address is the AP MLD address when
 * set, otherwise the BSSID of bss[0]; the timeout flag is recorded too.
 */
TRACE_EVENT(cfg80211_send_assoc_failure,
	TP_PROTO(struct net_device *netdev,
		 struct cfg80211_assoc_failure *data),
	TP_ARGS(netdev, data),
	TP_STRUCT__entry(
		NETDEV_ENTRY
		MAC_ENTRY(ap_addr)
		__field(bool, timeout)
	),
	TP_fast_assign(
		NETDEV_ASSIGN;
		MAC_ASSIGN(ap_addr,
			   data->ap_mld_addr ?: data->bss[0]->bssid);
		__entry->timeout = data->timeout;
	),
	TP_printk(NETDEV_PR_FMT ", mac: %pM, timeout: %d",
		  NETDEV_PR_ARG, __entry->ap_addr, __entry->timeout)
);

TRACE_EVENT(cfg80211_michael_mic_failure,
	TP_PROTO(struct net_device *netdev, const u8 *addr,
		 enum nl80211_key_type key_type, int key_id, const u8 *tsc),
	TP_ARGS(netdev, addr, key_type, key_id, tsc),
	TP_STRUCT__entry(
NETDEV_ENTRY MAC_ENTRY(addr) __field(enum nl80211_key_type, key_type) __field(int, key_id) __array(u8, tsc, 6) ), TP_fast_assign( NETDEV_ASSIGN; MAC_ASSIGN(addr, addr); __entry->key_type = key_type; __entry->key_id = key_id; if (tsc) memcpy(__entry->tsc, tsc, 6); ), TP_printk(NETDEV_PR_FMT ", %pM, key type: %d, key id: %d, tsc: %pm", NETDEV_PR_ARG, __entry->addr, __entry->key_type, __entry->key_id, __entry->tsc) ); TRACE_EVENT(cfg80211_ready_on_channel, TP_PROTO(struct wireless_dev *wdev, u64 cookie, struct ieee80211_channel *chan, unsigned int duration), TP_ARGS(wdev, cookie, chan, duration), TP_STRUCT__entry( WDEV_ENTRY __field(u64, cookie) CHAN_ENTRY __field(unsigned int, duration) ), TP_fast_assign( WDEV_ASSIGN; __entry->cookie = cookie; CHAN_ASSIGN(chan); __entry->duration = duration; ), TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT ", duration: %u", WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG, __entry->duration) ); TRACE_EVENT(cfg80211_ready_on_channel_expired, TP_PROTO(struct wireless_dev *wdev, u64 cookie, struct ieee80211_channel *chan), TP_ARGS(wdev, cookie, chan), TP_STRUCT__entry( WDEV_ENTRY __field(u64, cookie) CHAN_ENTRY ), TP_fast_assign( WDEV_ASSIGN; __entry->cookie = cookie; CHAN_ASSIGN(chan); ), TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT, WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG) ); TRACE_EVENT(cfg80211_tx_mgmt_expired, TP_PROTO(struct wireless_dev *wdev, u64 cookie, struct ieee80211_channel *chan), TP_ARGS(wdev, cookie, chan), TP_STRUCT__entry( WDEV_ENTRY __field(u64, cookie) CHAN_ENTRY ), TP_fast_assign( WDEV_ASSIGN; __entry->cookie = cookie; CHAN_ASSIGN(chan); ), TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT, WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG) ); TRACE_EVENT(cfg80211_new_sta, TP_PROTO(struct net_device *netdev, const u8 *mac_addr, struct station_info *sinfo), TP_ARGS(netdev, mac_addr, sinfo), TP_STRUCT__entry( NETDEV_ENTRY MAC_ENTRY(mac_addr) SINFO_ENTRY ), TP_fast_assign( NETDEV_ASSIGN; MAC_ASSIGN(mac_addr, 
mac_addr);
		SINFO_ASSIGN;
	),
	TP_printk(NETDEV_PR_FMT ", %pM",
		  NETDEV_PR_ARG, __entry->mac_addr)
);

DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_del_sta,
	TP_PROTO(struct net_device *netdev, const u8 *macaddr),
	TP_ARGS(netdev, macaddr)
);

/* Management frame received: records frequency and signal from info. */
TRACE_EVENT(cfg80211_rx_mgmt,
	TP_PROTO(struct wireless_dev *wdev, struct cfg80211_rx_info *info),
	TP_ARGS(wdev, info),
	TP_STRUCT__entry(
		WDEV_ENTRY
		__field(int, freq)
		__field(int, sig_dbm)
	),
	TP_fast_assign(
		WDEV_ASSIGN;
		__entry->freq = info->freq;
		__entry->sig_dbm = info->sig_dbm;
	),
	TP_printk(WDEV_PR_FMT ", freq: "KHZ_F", sig dbm: %d",
		  WDEV_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm)
);

/* Management TX status: records cookie and ack flag. */
TRACE_EVENT(cfg80211_mgmt_tx_status,
	TP_PROTO(struct wireless_dev *wdev, u64 cookie, bool ack),
	TP_ARGS(wdev, cookie, ack),
	TP_STRUCT__entry(
		WDEV_ENTRY
		__field(u64, cookie)
		__field(bool, ack)
	),
	TP_fast_assign(
		WDEV_ASSIGN;
		__entry->cookie = cookie;
		__entry->ack = ack;
	),
	TP_printk(WDEV_PR_FMT", cookie: %llu, ack: %s",
		  WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack))
);

/* Control-port TX status: records cookie and ack flag. */
TRACE_EVENT(cfg80211_control_port_tx_status,
	TP_PROTO(struct wireless_dev *wdev, u64 cookie, bool ack),
	TP_ARGS(wdev, cookie, ack),
	TP_STRUCT__entry(
		WDEV_ENTRY
		__field(u64, cookie)
		__field(bool, ack)
	),
	TP_fast_assign(
		WDEV_ASSIGN;
		__entry->cookie = cookie;
		__entry->ack = ack;
	),
	TP_printk(WDEV_PR_FMT", cookie: %llu, ack: %s",
		  WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack))
);

/*
 * Control-port frame received: records skb length, source address from
 * the Ethernet header, protocol (converted from big endian), the
 * unencrypted flag and the link id.
 */
TRACE_EVENT(cfg80211_rx_control_port,
	TP_PROTO(struct net_device *netdev, struct sk_buff *skb,
		 bool unencrypted, int link_id),
	TP_ARGS(netdev, skb, unencrypted, link_id),
	TP_STRUCT__entry(
		NETDEV_ENTRY
		__field(int, len)
		MAC_ENTRY(from)
		__field(u16, proto)
		__field(bool, unencrypted)
		__field(int, link_id)
	),
	TP_fast_assign(
		NETDEV_ASSIGN;
		__entry->len = skb->len;
		MAC_ASSIGN(from, eth_hdr(skb)->h_source);
		__entry->proto = be16_to_cpu(skb->protocol);
		__entry->unencrypted = unencrypted;
		__entry->link_id = link_id;
	),
	TP_printk(NETDEV_PR_FMT ", len=%d, %pM, proto: 0x%x, unencrypted: %s, link: %d",
		  NETDEV_PR_ARG, __entry->len, __entry->from,
		  __entry->proto, BOOL_TO_STR(__entry->unencrypted),
		  __entry->link_id)
);

/* CQM RSSI notification: records the threshold event and RSSI level. */
TRACE_EVENT(cfg80211_cqm_rssi_notify,
	TP_PROTO(struct net_device *netdev,
		 enum nl80211_cqm_rssi_threshold_event rssi_event,
		 s32 rssi_level),
	TP_ARGS(netdev, rssi_event, rssi_level),
	TP_STRUCT__entry(
		NETDEV_ENTRY
		__field(enum nl80211_cqm_rssi_threshold_event, rssi_event)
		__field(s32, rssi_level)
	),
	TP_fast_assign(
		NETDEV_ASSIGN;
		__entry->rssi_event = rssi_event;
		__entry->rssi_level = rssi_level;
	),
	TP_printk(NETDEV_PR_FMT ", rssi event: %d, level: %d",
		  NETDEV_PR_ARG, __entry->rssi_event, __entry->rssi_level)
);

/* reg_can_beacon query: records chandef, iftype and both flag masks. */
TRACE_EVENT(cfg80211_reg_can_beacon,
	TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef,
		 enum nl80211_iftype iftype, u32 prohibited_flags,
		 u32 permitting_flags),
	TP_ARGS(wiphy, chandef, iftype, prohibited_flags, permitting_flags),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		CHAN_DEF_ENTRY
		__field(enum nl80211_iftype, iftype)
		__field(u32, prohibited_flags)
		__field(u32, permitting_flags)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		CHAN_DEF_ASSIGN(chandef);
		__entry->iftype = iftype;
		__entry->prohibited_flags = prohibited_flags;
		__entry->permitting_flags = permitting_flags;
	),
	TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d prohibited_flags=0x%x permitting_flags=0x%x",
		  WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype,
		  __entry->prohibited_flags, __entry->permitting_flags)
);

/* Channel switch completed: records chandef and link id. */
TRACE_EVENT(cfg80211_ch_switch_notify,
	TP_PROTO(struct net_device *netdev,
		 struct cfg80211_chan_def *chandef,
		 unsigned int link_id),
	TP_ARGS(netdev, chandef, link_id),
	TP_STRUCT__entry(
		NETDEV_ENTRY
		CHAN_DEF_ENTRY
		__field(unsigned int, link_id)
	),
	TP_fast_assign(
		NETDEV_ASSIGN;
		CHAN_DEF_ASSIGN(chandef);
		__entry->link_id = link_id;
	),
	TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d",
		  NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id)
);

TRACE_EVENT(cfg80211_ch_switch_started_notify,
	TP_PROTO(struct net_device *netdev,
		 struct cfg80211_chan_def *chandef,
		 unsigned int
link_id), TP_ARGS(netdev, chandef, link_id), TP_STRUCT__entry( NETDEV_ENTRY CHAN_DEF_ENTRY __field(unsigned int, link_id) ), TP_fast_assign( NETDEV_ASSIGN; CHAN_DEF_ASSIGN(chandef); __entry->link_id = link_id; ), TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%d", NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id) ); TRACE_EVENT(cfg80211_radar_event, TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, bool offchan), TP_ARGS(wiphy, chandef, offchan), TP_STRUCT__entry( WIPHY_ENTRY CHAN_DEF_ENTRY __field(bool, offchan) ), TP_fast_assign( WIPHY_ASSIGN; CHAN_DEF_ASSIGN(chandef); __entry->offchan = offchan; ), TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", offchan %d", WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->offchan) ); TRACE_EVENT(cfg80211_cac_event, TP_PROTO(struct net_device *netdev, enum nl80211_radar_event evt, unsigned int link_id), TP_ARGS(netdev, evt, link_id), TP_STRUCT__entry( NETDEV_ENTRY __field(enum nl80211_radar_event, evt) __field(unsigned int, link_id) ), TP_fast_assign( NETDEV_ASSIGN; __entry->evt = evt; __entry->link_id = link_id; ), TP_printk(NETDEV_PR_FMT ", event: %d, link_id=%u", NETDEV_PR_ARG, __entry->evt, __entry->link_id) ); DECLARE_EVENT_CLASS(cfg80211_rx_evt, TP_PROTO(struct net_device *netdev, const u8 *addr, int link_id), TP_ARGS(netdev, addr, link_id), TP_STRUCT__entry( NETDEV_ENTRY MAC_ENTRY(addr) __field(int, link_id) ), TP_fast_assign( NETDEV_ASSIGN; MAC_ASSIGN(addr, addr); __entry->link_id = link_id; ), TP_printk(NETDEV_PR_FMT ", %pM, link_id:%d", NETDEV_PR_ARG, __entry->addr, __entry->link_id) ); DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_spurious_frame, TP_PROTO(struct net_device *netdev, const u8 *addr, int link_id), TP_ARGS(netdev, addr, link_id) ); DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_unexpected_4addr_frame, TP_PROTO(struct net_device *netdev, const u8 *addr, int link_id), TP_ARGS(netdev, addr, link_id) ); TRACE_EVENT(cfg80211_ibss_joined, TP_PROTO(struct net_device *netdev, const u8 *bssid, struct 
ieee80211_channel *channel), TP_ARGS(netdev, bssid, channel), TP_STRUCT__entry( NETDEV_ENTRY MAC_ENTRY(bssid) CHAN_ENTRY ), TP_fast_assign( NETDEV_ASSIGN; MAC_ASSIGN(bssid, bssid); CHAN_ASSIGN(channel); ), TP_printk(NETDEV_PR_FMT ", bssid: %pM, " CHAN_PR_FMT, NETDEV_PR_ARG, __entry->bssid, CHAN_PR_ARG) ); TRACE_EVENT(cfg80211_probe_status, TP_PROTO(struct net_device *netdev, const u8 *addr, u64 cookie, bool acked), TP_ARGS(netdev, addr, cookie, acked), TP_STRUCT__entry( NETDEV_ENTRY MAC_ENTRY(addr) __field(u64, cookie) __field(bool, acked) ), TP_fast_assign( NETDEV_ASSIGN; MAC_ASSIGN(addr, addr); __entry->cookie = cookie; __entry->acked = acked; ), TP_printk(NETDEV_PR_FMT " addr:%pM, cookie: %llu, acked: %s", NETDEV_PR_ARG, __entry->addr, __entry->cookie, BOOL_TO_STR(__entry->acked)) ); TRACE_EVENT(cfg80211_cqm_pktloss_notify, TP_PROTO(struct net_device *netdev, const u8 *peer, u32 num_packets), TP_ARGS(netdev, peer, num_packets), TP_STRUCT__entry( NETDEV_ENTRY MAC_ENTRY(peer) __field(u32, num_packets) ), TP_fast_assign( NETDEV_ASSIGN; MAC_ASSIGN(peer, peer); __entry->num_packets = num_packets; ), TP_printk(NETDEV_PR_FMT ", peer: %pM, num of lost packets: %u", NETDEV_PR_ARG, __entry->peer, __entry->num_packets) ); DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_gtk_rekey_notify, TP_PROTO(struct net_device *netdev, const u8 *macaddr), TP_ARGS(netdev, macaddr) ); TRACE_EVENT(cfg80211_pmksa_candidate_notify, TP_PROTO(struct net_device *netdev, int index, const u8 *bssid, bool preauth), TP_ARGS(netdev, index, bssid, preauth), TP_STRUCT__entry( NETDEV_ENTRY __field(int, index) MAC_ENTRY(bssid) __field(bool, preauth) ), TP_fast_assign( NETDEV_ASSIGN; __entry->index = index; MAC_ASSIGN(bssid, bssid); __entry->preauth = preauth; ), TP_printk(NETDEV_PR_FMT ", index:%d, bssid: %pM, pre auth: %s", NETDEV_PR_ARG, __entry->index, __entry->bssid, BOOL_TO_STR(__entry->preauth)) ); TRACE_EVENT(cfg80211_report_obss_beacon, TP_PROTO(struct wiphy *wiphy, const u8 *frame, size_t len, 
int freq, int sig_dbm),
	TP_ARGS(wiphy, frame, len, freq, sig_dbm),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		__field(int, freq)
		__field(int, sig_dbm)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		__entry->freq = freq;
		__entry->sig_dbm = sig_dbm;
	),
	TP_printk(WIPHY_PR_FMT ", freq: "KHZ_F", sig_dbm: %d",
		  WIPHY_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm)
);

/* TDLS operation request: records peer, operation and reason code. */
TRACE_EVENT(cfg80211_tdls_oper_request,
	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *peer,
		 enum nl80211_tdls_operation oper, u16 reason_code),
	TP_ARGS(wiphy, netdev, peer, oper, reason_code),
	TP_STRUCT__entry(
		WIPHY_ENTRY
		NETDEV_ENTRY
		MAC_ENTRY(peer)
		__field(enum nl80211_tdls_operation, oper)
		__field(u16, reason_code)
	),
	TP_fast_assign(
		WIPHY_ASSIGN;
		NETDEV_ASSIGN;
		MAC_ASSIGN(peer, peer);
		__entry->oper = oper;
		__entry->reason_code = reason_code;
	),
	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM, oper: %d, reason_code %u",
		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->oper,
		  __entry->reason_code)
);

/*
 * Scan finished. Both request and info may be NULL, so every fixed-size
 * field is given a default first; the conditional blocks then overwrite
 * the defaults with real values when their source is present.
 */
TRACE_EVENT(cfg80211_scan_done,
	TP_PROTO(struct cfg80211_scan_request_int *request,
		 struct cfg80211_scan_info *info),
	TP_ARGS(request, info),
	TP_STRUCT__entry(
		__field(u32, n_channels)
		__dynamic_array(u8, ie, request ? request->req.ie_len : 0)
		__array(u32, rates, NUM_NL80211_BANDS)
		__field(u32, wdev_id)
		MAC_ENTRY(wiphy_mac)
		__field(bool, no_cck)
		__field(bool, aborted)
		__field(u64, scan_start_tsf)
		MAC_ENTRY(tsf_bssid)
	),
	TP_fast_assign(
		/* defaults: keep stale ring-buffer bytes out of the entry */
		__entry->n_channels = 0;
		memset(__entry->rates, 0, sizeof(__entry->rates));
		__entry->wdev_id = 0;
		memset(__entry->wiphy_mac, 0, sizeof(__entry->wiphy_mac));
		__entry->no_cck = false;
		__entry->aborted = false;
		__entry->scan_start_tsf = 0;
		memset(__entry->tsf_bssid, 0, sizeof(__entry->tsf_bssid));
		if (request) {
			__entry->n_channels = request->req.n_channels;
			memcpy(__get_dynamic_array(ie), request->req.ie,
			       request->req.ie_len);
			/*
			 * rates holds NUM_NL80211_BANDS u32 elements, so
			 * copy the full array size rather than
			 * NUM_NL80211_BANDS bytes.
			 */
			memcpy(__entry->rates, request->req.rates,
			       sizeof(__entry->rates));
			__entry->wdev_id = request->req.wdev ?
request->req.wdev->identifier : 0; if (request->req.wiphy) MAC_ASSIGN(wiphy_mac, request->req.wiphy->perm_addr); __entry->no_cck = request->req.no_cck; } if (info) { __entry->aborted = info->aborted; __entry->scan_start_tsf = info->scan_start_tsf; MAC_ASSIGN(tsf_bssid, info->tsf_bssid); } ), TP_printk("aborted: %s, scan start (TSF): %llu, tsf_bssid: %pM", BOOL_TO_STR(__entry->aborted), (unsigned long long)__entry->scan_start_tsf, __entry->tsf_bssid) ); DECLARE_EVENT_CLASS(wiphy_id_evt, TP_PROTO(struct wiphy *wiphy, u64 id), TP_ARGS(wiphy, id), TP_STRUCT__entry( WIPHY_ENTRY __field(u64, id) ), TP_fast_assign( WIPHY_ASSIGN; __entry->id = id; ), TP_printk(WIPHY_PR_FMT ", id: %llu", WIPHY_PR_ARG, __entry->id) ); DEFINE_EVENT(wiphy_id_evt, cfg80211_sched_scan_stopped, TP_PROTO(struct wiphy *wiphy, u64 id), TP_ARGS(wiphy, id) ); DEFINE_EVENT(wiphy_id_evt, cfg80211_sched_scan_results, TP_PROTO(struct wiphy *wiphy, u64 id), TP_ARGS(wiphy, id) ); TRACE_EVENT(cfg80211_get_bss, TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel *channel, const u8 *bssid, const u8 *ssid, size_t ssid_len, enum ieee80211_bss_type bss_type, enum ieee80211_privacy privacy), TP_ARGS(wiphy, channel, bssid, ssid, ssid_len, bss_type, privacy), TP_STRUCT__entry( WIPHY_ENTRY CHAN_ENTRY MAC_ENTRY(bssid) __dynamic_array(u8, ssid, ssid_len) __field(enum ieee80211_bss_type, bss_type) __field(enum ieee80211_privacy, privacy) ), TP_fast_assign( WIPHY_ASSIGN; CHAN_ASSIGN(channel); MAC_ASSIGN(bssid, bssid); memcpy(__get_dynamic_array(ssid), ssid, ssid_len); __entry->bss_type = bss_type; __entry->privacy = privacy; ), TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT ", %pM" ", buf: %#.2x, bss_type: %d, privacy: %d", WIPHY_PR_ARG, CHAN_PR_ARG, __entry->bssid, ((u8 *)__get_dynamic_array(ssid))[0], __entry->bss_type, __entry->privacy) ); TRACE_EVENT(cfg80211_inform_bss_frame, TP_PROTO(struct wiphy *wiphy, struct cfg80211_inform_bss *data, struct ieee80211_mgmt *mgmt, size_t len), TP_ARGS(wiphy, data, mgmt, len), 
TP_STRUCT__entry( WIPHY_ENTRY CHAN_ENTRY __dynamic_array(u8, mgmt, len) __field(s32, signal) __field(u64, ts_boottime) __field(u64, parent_tsf) MAC_ENTRY(parent_bssid) ), TP_fast_assign( WIPHY_ASSIGN; CHAN_ASSIGN(data->chan); if (mgmt) memcpy(__get_dynamic_array(mgmt), mgmt, len); __entry->signal = data->signal; __entry->ts_boottime = data->boottime_ns; __entry->parent_tsf = data->parent_tsf; MAC_ASSIGN(parent_bssid, data->parent_bssid); ), TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: %pM", WIPHY_PR_ARG, CHAN_PR_ARG, __entry->signal, (unsigned long long)__entry->ts_boottime, (unsigned long long)__entry->parent_tsf, __entry->parent_bssid) ); DECLARE_EVENT_CLASS(cfg80211_bss_evt, TP_PROTO(struct cfg80211_bss *pub), TP_ARGS(pub), TP_STRUCT__entry( MAC_ENTRY(bssid) CHAN_ENTRY ), TP_fast_assign( MAC_ASSIGN(bssid, pub->bssid); CHAN_ASSIGN(pub->channel); ), TP_printk("%pM, " CHAN_PR_FMT, __entry->bssid, CHAN_PR_ARG) ); DEFINE_EVENT(cfg80211_bss_evt, cfg80211_return_bss, TP_PROTO(struct cfg80211_bss *pub), TP_ARGS(pub) ); TRACE_EVENT(cfg80211_report_wowlan_wakeup, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_wowlan_wakeup *wakeup), TP_ARGS(wiphy, wdev, wakeup), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY __field(bool, non_wireless) __field(bool, disconnect) __field(bool, magic_pkt) __field(bool, gtk_rekey_failure) __field(bool, eap_identity_req) __field(bool, four_way_handshake) __field(bool, rfkill_release) __field(s32, pattern_idx) __field(u32, packet_len) __dynamic_array(u8, packet, wakeup ? wakeup->packet_present_len : 0) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; __entry->non_wireless = !wakeup; __entry->disconnect = wakeup ? wakeup->disconnect : false; __entry->magic_pkt = wakeup ? wakeup->magic_pkt : false; __entry->gtk_rekey_failure = wakeup ? wakeup->gtk_rekey_failure : false; __entry->eap_identity_req = wakeup ? wakeup->eap_identity_req : false; __entry->four_way_handshake = wakeup ? 
wakeup->four_way_handshake : false; __entry->rfkill_release = wakeup ? wakeup->rfkill_release : false; __entry->pattern_idx = wakeup ? wakeup->pattern_idx : false; __entry->packet_len = wakeup ? wakeup->packet_len : false; if (wakeup && wakeup->packet && wakeup->packet_present_len) memcpy(__get_dynamic_array(packet), wakeup->packet, wakeup->packet_present_len); ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG) ); TRACE_EVENT(cfg80211_ft_event, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_ft_event_params *ft_event), TP_ARGS(wiphy, netdev, ft_event), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __dynamic_array(u8, ies, ft_event->ies_len) MAC_ENTRY(target_ap) __dynamic_array(u8, ric_ies, ft_event->ric_ies_len) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; if (ft_event->ies) memcpy(__get_dynamic_array(ies), ft_event->ies, ft_event->ies_len); MAC_ASSIGN(target_ap, ft_event->target_ap); if (ft_event->ric_ies) memcpy(__get_dynamic_array(ric_ies), ft_event->ric_ies, ft_event->ric_ies_len); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", target_ap: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->target_ap) ); TRACE_EVENT(cfg80211_stop_link, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id), TP_ARGS(wiphy, wdev, link_id), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY __field(int, link_id) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; __entry->link_id = link_id; ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %d", WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id) ); TRACE_EVENT(cfg80211_pmsr_report, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie, const u8 *addr), TP_ARGS(wiphy, wdev, cookie, addr), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY __field(u64, cookie) MAC_ENTRY(addr) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; __entry->cookie = cookie; MAC_ASSIGN(addr, addr); ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%lld, %pM", WIPHY_PR_ARG, WDEV_PR_ARG, (unsigned long 
long)__entry->cookie, __entry->addr) ); TRACE_EVENT(cfg80211_pmsr_complete, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie), TP_ARGS(wiphy, wdev, cookie), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY __field(u64, cookie) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; __entry->cookie = cookie; ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%lld", WIPHY_PR_ARG, WDEV_PR_ARG, (unsigned long long)__entry->cookie) ); TRACE_EVENT(cfg80211_update_owe_info_event, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_update_owe_info *owe_info), TP_ARGS(wiphy, netdev, owe_info), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(peer) __dynamic_array(u8, ie, owe_info->ie_len) __field(int, assoc_link_id) MAC_ENTRY(peer_mld_addr) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(peer, owe_info->peer); memcpy(__get_dynamic_array(ie), owe_info->ie, owe_info->ie_len); __entry->assoc_link_id = owe_info->assoc_link_id; MAC_ASSIGN(peer_mld_addr, owe_info->peer_mld_addr); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM," " assoc_link_id: %d, peer_mld_addr: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->assoc_link_id, __entry->peer_mld_addr) ); TRACE_EVENT(cfg80211_bss_color_notify, TP_PROTO(struct net_device *netdev, enum nl80211_commands cmd, u8 count, u64 color_bitmap), TP_ARGS(netdev, cmd, count, color_bitmap), TP_STRUCT__entry( NETDEV_ENTRY __field(u32, cmd) __field(u8, count) __field(u64, color_bitmap) ), TP_fast_assign( NETDEV_ASSIGN; __entry->cmd = cmd; __entry->count = count; __entry->color_bitmap = color_bitmap; ), TP_printk(NETDEV_PR_FMT ", cmd: %x, count: %u, bitmap: %llx", NETDEV_PR_ARG, __entry->cmd, __entry->count, __entry->color_bitmap) ); TRACE_EVENT(cfg80211_assoc_comeback, TP_PROTO(struct wireless_dev *wdev, const u8 *ap_addr, u32 timeout), TP_ARGS(wdev, ap_addr, timeout), TP_STRUCT__entry( WDEV_ENTRY MAC_ENTRY(ap_addr) __field(u32, timeout) ), TP_fast_assign( WDEV_ASSIGN; MAC_ASSIGN(ap_addr, 
ap_addr); __entry->timeout = timeout; ), TP_printk(WDEV_PR_FMT ", %pM, timeout: %u TUs", WDEV_PR_ARG, __entry->ap_addr, __entry->timeout) ); DECLARE_EVENT_CLASS(link_station_add_mod, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct link_station_parameters *params), TP_ARGS(wiphy, netdev, params), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __array(u8, mld_mac, 6) __array(u8, link_mac, 6) __field(u32, link_id) __dynamic_array(u8, supported_rates, params->supported_rates_len) __array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap)) __array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap)) __field(u8, opmode_notif) __field(bool, opmode_notif_used) __dynamic_array(u8, he_capa, params->he_capa_len) __array(u8, he_6ghz_capa, (int)sizeof(struct ieee80211_he_6ghz_capa)) __dynamic_array(u8, eht_capa, params->eht_capa_len) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; memset(__entry->mld_mac, 0, 6); memset(__entry->link_mac, 0, 6); if (params->mld_mac) memcpy(__entry->mld_mac, params->mld_mac, 6); if (params->link_mac) memcpy(__entry->link_mac, params->link_mac, 6); __entry->link_id = params->link_id; if (params->supported_rates && params->supported_rates_len) memcpy(__get_dynamic_array(supported_rates), params->supported_rates, params->supported_rates_len); memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap)); if (params->ht_capa) memcpy(__entry->ht_capa, params->ht_capa, sizeof(struct ieee80211_ht_cap)); memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap)); if (params->vht_capa) memcpy(__entry->vht_capa, params->vht_capa, sizeof(struct ieee80211_vht_cap)); __entry->opmode_notif = params->opmode_notif; __entry->opmode_notif_used = params->opmode_notif_used; if (params->he_capa && params->he_capa_len) memcpy(__get_dynamic_array(he_capa), params->he_capa, params->he_capa_len); memset(__entry->he_6ghz_capa, 0, sizeof(struct ieee80211_he_6ghz_capa)); if (params->he_6ghz_capa) memcpy(__entry->he_6ghz_capa, params->he_6ghz_capa, 
sizeof(struct ieee80211_he_6ghz_capa)); if (params->eht_capa && params->eht_capa_len) memcpy(__get_dynamic_array(eht_capa), params->eht_capa, params->eht_capa_len); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM" ", link mac: %pM, link id: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mld_mac, __entry->link_mac, __entry->link_id) ); DEFINE_EVENT(link_station_add_mod, rdev_add_link_station, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct link_station_parameters *params), TP_ARGS(wiphy, netdev, params) ); DEFINE_EVENT(link_station_add_mod, rdev_mod_link_station, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct link_station_parameters *params), TP_ARGS(wiphy, netdev, params) ); TRACE_EVENT(cfg80211_links_removed, TP_PROTO(struct net_device *netdev, u16 link_mask), TP_ARGS(netdev, link_mask), TP_STRUCT__entry( NETDEV_ENTRY __field(u16, link_mask) ), TP_fast_assign( NETDEV_ASSIGN; __entry->link_mask = link_mask; ), TP_printk(NETDEV_PR_FMT ", link_mask:0x%x", NETDEV_PR_ARG, __entry->link_mask) ); TRACE_EVENT(cfg80211_mlo_reconf_add_done, TP_PROTO(struct net_device *netdev, u16 link_mask, const u8 *buf, size_t len, bool driver_initiated), TP_ARGS(netdev, link_mask, buf, len, driver_initiated), TP_STRUCT__entry( NETDEV_ENTRY __field(u16, link_mask) __dynamic_array(u8, buf, len) __field(bool, driver_initiated) ), TP_fast_assign( NETDEV_ASSIGN; __entry->link_mask = link_mask; memcpy(__get_dynamic_array(buf), buf, len); __entry->driver_initiated = driver_initiated; ), TP_printk(NETDEV_PR_FMT ", link_mask:0x%x, driver_initiated:%d", NETDEV_PR_ARG, __entry->link_mask, __entry->driver_initiated) ); TRACE_EVENT(rdev_assoc_ml_reconf, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_ml_reconf_req *req), TP_ARGS(wiphy, netdev, req), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(u16, add_links) __field(u16, rem_links) __field(u16, ext_mld_capa_ops) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; u32 i; 
__entry->add_links = 0; __entry->rem_links = req->rem_links; for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) if (req->add_links[i].bss) __entry->add_links |= BIT(i); __entry->ext_mld_capa_ops = req->ext_mld_capa_ops; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", add_links=0x%x, rem_links=0x%x", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->add_links, __entry->rem_links) ); TRACE_EVENT(cfg80211_epcs_changed, TP_PROTO(struct wireless_dev *wdev, bool enabled), TP_ARGS(wdev, enabled), TP_STRUCT__entry( WDEV_ENTRY __field(u32, enabled) ), TP_fast_assign( WDEV_ASSIGN; __entry->enabled = enabled; ), TP_printk(WDEV_PR_FMT ", enabled=%u", WDEV_PR_ARG, __entry->enabled) ); TRACE_EVENT(cfg80211_next_nan_dw_notif, TP_PROTO(struct wireless_dev *wdev, struct ieee80211_channel *chan), TP_ARGS(wdev, chan), TP_STRUCT__entry( WDEV_ENTRY CHAN_ENTRY ), TP_fast_assign( WDEV_ASSIGN; CHAN_ASSIGN(chan); ), TP_printk(WDEV_PR_FMT " " CHAN_PR_FMT, WDEV_PR_ARG, CHAN_PR_ARG) ); TRACE_EVENT(cfg80211_nan_cluster_joined, TP_PROTO(struct wireless_dev *wdev, const u8 *cluster_id, bool new_cluster), TP_ARGS(wdev, cluster_id, new_cluster), TP_STRUCT__entry( WDEV_ENTRY MAC_ENTRY(cluster_id) __field(bool, new_cluster) ), TP_fast_assign( WDEV_ASSIGN; MAC_ASSIGN(cluster_id, cluster_id); __entry->new_cluster = new_cluster; ), TP_printk(WDEV_PR_FMT " cluster_id %pMF%s", WDEV_PR_ARG, __entry->cluster_id, __entry->new_cluster ? " [new]" : "") ); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
6 5 6 6 6 6 6 6 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLK_CGROUP_PRIVATE_H #define 
_BLK_CGROUP_PRIVATE_H /* * block cgroup private header * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> */ #include <linux/blk-cgroup.h> #include <linux/cgroup.h> #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> #include "blk.h" struct blkcg_gq; struct blkg_policy_data; /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { BLKG_IOSTAT_READ, BLKG_IOSTAT_WRITE, BLKG_IOSTAT_DISCARD, BLKG_IOSTAT_NR, }; struct blkg_iostat { u64 bytes[BLKG_IOSTAT_NR]; u64 ios[BLKG_IOSTAT_NR]; }; struct blkg_iostat_set { struct u64_stats_sync sync; struct blkcg_gq *blkg; struct llist_node lnode; int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ struct request_queue *q; struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; /* reference count */ struct percpu_ref refcnt; /* is this blkg online? 
protected by both blkcg and q locks */ bool online; struct blkg_iostat_set __percpu *iostat_cpu; struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spinlock_t async_bio_lock; struct bio_list async_bios; #endif union { struct work_struct async_bio_work; struct work_struct free_work; }; atomic_t use_delay; atomic64_t delay_nsec; atomic64_t delay_start; u64 last_delay; int last_use; struct rcu_head rcu_head; }; struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; refcount_t online_pin; /* If there is block congestion on this cgroup. */ atomic_t congestion_count; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; /* * List of updated percpu blkg_iostat_set's since the last flush. */ struct llist_head __percpu *lhead; #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; #endif }; static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * * There can be multiple active blkcg policies and each blkg:policy pair is * represented by a blkg_policy_data which is allocated and freed by each * policy's pd_alloc/free_fn() methods. A policy can allocate private data * area by allocating larger data structure which embeds blkg_policy_data * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; bool online; }; /* * Policies that need to keep per-blkcg data which is independent from any * request_queue associated to it should implement cpd_alloc/free_fn() * methods. 
A policy can allocate private data area by allocating larger * data structure which embeds blkcg_policy_data at the beginning. * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ struct blkcg *blkcg; int plid; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { int plid; /* cgroup files for the policy */ struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int 
blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol); const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { char *input; char *body; struct block_device *bdev; struct blkcg_gq *blkg; }; void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @bio: the target &bio * * Return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning * blkg. The idea is we do bio_blkcg_css() to look up the actual context for * the bio and attach the appropriate blkg to the bio. Then we call this helper * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. */ static inline bool bio_issue_as_root_blkg(struct bio *bio) { return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; } /** * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. * * Must be called in a RCU critical section. 
*/ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; if (blkcg == &blkcg_root) return q->root_blkg; blkg = rcu_dereference_check(blkcg->blkg_hint, lockdep_is_held(&q->queue_lock)); if (blkg && blkg->q == q) return blkg; blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q != q) blkg = NULL; return blkg; } /** * blkg_to_pd - get policy private data * @blkg: blkg of interest * @pol: policy of interest * * Return pointer to private data associated with the @blkg-@pol pair. */ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return blkg ? blkg->pd[pol->plid] : NULL; } static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** * pd_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. */ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return pd ? pd->blkg : NULL; } static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) { return cpd ? cpd->blkcg : NULL; } /** * blkg_get - get a blkg reference * @blkg: blkg to get * * The caller should be holding an existing reference. */ static inline void blkg_get(struct blkcg_gq *blkg) { percpu_ref_get(&blkg->refcnt); } /** * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. 
*/ static inline bool blkg_tryget(struct blkcg_gq *blkg) { return blkg && percpu_ref_tryget(&blkg->refcnt); } /** * blkg_put - put a blkg reference * @blkg: blkg to put */ static inline void blkg_put(struct blkcg_gq *blkg) { percpu_ref_put(&blkg->refcnt); } /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. @p_blkg is * included in the iteration and the last node to be visited. 
*/ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->congestion_count); } static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); if (WARN_ON_ONCE(old < 0)) return 0; if (old == 0) return 0; /* * We do this song and dance because we can race with somebody else * adding or removing delay. If we just did an atomic_dec we'd end up * negative and we'd already be in trouble. We need to subtract 1 and * then check to see if we were the last delay so we can drop the * congestion count on the cgroup. */ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1)) ; if (old == 0) return 0; if (old == 1) atomic_dec(&blkg->blkcg->congestion_count); return 1; } /** * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount * @blkg: target blkg * @delay: delay duration in nsecs * * When enabled with this function, the delay is not decayed and must be * explicitly cleared with blkcg_clear_delay(). Must not be mixed with * blkcg_[un]use_delay() and blkcg_add_delay() usages. */ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person setting the congestion count for this blkg. */ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) atomic_inc(&blkg->blkcg->congestion_count); atomic64_set(&blkg->delay_nsec, delay); } /** * blkcg_clear_delay - Disable allocator delay mechanism * @blkg: target blkg * * Disable use_delay mechanism. See blkcg_set_delay(). 
*/ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person clearing the congestion count for this blkg. */ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) atomic_dec(&blkg->blkcg->congestion_count); } /** * blk_cgroup_mergeable - Determine whether to allow or disallow merges * @rq: request to merge into * @bio: bio to merge * * @bio and @rq should belong to the same cgroup and their issue_as_root should * match. The latter is necessary as we don't want to throttle e.g. a metadata * update because it happens to be next to a regular IO. */ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return rq->bio->bi_blkg == bio->bi_blkg && bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } static inline bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { return pol && test_bit(pol->plid, q->blkcg_pols); } void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ struct blkg_policy_data { }; struct blkcg_policy_data { }; struct blkcg_policy { }; struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { return 0; } static inline void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct 
blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */
326 471 427 68 531 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_POLL_H #define _LINUX_POLL_H #include <linux/compiler.h> #include <linux/ktime.h> #include <linux/wait.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <uapi/linux/poll.h> #include <uapi/linux/eventpoll.h> /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ #define MAX_STACK_ALLOC 832 #define FRONTEND_STACK_ALLOC 256 #define SELECT_STACK_ALLOC FRONTEND_STACK_ALLOC #define POLL_STACK_ALLOC FRONTEND_STACK_ALLOC #define WQUEUES_STACK_ALLOC (MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC) #define N_INLINE_POLL_ENTRIES (WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry)) #define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM) struct poll_table_struct; /* * structures and helpers for f_op->poll implementations */ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *); /* * Do not touch the structure directly, use the access function * poll_requested_events() instead. */ typedef struct poll_table_struct { poll_queue_proc _qproc; __poll_t _key; } poll_table; static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { if (p && p->_qproc) { p->_qproc(filp, wait_address, p); /* * This memory barrier is paired in the wq_has_sleeper(). 
* See the comment above prepare_to_wait(), we need to * ensure that subsequent tests in this thread can't be * reordered with __add_wait_queue() in _qproc() paths. */ smp_mb(); } } /* * Return the set of events that the application wants to poll for. * This is useful for drivers that need to know whether a DMA transfer has * to be started implicitly on poll(). You typically only want to do that * if the application is actually polling for POLLIN and/or POLLOUT. */ static inline __poll_t poll_requested_events(const poll_table *p) { return p ? p->_key : ~(__poll_t)0; } static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) { pt->_qproc = qproc; pt->_key = ~(__poll_t)0; /* all events enabled */ } static inline bool file_can_poll(struct file *file) { return file->f_op->poll; } static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt) { if (unlikely(!file->f_op->poll)) return DEFAULT_POLLMASK; return file->f_op->poll(file, pt); } struct poll_table_entry { struct file *filp; __poll_t key; wait_queue_entry_t wait; wait_queue_head_t *wait_address; }; /* * Structures